diff --git .github/workflows/build-ci-container.yml .github/workflows/build-ci-container.yml index 4fa0713b381c..85ecc82fa649 100644 --- .github/workflows/build-ci-container.yml +++ .github/workflows/build-ci-container.yml @@ -23,7 +23,9 @@ jobs: runs-on: depot-ubuntu-22.04-16 outputs: container-name: ${{ steps.vars.outputs.container-name }} + container-name-agent: ${{ steps.vars.outputs.container-name-agent }} container-name-tag: ${{ steps.vars.outputs.container-name-tag }} + container-name-agent-tag: ${{ steps.vars.outputs.container-name-agent-tag }} container-filename: ${{ steps.vars.outputs.container-filename }} steps: - name: Checkout LLVM @@ -36,19 +38,22 @@ jobs: tag=`date +%s` container_name="ghcr.io/$GITHUB_REPOSITORY_OWNER/ci-ubuntu-22.04" echo "container-name=$container_name" >> $GITHUB_OUTPUT + echo "container-name-agent=$container_name-agent" >> $GITHUB_OUTPUT echo "container-name-tag=$container_name:$tag" >> $GITHUB_OUTPUT + echo "container-name-agent-tag=$container_name-agent:$tag" >> $GITHUB_OUTPUT echo "container-filename=$(echo $container_name:$tag | sed -e 's/\//-/g' -e 's/:/-/g').tar" >> $GITHUB_OUTPUT - name: Build container working-directory: ./.github/workflows/containers/github-action-ci/ run: | - podman build -t ${{ steps.vars.outputs.container-name-tag }} . + podman build --target ci-container -t ${{ steps.vars.outputs.container-name-tag }} . + podman build --target ci-container-agent -t ${{ steps.vars.outputs.container-name-agent-tag }} . # Save the container so we have it in case the push fails. This also # allows us to separate the push step into a different job so we can # maintain minimal permissions while building the container. - name: Save container image run: | - podman save ${{ steps.vars.outputs.container-name-tag }} > ${{ steps.vars.outputs.container-filename }} + podman save ${{ steps.vars.outputs.container-name-tag }} ${{ steps.vars.outputs.container-name-agent-tag }} > ${{ steps.vars.outputs.container-filename }} - name: Upload container image uses: actions/upload-artifact@v4 @@ -86,3 +91,7 @@ jobs: podman login -u ${{ github.actor }} -p $GITHUB_TOKEN ghcr.io podman push ${{ needs.build-ci-container.outputs.container-name-tag }} podman push ${{ needs.build-ci-container.outputs.container-name }}:latest + + podman tag ${{ needs.build-ci-container.outputs.container-name-agent-tag }} ${{ needs.build-ci-container.outputs.container-name-agent }}:latest + podman push ${{ needs.build-ci-container.outputs.container-name-agent-tag }} + podman push ${{ needs.build-ci-container.outputs.container-name-agent }}:latest diff --git .github/workflows/containers/github-action-ci-windows/Dockerfile .github/workflows/containers/github-action-ci-windows/Dockerfile index bc56e2093550..2295e39d62c3 100644 --- .github/workflows/containers/github-action-ci-windows/Dockerfile +++ .github/workflows/containers/github-action-ci-windows/Dockerfile @@ -108,7 +108,7 @@ RUN choco install -y handle RUN pip3 install pywin32 buildbot-worker==2.8.4 -ARG RUNNER_VERSION=2.319.1 +ARG RUNNER_VERSION=2.321.0 ENV RUNNER_VERSION=$RUNNER_VERSION RUN powershell -Command \ diff --git .github/workflows/containers/github-action-ci/Dockerfile .github/workflows/containers/github-action-ci/Dockerfile index 3757e603f8a1..35a0f1f6020d 100644 --- .github/workflows/containers/github-action-ci/Dockerfile +++ .github/workflows/containers/github-action-ci/Dockerfile @@ -13,7 +13,8 @@ RUN apt-get update && \ ninja-build \ python3 \ git \ - curl + curl \ + zlib1g-dev RUN curl -O -L 
https://github.com/llvm/llvm-project/archive/refs/tags/llvmorg-$LLVM_VERSION.tar.gz && tar -xf llvmorg-$LLVM_VERSION.tar.gz @@ -38,7 +39,7 @@ RUN cmake -B ./build -G Ninja ./llvm \ RUN ninja -C ./build stage2-clang-bolt stage2-install-distribution && ninja -C ./build install-distribution -FROM base +FROM base as ci-container COPY --from=stage1-toolchain $LLVM_SYSROOT $LLVM_SYSROOT @@ -91,4 +92,15 @@ RUN adduser gha sudo RUN echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers USER gha +WORKDIR /home/gha + +FROM ci-container as ci-container-agent + +ENV GITHUB_RUNNER_VERSION=2.321.0 + +RUN mkdir actions-runner && \ + cd actions-runner && \ + curl -O -L https://github.com/actions/runner/releases/download/v$GITHUB_RUNNER_VERSION/actions-runner-linux-x64-$GITHUB_RUNNER_VERSION.tar.gz && \ + tar xzf ./actions-runner-linux-x64-$GITHUB_RUNNER_VERSION.tar.gz && \ + rm ./actions-runner-linux-x64-$GITHUB_RUNNER_VERSION.tar.gz diff --git .github/workflows/premerge.yaml .github/workflows/premerge.yaml index 261dc8bbb97e..6c7f1d81a953 100644 --- .github/workflows/premerge.yaml +++ .github/workflows/premerge.yaml @@ -30,6 +30,8 @@ jobs: fetch-depth: 2 - name: Setup ccache uses: hendrikmuhs/ccache-action@v1.2.14 + with: + max-size: "2000M" - name: Build and Test # Mark the job as a success even if the step fails so that people do # not get notified while the new premerge pipeline is in an diff --git .github/workflows/release-binaries-all.yml .github/workflows/release-binaries-all.yml index f5318aecc53a..d5b2d3328610 100644 --- .github/workflows/release-binaries-all.yml +++ .github/workflows/release-binaries-all.yml @@ -83,7 +83,6 @@ jobs: matrix: runs-on: - ubuntu-22.04 - - windows-2022 - macos-13 - macos-14 diff --git .github/workflows/release-binaries.yml .github/workflows/release-binaries.yml index fc5431c96bbf..29be8195da68 100644 --- .github/workflows/release-binaries.yml +++ .github/workflows/release-binaries.yml @@ -18,7 +18,6 @@ on: type: choice options: - ubuntu-22.04 - - windows-2022 - macos-13 - macos-14 @@ -60,6 +59,8 @@ jobs: enable-pgo: ${{ steps.vars.outputs.enable-pgo }} release-binary-basename: ${{ steps.vars.outputs.release-binary-basename }} release-binary-filename: ${{ steps.vars.outputs.release-binary-filename }} + build-runs-on: ${{ steps.vars.outputs.build-runs-on }} + test-runs-on: ${{ steps.vars.outputs.test-runs-on }} steps: # It's good practice to use setup-python, but this is also required on macos-14 @@ -144,12 +145,40 @@ jobs: echo "target-cmake-flags=$target_cmake_flags" >> $GITHUB_OUTPUT echo "build-flang=$build_flang" >> $GITHUB_OUTPUT - - build-stage1: - name: "Build Stage 1" + case "${{ inputs.runs-on }}" in + ubuntu-22.04) + build_runs_on="depot-${{ inputs.runs-on }}-16" + test_runs_on=$build_runs_on + ;; + macos-13) + if [ "$GITHUB_EVENT_NAME" = "pull_request" ]; then + build_runs_on="${{ inputs.runs-on }}" + else + build_runs_on="macos-13-large" + fi + test_runs_on="${{ inputs.runs-on }}" + ;; + macos-14) + if [ "$GITHUB_EVENT_NAME" = "pull_request" ]; then + build_runs_on="${{ inputs.runs-on }}" + else + build_runs_on="depot-macos-14" + fi + test_runs_on="${{ inputs.runs-on }}" + ;; + *) + test_runs_on="${{ inputs.runs-on }}" + build_runs_on=$test_runs_on + ;; + esac + echo "build-runs-on=$build_runs_on" >> $GITHUB_OUTPUT + echo "test-runs-on=$test_runs_on" >> $GITHUB_OUTPUT + + build-release-package: + name: "Build Release Package" needs: prepare if: github.repository == 'llvm/llvm-project' - runs-on: ${{ inputs.runs-on }} + runs-on: ${{
needs.prepare.outputs.build-runs-on }} steps: - name: Checkout Actions @@ -195,7 +224,7 @@ jobs: key: sccache-${{ runner.os }}-${{ runner.arch }}-release variant: sccache - - name: Build Stage 1 Clang + - name: Configure id: build shell: bash run: | @@ -208,182 +237,12 @@ jobs: -DBOOTSTRAP_CPACK_PACKAGE_FILE_NAME="${{ needs.prepare.outputs.release-binary-basename }}" \ -DCMAKE_C_COMPILER_LAUNCHER=sccache \ -DCMAKE_CXX_COMPILER_LAUNCHER=sccache - ninja -v -C ${{ steps.setup-stage.outputs.build-prefix }}/build - # There is a race condition on the MacOS builders and this command is here - # to help debug that when it happens. - ls -ltr ${{ steps.setup-stage.outputs.build-prefix }}/build - - - name: Save Stage - uses: ./workflows-main/.github/workflows/release-binaries-save-stage - with: - build-prefix: ${{ steps.setup-stage.outputs.build-prefix }} - - build-stage2: - name: "Build Stage 2" - needs: - - prepare - - build-stage1 - if: github.repository == 'llvm/llvm-project' - runs-on: ${{ inputs.runs-on }} - steps: - - name: Checkout Actions - uses: actions/checkout@v4 - with: - ref: ${{ (github.event_name == 'pull_request' && github.sha) || 'main' }} - sparse-checkout: | - .github/workflows/ - sparse-checkout-cone-mode: false - path: workflows - - name: Setup Stage - id: setup-stage - uses: ./workflows/.github/workflows/release-binaries-setup-stage - with: - previous-artifact: build-stage1 - - - name: Build Stage 2 - # Re-enable once PGO builds are supported. - if: needs.prepare.outputs.enable-pgo == 'true' - shell: bash - run: | - ninja -C ${{ steps.setup-stage.outputs.build-prefix}}/build stage2-instrumented - - - name: Save Stage - uses: ./workflows/.github/workflows/release-binaries-save-stage - with: - build-prefix: ${{ steps.setup-stage.outputs.build-prefix }} - - build-stage3-clang: - name: "Build Stage 3 LLVM/Clang" - needs: - - prepare - - build-stage2 - if: github.repository == 'llvm/llvm-project' - runs-on: ${{ inputs.runs-on }} - steps: - - name: Checkout Actions - uses: actions/checkout@v4 - with: - ref: ${{ (github.event_name == 'pull_request' && github.sha) || 'main' }} - sparse-checkout: | - .github/workflows/ - sparse-checkout-cone-mode: false - path: workflows - - name: Setup Stage - id: setup-stage - uses: ./workflows/.github/workflows/release-binaries-setup-stage - with: - previous-artifact: build-stage2 - - - name: Build LLVM/Clang - shell: bash - run: | - # There is a race condition on the MacOS builders and this command is here - # to help debug that when it happens. - ls -ltr ${{ steps.setup-stage.outputs.build-prefix }}/build - ninja -C ${{ steps.setup-stage.outputs.build-prefix }}/build stage2-clang - # Build some of the larger binaries here too. 
- ninja -C ${{ steps.setup-stage.outputs.build-prefix }}/build/tools/clang/stage2-bins/ \ - clang-scan-deps \ - modularize clangd \ - clangd-indexer \ - clang-check \ - ${{ (runner.os == 'Linux' && 'clangd-fuzzer') || '' }} \ - clang-tidy \ - llc \ - lli \ - llvm-exegesis \ - llvm-opt-fuzzer \ - llvm-reduce \ - llvm-lto \ - dsymutil - - - name: Save Stage - uses: ./workflows/.github/workflows/release-binaries-save-stage - with: - build-prefix: ${{ steps.setup-stage.outputs.build-prefix }} - - build-stage3-flang: - name: "Build Stage 3 Flang/MLIR/Bolt" - needs: - - prepare - - build-stage3-clang - runs-on: ${{ inputs.runs-on }} - steps: - - name: Checkout Actions - uses: actions/checkout@v4 - with: - ref: ${{ (github.event_name == 'pull_request' && github.sha) || 'main' }} - sparse-checkout: | - .github/workflows/ - sparse-checkout-cone-mode: false - path: workflows - - name: Setup Stage - id: setup-stage - uses: ./workflows/.github/workflows/release-binaries-setup-stage - with: - previous-artifact: build-stage3-clang - - - name: Build Flang / MLIR / Bolt + - name: Build shell: bash run: | - # Build some of the mlir tools that take a long time to link - if [ "${{ needs.prepare.outputs.build-flang }}" = "true" ]; then - ninja -C ${{ steps.setup-stage.outputs.build-prefix }}/build/tools/clang/stage2-bins/ -j2 flang bbc - fi - ninja -C ${{ steps.setup-stage.outputs.build-prefix }}/build/tools/clang/stage2-bins/ \ - mlir-bytecode-parser-fuzzer \ - mlir-cpu-runner \ - mlir-lsp-server \ - mlir-opt \ - mlir-query \ - mlir-reduce \ - mlir-text-parser-fuzzer \ - mlir-translate \ - mlir-transform-opt \ - mlir-cat \ - mlir-minimal-opt \ - mlir-minimal-opt-canonicalize \ - mlir-pdll-lsp-server \ - llvm-bolt \ - llvm-bolt-heatmap - - - name: Save Stage - uses: ./workflows/.github/workflows/release-binaries-save-stage - with: - build-prefix: ${{ steps.setup-stage.outputs.build-prefix }} - - build-stage3-all: - name: "Build Stage 3" - needs: - - prepare - - build-stage3-flang - runs-on: ${{ inputs.runs-on }} - steps: - - name: Checkout Actions - uses: actions/checkout@v4 - with: - ref: ${{ (github.event_name == 'pull_request' && github.sha) || 'main' }} - sparse-checkout: | - .github/workflows/ - sparse-checkout-cone-mode: false - path: workflows - - name: Setup Stage - id: setup-stage - uses: ./workflows/.github/workflows/release-binaries-setup-stage - with: - previous-artifact: build-stage3-flang - - - name: Build Release Package - shell: bash - run: | - which cmake - ninja -C ${{ steps.setup-stage.outputs.build-prefix }}/build stage2-package - # Copy Release artifact to the workspace so it is easier to upload. - # This is necessary, because on Windows, the build-prefix path can - # only be used on bash steps, because it uses the form of /d/files/ - # and other steps expect D:\files. + ninja -v -C ${{ steps.setup-stage.outputs.build-prefix }}/build stage2-package mv ${{ steps.setup-stage.outputs.build-prefix }}/build/tools/clang/stage2-bins/${{ needs.prepare.outputs.release-binary-filename }} . 
- + - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0 with: name: ${{ runner.os }}-${{ runner.arch }}-release-binary @@ -398,9 +257,9 @@ jobs: run: | find ${{ steps.setup-stage.outputs.build-prefix }}/build -iname ${{ needs.prepare.outputs.release-binary-filename }} -delete rm -Rf ${{ steps.setup-stage.outputs.build-prefix }}/build/tools/clang/stage2-bins/_CPack_Packages - + - name: Save Stage - uses: ./workflows/.github/workflows/release-binaries-save-stage + uses: ./workflows-main/.github/workflows/release-binaries-save-stage with: build-prefix: ${{ steps.setup-stage.outputs.build-prefix }} @@ -408,9 +267,8 @@ jobs: name: "Upload Release Binaries" needs: - prepare - - build-stage3-all + - build-release-package if: >- - always() && github.event_name != 'pull_request' && needs.prepare.outputs.upload == 'true' runs-on: ubuntu-22.04 @@ -463,14 +321,14 @@ jobs: upload \ --files ${{ needs.prepare.outputs.release-binary-filename }}* - test-stage3: - name: "Test Stage 3" + test-release: + name: "Test Release" needs: - prepare - - build-stage3-all + - build-release-package if: >- github.repository == 'llvm/llvm-project' - runs-on: ${{ inputs.runs-on }} + runs-on: ${{ needs.prepare.outputs.test-runs-on }} steps: - name: Checkout Actions uses: actions/checkout@v4 @@ -484,7 +342,7 @@ jobs: id: setup-stage uses: ./workflows/.github/workflows/release-binaries-setup-stage with: - previous-artifact: build-stage3-all + previous-artifact: build-release-package - name: Run Tests shell: bash diff --git .github/workflows/spirv-tests.yml .github/workflows/spirv-tests.yml index 34c77a398c15..ea466dc6c52e 100644 --- .github/workflows/spirv-tests.yml +++ .github/workflows/spirv-tests.yml @@ -25,5 +25,5 @@ jobs: with: build_target: check-llvm-codegen-spirv projects: - extra_cmake_args: '-DLLVM_TARGETS_TO_BUILD="" -DLLVM_EXPERIMENTAL_TARGETS_TO_BUILD="SPIRV" -DLLVM_INCLUDE_SPIRV_TOOLS_TESTS=ON' + extra_cmake_args: '-DLLVM_TARGETS_TO_BUILD="SPIRV" -DLLVM_INCLUDE_SPIRV_TOOLS_TESTS=ON' os_list: '["ubuntu-22.04"]' diff --git bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index d84da10b5bbe..0b6f21527f0a 100644 --- bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -134,10 +134,15 @@ public: using MCPlusBuilder::MCPlusBuilder; MCPhysReg getStackPointer() const override { return AArch64::SP; } + MCPhysReg getFramePointer() const override { return AArch64::FP; } - bool isPush(const MCInst &Inst) const override { return false; } + bool isPush(const MCInst &Inst) const override { + return isStoreToStack(Inst); + }; - bool isPop(const MCInst &Inst) const override { return false; } + bool isPop(const MCInst &Inst) const override { + return isLoadFromStack(Inst); + }; void createCall(MCInst &Inst, const MCSymbol *Target, MCContext *Ctx) override { @@ -244,59 +249,207 @@ public: } bool isLDRB(const MCInst &Inst) const { - return (Inst.getOpcode() == AArch64::LDRBBpost || - Inst.getOpcode() == AArch64::LDRBBpre || - Inst.getOpcode() == AArch64::LDRBBroW || - Inst.getOpcode() == AArch64::LDRBBroX || - Inst.getOpcode() == AArch64::LDRBBui || - Inst.getOpcode() == AArch64::LDRSBWpost || - Inst.getOpcode() == AArch64::LDRSBWpre || - Inst.getOpcode() == AArch64::LDRSBWroW || - Inst.getOpcode() == AArch64::LDRSBWroX || - Inst.getOpcode() == AArch64::LDRSBWui || - Inst.getOpcode() == AArch64::LDRSBXpost || - Inst.getOpcode() == AArch64::LDRSBXpre || - Inst.getOpcode() == AArch64::LDRSBXroW || - 
Inst.getOpcode() == AArch64::LDRSBXroX || - Inst.getOpcode() == AArch64::LDRSBXui); + const unsigned opcode = Inst.getOpcode(); + switch (opcode) { + case AArch64::LDRBpost: + case AArch64::LDRBBpost: + case AArch64::LDRBBpre: + case AArch64::LDRBBroW: + case AArch64::LDRBroW: + case AArch64::LDRBroX: + case AArch64::LDRBBroX: + case AArch64::LDRBBui: + case AArch64::LDRBui: + case AArch64::LDRBpre: + case AArch64::LDRSBWpost: + case AArch64::LDRSBWpre: + case AArch64::LDRSBWroW: + case AArch64::LDRSBWroX: + case AArch64::LDRSBWui: + case AArch64::LDRSBXpost: + case AArch64::LDRSBXpre: + case AArch64::LDRSBXroW: + case AArch64::LDRSBXroX: + case AArch64::LDRSBXui: + case AArch64::LDURBi: + case AArch64::LDURBBi: + case AArch64::LDURSBWi: + case AArch64::LDURSBXi: + case AArch64::LDTRBi: + case AArch64::LDTRSBWi: + case AArch64::LDTRSBXi: + return true; + default: + break; + } + + return false; } bool isLDRH(const MCInst &Inst) const { - return (Inst.getOpcode() == AArch64::LDRHHpost || - Inst.getOpcode() == AArch64::LDRHHpre || - Inst.getOpcode() == AArch64::LDRHHroW || - Inst.getOpcode() == AArch64::LDRHHroX || - Inst.getOpcode() == AArch64::LDRHHui || - Inst.getOpcode() == AArch64::LDRSHWpost || - Inst.getOpcode() == AArch64::LDRSHWpre || - Inst.getOpcode() == AArch64::LDRSHWroW || - Inst.getOpcode() == AArch64::LDRSHWroX || - Inst.getOpcode() == AArch64::LDRSHWui || - Inst.getOpcode() == AArch64::LDRSHXpost || - Inst.getOpcode() == AArch64::LDRSHXpre || - Inst.getOpcode() == AArch64::LDRSHXroW || - Inst.getOpcode() == AArch64::LDRSHXroX || - Inst.getOpcode() == AArch64::LDRSHXui); + const unsigned opcode = Inst.getOpcode(); + switch (opcode) { + case AArch64::LDRHpost: + case AArch64::LDRHHpost: + case AArch64::LDRHHpre: + case AArch64::LDRHroW: + case AArch64::LDRHHroW: + case AArch64::LDRHroX: + case AArch64::LDRHHroX: + case AArch64::LDRHHui: + case AArch64::LDRHui: + case AArch64::LDRHpre: + case AArch64::LDRSHWpost: + case AArch64::LDRSHWpre: + case AArch64::LDRSHWroW: + case AArch64::LDRSHWroX: + case AArch64::LDRSHWui: + case AArch64::LDRSHXpost: + case AArch64::LDRSHXpre: + case AArch64::LDRSHXroW: + case AArch64::LDRSHXroX: + case AArch64::LDRSHXui: + case AArch64::LDURHi: + case AArch64::LDURHHi: + case AArch64::LDURSHWi: + case AArch64::LDURSHXi: + case AArch64::LDTRHi: + case AArch64::LDTRSHWi: + case AArch64::LDTRSHXi: + return true; + default: + break; + } + + return false; } bool isLDRW(const MCInst &Inst) const { - return (Inst.getOpcode() == AArch64::LDRWpost || - Inst.getOpcode() == AArch64::LDRWpre || - Inst.getOpcode() == AArch64::LDRWroW || - Inst.getOpcode() == AArch64::LDRWroX || - Inst.getOpcode() == AArch64::LDRWui); + const unsigned opcode = Inst.getOpcode(); + switch (opcode) { + case AArch64::LDRWpost: + case AArch64::LDRWpre: + case AArch64::LDRWroW: + case AArch64::LDRWroX: + case AArch64::LDRWui: + case AArch64::LDRWl: + case AArch64::LDRSWl: + case AArch64::LDURWi: + case AArch64::LDRSWpost: + case AArch64::LDRSWpre: + case AArch64::LDRSWroW: + case AArch64::LDRSWroX: + case AArch64::LDRSWui: + case AArch64::LDURSWi: + case AArch64::LDTRWi: + case AArch64::LDTRSWi: + case AArch64::LDPWi: + case AArch64::LDPWpost: + case AArch64::LDPWpre: + case AArch64::LDPSWi: + case AArch64::LDPSWpost: + case AArch64::LDPSWpre: + case AArch64::LDNPWi: + return true; + default: + break; + } + + return false; } bool isLDRX(const MCInst &Inst) const { - return (Inst.getOpcode() == AArch64::LDRXpost || - Inst.getOpcode() == AArch64::LDRXpre || - Inst.getOpcode() == 
AArch64::LDRXroW || - Inst.getOpcode() == AArch64::LDRXroX || - Inst.getOpcode() == AArch64::LDRXui); + const unsigned opcode = Inst.getOpcode(); + switch (opcode) { + case AArch64::LDRXpost: + case AArch64::LDRXpre: + case AArch64::LDRXroW: + case AArch64::LDRXroX: + case AArch64::LDRXui: + case AArch64::LDRXl: + case AArch64::LDURXi: + case AArch64::LDTRXi: + case AArch64::LDNPXi: + case AArch64::LDPXi: + case AArch64::LDPXpost: + case AArch64::LDPXpre: + return true; + default: + break; + } + + return false; + } + + bool isLDRS(const MCInst &Inst) const { + const unsigned opcode = Inst.getOpcode(); + switch (opcode) { + case AArch64::LDRSl: + case AArch64::LDRSui: + case AArch64::LDRSroW: + case AArch64::LDRSroX: + case AArch64::LDURSi: + case AArch64::LDPSi: + case AArch64::LDNPSi: + case AArch64::LDRSpre: + case AArch64::LDRSpost: + case AArch64::LDPSpost: + case AArch64::LDPSpre: + return true; + default: + break; + } + + return false; + } + + bool isLDRD(const MCInst &Inst) const { + const unsigned opcode = Inst.getOpcode(); + switch (opcode) { + case AArch64::LDRDl: + case AArch64::LDRDui: + case AArch64::LDRDpre: + case AArch64::LDRDpost: + case AArch64::LDRDroW: + case AArch64::LDRDroX: + case AArch64::LDURDi: + case AArch64::LDPDi: + case AArch64::LDNPDi: + case AArch64::LDPDpost: + case AArch64::LDPDpre: + return true; + default: + break; + } + + return false; + } + + bool isLDRQ(const MCInst &Inst) const { + const unsigned opcode = Inst.getOpcode(); + switch (opcode) { + case AArch64::LDRQui: + case AArch64::LDRQl: + case AArch64::LDRQpre: + case AArch64::LDRQpost: + case AArch64::LDRQroW: + case AArch64::LDRQroX: + case AArch64::LDURQi: + case AArch64::LDPQi: + case AArch64::LDNPQi: + case AArch64::LDPQpost: + case AArch64::LDPQpre: + return true; + default: + break; + } + + return false; } bool mayLoad(const MCInst &Inst) const override { - return isLDRB(Inst) || isLDRH(Inst) || isLDRW(Inst) || isLDRX(Inst); + return isLDRB(Inst) || isLDRH(Inst) || isLDRW(Inst) || isLDRX(Inst) || + isLDRQ(Inst) || isLDRD(Inst) || isLDRS(Inst); } bool isAArch64ExclusiveLoad(const MCInst &Inst) const override { @@ -340,8 +493,7 @@ public: if (!Operand.isReg()) continue; unsigned Reg = Operand.getReg(); - if (Reg == AArch64::SP || Reg == AArch64::WSP || Reg == AArch64::FP || - Reg == AArch64::W29) + if (Reg == AArch64::SP || Reg == AArch64::WSP) return true; } return false; @@ -1170,7 +1322,209 @@ public: Inst.addOperand(MCOperand::createImm(0)); } - bool mayStore(const MCInst &Inst) const override { return false; } + bool isStorePair(const MCInst &Inst) const { + const unsigned opcode = Inst.getOpcode(); + + auto isStorePairImmOffset = [&]() { + switch (opcode) { + case AArch64::STPWi: + case AArch64::STPXi: + case AArch64::STPSi: + case AArch64::STPDi: + case AArch64::STPQi: + case AArch64::STNPWi: + case AArch64::STNPXi: + case AArch64::STNPSi: + case AArch64::STNPDi: + case AArch64::STNPQi: + return true; + default: + break; + } + + return false; + }; + + auto isStorePairPostIndex = [&]() { + switch (opcode) { + case AArch64::STPWpost: + case AArch64::STPXpost: + case AArch64::STPSpost: + case AArch64::STPDpost: + case AArch64::STPQpost: + return true; + default: + break; + } + + return false; + }; + + auto isStorePairPreIndex = [&]() { + switch (opcode) { + case AArch64::STPWpre: + case AArch64::STPXpre: + case AArch64::STPSpre: + case AArch64::STPDpre: + case AArch64::STPQpre: + return true; + default: + break; + } + + return false; + }; + + return isStorePairImmOffset() || 
isStorePairPostIndex() || + isStorePairPreIndex(); + } + + bool isStoreReg(const MCInst &Inst) const { + const unsigned opcode = Inst.getOpcode(); + + auto isStoreRegUnscaleImm = [&]() { + switch (opcode) { + case AArch64::STURBi: + case AArch64::STURBBi: + case AArch64::STURHi: + case AArch64::STURHHi: + case AArch64::STURWi: + case AArch64::STURXi: + case AArch64::STURSi: + case AArch64::STURDi: + case AArch64::STURQi: + return true; + default: + break; + } + + return false; + }; + + auto isStoreRegScaledImm = [&]() { + switch (opcode) { + case AArch64::STRBui: + case AArch64::STRBBui: + case AArch64::STRHui: + case AArch64::STRHHui: + case AArch64::STRWui: + case AArch64::STRXui: + case AArch64::STRSui: + case AArch64::STRDui: + case AArch64::STRQui: + return true; + default: + break; + } + + return false; + }; + + auto isStoreRegImmPostIndexed = [&]() { + switch (opcode) { + case AArch64::STRBpost: + case AArch64::STRBBpost: + case AArch64::STRHpost: + case AArch64::STRHHpost: + case AArch64::STRWpost: + case AArch64::STRXpost: + case AArch64::STRSpost: + case AArch64::STRDpost: + case AArch64::STRQpost: + return true; + default: + break; + } + + return false; + }; + + auto isStoreRegImmPreIndexed = [&]() { + switch (opcode) { + case AArch64::STRBpre: + case AArch64::STRBBpre: + case AArch64::STRHpre: + case AArch64::STRHHpre: + case AArch64::STRWpre: + case AArch64::STRXpre: + case AArch64::STRSpre: + case AArch64::STRDpre: + case AArch64::STRQpre: + return true; + default: + break; + } + + return false; + }; + + auto isStoreRegUnscaleUnpriv = [&]() { + switch (opcode) { + case AArch64::STTRBi: + case AArch64::STTRHi: + case AArch64::STTRWi: + case AArch64::STTRXi: + return true; + default: + break; + } + + return false; + }; + + auto isStoreRegTrunc = [&]() { + switch (opcode) { + case AArch64::STRBBroW: + case AArch64::STRBBroX: + case AArch64::STRBroW: + case AArch64::STRBroX: + case AArch64::STRDroW: + case AArch64::STRDroX: + case AArch64::STRHHroW: + case AArch64::STRHHroX: + case AArch64::STRHroW: + case AArch64::STRHroX: + case AArch64::STRQroW: + case AArch64::STRQroX: + case AArch64::STRSroW: + case AArch64::STRSroX: + case AArch64::STRWroW: + case AArch64::STRWroX: + case AArch64::STRXroW: + case AArch64::STRXroX: + return true; + default: + break; + } + + return false; + }; + + return isStoreRegUnscaleImm() || isStoreRegScaledImm() || + isStoreRegImmPreIndexed() || isStoreRegImmPostIndexed() || + isStoreRegUnscaleUnpriv() || isStoreRegTrunc(); + } + + bool mayStore(const MCInst &Inst) const override { + return isStorePair(Inst) || isStoreReg(Inst) || + isAArch64ExclusiveStore(Inst); + } + + bool isStoreToStack(const MCInst &Inst) const { + if (!mayStore(Inst)) + return false; + + for (const MCOperand &Operand : useOperands(Inst)) { + if (!Operand.isReg()) + continue; + + unsigned Reg = Operand.getReg(); + if (Reg == AArch64::SP || Reg == AArch64::WSP) + return true; + } + + return false; + } void createDirectCall(MCInst &Inst, const MCSymbol *Target, MCContext *Ctx, bool IsTailCall) override { diff --git clang-tools-extra/clang-tidy/misc/UnusedUsingDeclsCheck.h clang-tools-extra/clang-tidy/misc/UnusedUsingDeclsCheck.h index 7bdaf12e8aec..e5f766dbac56 100644 --- clang-tools-extra/clang-tidy/misc/UnusedUsingDeclsCheck.h +++ clang-tools-extra/clang-tidy/misc/UnusedUsingDeclsCheck.h @@ -51,7 +51,6 @@ private: std::vector<UsingDeclContext> Contexts; llvm::SmallPtrSet<const Decl *, 32> UsingTargetDeclsCache; - StringRef RawStringHeaderFileExtensions; FileExtensionsSet 
HeaderFileExtensions; }; diff --git clang-tools-extra/clangd/CMakeLists.txt clang-tools-extra/clangd/CMakeLists.txt index d797ddce8c44..6f10afe4a562 100644 --- clang-tools-extra/clangd/CMakeLists.txt +++ clang-tools-extra/clangd/CMakeLists.txt @@ -91,7 +91,6 @@ add_clang_library(clangDaemon STATIC GlobalCompilationDatabase.cpp Headers.cpp HeaderSourceSwitch.cpp - HeuristicResolver.cpp Hover.cpp IncludeCleaner.cpp IncludeFixer.cpp diff --git clang-tools-extra/clangd/FindTarget.cpp clang-tools-extra/clangd/FindTarget.cpp index e702c6b3537a..bb4c91b83135 100644 --- clang-tools-extra/clangd/FindTarget.cpp +++ clang-tools-extra/clangd/FindTarget.cpp @@ -8,7 +8,6 @@ #include "FindTarget.h" #include "AST.h" -#include "HeuristicResolver.h" #include "support/Logger.h" #include "clang/AST/ASTConcept.h" #include "clang/AST/ASTTypeTraits.h" @@ -35,6 +34,7 @@ #include "clang/Basic/SourceLocation.h" #include "clang/Basic/SourceManager.h" #include "clang/Basic/Specifiers.h" +#include "clang/Sema/HeuristicResolver.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" @@ -496,8 +496,7 @@ public: return; case NestedNameSpecifier::Identifier: if (Resolver) { - add(QualType(Resolver->resolveNestedNameSpecifierToType(NNS), 0), - Flags); + add(Resolver->resolveNestedNameSpecifierToType(NNS), Flags); } return; case NestedNameSpecifier::TypeSpec: diff --git clang-tools-extra/clangd/FindTarget.h clang-tools-extra/clangd/FindTarget.h index b41c54709510..a7706804ce7e 100644 --- clang-tools-extra/clangd/FindTarget.h +++ clang-tools-extra/clangd/FindTarget.h @@ -33,9 +33,11 @@ #include <bitset> namespace clang { -namespace clangd { + class HeuristicResolver; +namespace clangd { + /// Describes the link between an AST node and a Decl it refers to. enum class DeclRelation : unsigned; /// A bitfield of DeclRelations. 
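The FindTarget.h hunk above moves the ``HeuristicResolver`` forward declaration out of the nested ``clangd`` namespace and into the enclosing ``clang`` namespace, matching the class's new home in clang/Sema. The move matters because a forward declaration in the wrong namespace silently introduces a different type; a minimal sketch of the failure mode being avoided (``Consumer`` is a hypothetical stand-in, not code from this patch):

    namespace clang {
    class HeuristicResolver; // the definition now lives in clang/Sema

    namespace clangd {
    // Had the old nested declaration been kept here:
    //   class HeuristicResolver;
    // the member below would name clang::clangd::HeuristicResolver, a
    // distinct and never-defined type, instead of the one clang/Sema defines.
    struct Consumer {
      const HeuristicResolver *Resolver = nullptr; // clang::HeuristicResolver
    };
    } // namespace clangd
    } // namespace clang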
diff --git clang-tools-extra/clangd/Hover.cpp clang-tools-extra/clangd/Hover.cpp index 5e136d0e76ec..3ab3d8903052 100644 --- clang-tools-extra/clangd/Hover.cpp +++ clang-tools-extra/clangd/Hover.cpp @@ -1193,12 +1193,13 @@ void maybeAddSymbolProviders(ParsedAST &AST, HoverInfo &HI, include_cleaner::Symbol Sym) { trace::Span Tracer("Hover::maybeAddSymbolProviders"); - const SourceManager &SM = AST.getSourceManager(); llvm::SmallVector<include_cleaner::Header> RankedProviders = - include_cleaner::headersForSymbol(Sym, SM, &AST.getPragmaIncludes()); + include_cleaner::headersForSymbol(Sym, AST.getPreprocessor(), + &AST.getPragmaIncludes()); if (RankedProviders.empty()) return; + const SourceManager &SM = AST.getSourceManager(); std::string Result; include_cleaner::Includes ConvertedIncludes = convertIncludes(AST); for (const auto &P : RankedProviders) { diff --git clang-tools-extra/clangd/InlayHints.cpp clang-tools-extra/clangd/InlayHints.cpp index fefffeb4efc1..1b1bcf78c985 100644 --- clang-tools-extra/clangd/InlayHints.cpp +++ clang-tools-extra/clangd/InlayHints.cpp @@ -9,7 +9,6 @@ #include "../clang-tidy/utils/DesignatedInitializers.h" #include "AST.h" #include "Config.h" -#include "HeuristicResolver.h" #include "ParsedAST.h" #include "Protocol.h" #include "SourceCode.h" @@ -27,6 +26,7 @@ #include "clang/Basic/OperatorKinds.h" #include "clang/Basic/SourceLocation.h" #include "clang/Basic/SourceManager.h" +#include "clang/Sema/HeuristicResolver.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" diff --git clang-tools-extra/clangd/ParsedAST.cpp clang-tools-extra/clangd/ParsedAST.cpp index 725cbeb154cb..89d6f26d0f15 100644 --- clang-tools-extra/clangd/ParsedAST.cpp +++ clang-tools-extra/clangd/ParsedAST.cpp @@ -20,7 +20,6 @@ #include "Feature.h" #include "FeatureModule.h" #include "Headers.h" -#include "HeuristicResolver.h" #include "IncludeCleaner.h" #include "IncludeFixer.h" #include "Preamble.h" @@ -53,6 +52,7 @@ #include "clang/Lex/Lexer.h" #include "clang/Lex/PPCallbacks.h" #include "clang/Lex/Preprocessor.h" +#include "clang/Sema/HeuristicResolver.h" #include "clang/Serialization/ASTWriter.h" #include "clang/Tooling/CompilationDatabase.h" #include "clang/Tooling/Core/Diagnostic.h" diff --git clang-tools-extra/clangd/ParsedAST.h clang-tools-extra/clangd/ParsedAST.h index 8d9d1e645692..82fac9636048 100644 --- clang-tools-extra/clangd/ParsedAST.h +++ clang-tools-extra/clangd/ParsedAST.h @@ -38,9 +38,9 @@ #include <vector> namespace clang { +class HeuristicResolver; class Sema; namespace clangd { -class HeuristicResolver; /// Stores and provides access to parsed AST. 
class ParsedAST { diff --git clang-tools-extra/clangd/SemanticHighlighting.cpp clang-tools-extra/clangd/SemanticHighlighting.cpp index e6d16af2495f..86ca05644c70 100644 --- clang-tools-extra/clangd/SemanticHighlighting.cpp +++ clang-tools-extra/clangd/SemanticHighlighting.cpp @@ -9,7 +9,6 @@ #include "SemanticHighlighting.h" #include "Config.h" #include "FindTarget.h" -#include "HeuristicResolver.h" #include "ParsedAST.h" #include "Protocol.h" #include "SourceCode.h" @@ -27,6 +26,7 @@ #include "clang/Basic/LangOptions.h" #include "clang/Basic/SourceLocation.h" #include "clang/Basic/SourceManager.h" +#include "clang/Sema/HeuristicResolver.h" #include "clang/Tooling/Syntax/Tokens.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" diff --git clang-tools-extra/clangd/XRefs.cpp clang-tools-extra/clangd/XRefs.cpp index 8d5ab2e491a4..1a23f6cca775 100644 --- clang-tools-extra/clangd/XRefs.cpp +++ clang-tools-extra/clangd/XRefs.cpp @@ -10,7 +10,6 @@ #include "FindSymbols.h" #include "FindTarget.h" #include "Headers.h" -#include "HeuristicResolver.h" #include "IncludeCleaner.h" #include "ParsedAST.h" #include "Protocol.h" @@ -53,6 +52,7 @@ #include "clang/Index/IndexingOptions.h" #include "clang/Index/USRGeneration.h" #include "clang/Lex/Lexer.h" +#include "clang/Sema/HeuristicResolver.h" #include "clang/Tooling/Syntax/Tokens.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" @@ -2034,9 +2034,10 @@ static void unwrapFindType( // For smart pointer types, add the underlying type if (H) - if (const auto* PointeeType = H->getPointeeType(T.getNonReferenceType().getTypePtr())) { - unwrapFindType(QualType(PointeeType, 0), H, Out); - return Out.push_back(T); + if (auto PointeeType = H->getPointeeType(T.getNonReferenceType()); + !PointeeType.isNull()) { + unwrapFindType(PointeeType, H, Out); + return Out.push_back(T); } return Out.push_back(T); diff --git clang-tools-extra/clangd/index/SymbolCollector.cpp clang-tools-extra/clangd/index/SymbolCollector.cpp index 6d0af20e3126..1de7faf81746 100644 --- clang-tools-extra/clangd/index/SymbolCollector.cpp +++ clang-tools-extra/clangd/index/SymbolCollector.cpp @@ -888,7 +888,7 @@ void SymbolCollector::setIncludeLocation(const Symbol &S, SourceLocation DefLoc, // might run while parsing, rather than at the end of a translation unit. // Hence we see more and more redecls over time. SymbolProviders[S.ID] = - include_cleaner::headersForSymbol(Sym, SM, Opts.PragmaIncludes); + include_cleaner::headersForSymbol(Sym, *PP, Opts.PragmaIncludes); } llvm::StringRef getStdHeader(const Symbol *S, const LangOptions &LangOpts) { diff --git clang-tools-extra/clangd/unittests/CMakeLists.txt clang-tools-extra/clangd/unittests/CMakeLists.txt index 8dba8088908d..dffdcd5d014c 100644 --- clang-tools-extra/clangd/unittests/CMakeLists.txt +++ clang-tools-extra/clangd/unittests/CMakeLists.txt @@ -64,7 +64,6 @@ add_unittest(ClangdUnitTests ClangdTests GlobalCompilationDatabaseTests.cpp HeadersTests.cpp HeaderSourceSwitchTests.cpp - HeuristicResolverTests.cpp HoverTests.cpp IncludeCleanerTests.cpp IndexActionTests.cpp diff --git clang-tools-extra/docs/clang-tidy/Contributing.rst clang-tools-extra/docs/clang-tidy/Contributing.rst index ff8b05ff263c..4f1df8d11444 100644 --- clang-tools-extra/docs/clang-tidy/Contributing.rst +++ clang-tools-extra/docs/clang-tidy/Contributing.rst @@ -331,7 +331,7 @@ a starting point for your test cases. A rough outline of the process looks like - Issue the necessary diagnostics and fix-its in the ``check`` method. 
- Add the necessary ``CHECK-MESSAGES`` and ``CHECK-FIXES`` annotations to your test case to validate the diagnostics and fix-its. -- Build the target ``check-clang-tool`` to confirm the test passes. +- Build the target ``check-clang-tools`` to confirm the test passes. - Repeat the process until all aspects of your check are covered by tests. The quickest way to prototype your matcher is to use :program:`clang-query` to @@ -519,8 +519,8 @@ the check implements and what the current values are (e.g. for the public: MyCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), - SomeOption(Options.get("SomeOption1", -1U)), - SomeOption(Options.get("SomeOption2", "some default")) {} + SomeOption1(Options.get("SomeOption1", -1U)), + SomeOption2(Options.get("SomeOption2", "some default")) {} void storeOptions(ClangTidyOptions::OptionMap &Opts) override { Options.store(Opts, "SomeOption1", SomeOption1); diff --git clang-tools-extra/include-cleaner/include/clang-include-cleaner/Analysis.h clang-tools-extra/include-cleaner/include/clang-include-cleaner/Analysis.h index 46ca3c9d0807..c3241763237d 100644 --- clang-tools-extra/include-cleaner/include/clang-include-cleaner/Analysis.h +++ clang-tools-extra/include-cleaner/include/clang-include-cleaner/Analysis.h @@ -90,7 +90,7 @@ std::string fixIncludes(const AnalysisResults &Results, /// Returned headers are sorted by relevance, first element is the most /// likely provider for the symbol. llvm::SmallVector<Header> headersForSymbol(const Symbol &S, - const SourceManager &SM, + const Preprocessor &PP, const PragmaIncludes *PI); } // namespace include_cleaner } // namespace clang diff --git clang-tools-extra/include-cleaner/lib/Analysis.cpp clang-tools-extra/include-cleaner/lib/Analysis.cpp index e3a4834cb19a..a1781f4e24f2 100644 --- clang-tools-extra/include-cleaner/lib/Analysis.cpp +++ clang-tools-extra/include-cleaner/lib/Analysis.cpp @@ -64,7 +64,7 @@ void walkUsed(llvm::ArrayRef<Decl *> ASTRoots, // FIXME: Most of the work done here is repetitive. It might be useful to // have a cache/batching. SymbolReference SymRef{ND, Loc, RT}; - return CB(SymRef, headersForSymbol(ND, SM, PI)); + return CB(SymRef, headersForSymbol(ND, PP, PI)); }); } for (const SymbolReference &MacroRef : MacroRefs) { @@ -72,7 +72,7 @@ void walkUsed(llvm::ArrayRef<Decl *> ASTRoots, if (!SM.isWrittenInMainFile(SM.getSpellingLoc(MacroRef.RefLocation)) || shouldIgnoreMacroReference(PP, MacroRef.Target.macro())) continue; - CB(MacroRef, headersForSymbol(MacroRef.Target, SM, PI)); + CB(MacroRef, headersForSymbol(MacroRef.Target, PP, PI)); } } diff --git clang-tools-extra/include-cleaner/lib/AnalysisInternal.h clang-tools-extra/include-cleaner/lib/AnalysisInternal.h index cd796c2da7b8..7d170fd15014 100644 --- clang-tools-extra/include-cleaner/lib/AnalysisInternal.h +++ clang-tools-extra/include-cleaner/lib/AnalysisInternal.h @@ -25,6 +25,8 @@ #include "clang-include-cleaner/Analysis.h" #include "clang-include-cleaner/Record.h" #include "clang-include-cleaner/Types.h" +#include "clang/Basic/LangOptions.h" +#include "clang/Lex/Preprocessor.h" #include "llvm/ADT/STLFunctionalExtras.h" #include <vector> @@ -57,13 +59,14 @@ llvm::SmallVector<Hinted<Header>> findHeaders(const SymbolLocation &Loc, const PragmaIncludes *PI); /// A set of locations that provides the declaration. 
-std::vector<Hinted<SymbolLocation>> locateSymbol(const Symbol &S); +std::vector<Hinted<SymbolLocation>> locateSymbol(const Symbol &S, + const LangOptions &LO); /// Write an HTML summary of the analysis to the given stream. void writeHTMLReport(FileID File, const Includes &, llvm::ArrayRef<Decl *> Roots, llvm::ArrayRef<SymbolReference> MacroRefs, ASTContext &Ctx, - const HeaderSearch &HS, PragmaIncludes *PI, + const Preprocessor &PP, PragmaIncludes *PI, llvm::raw_ostream &OS); } // namespace include_cleaner diff --git clang-tools-extra/include-cleaner/lib/FindHeaders.cpp clang-tools-extra/include-cleaner/lib/FindHeaders.cpp index 7b28d1c252d7..b96d9a70728c 100644 --- clang-tools-extra/include-cleaner/lib/FindHeaders.cpp +++ clang-tools-extra/include-cleaner/lib/FindHeaders.cpp @@ -18,6 +18,7 @@ #include "clang/Basic/FileEntry.h" #include "clang/Basic/SourceLocation.h" #include "clang/Basic/SourceManager.h" +#include "clang/Lex/Preprocessor.h" #include "clang/Tooling/Inclusions/StandardLibrary.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" @@ -239,8 +240,9 @@ llvm::SmallVector<Hinted<Header>> findHeaders(const SymbolLocation &Loc, } llvm::SmallVector<Header> headersForSymbol(const Symbol &S, - const SourceManager &SM, + const Preprocessor &PP, const PragmaIncludes *PI) { + const auto &SM = PP.getSourceManager(); // Get headers for all the locations providing Symbol. Same header can be // reached through different traversals, deduplicate those into a single // Header by merging their hints. @@ -248,7 +250,7 @@ llvm::SmallVector<Header> headersForSymbol(const Symbol &S, if (auto SpecialHeaders = headersForSpecialSymbol(S, SM, PI)) { Headers = std::move(*SpecialHeaders); } else { - for (auto &Loc : locateSymbol(S)) + for (auto &Loc : locateSymbol(S, PP.getLangOpts())) Headers.append(applyHints(findHeaders(Loc, SM, PI), Loc.Hint)); } // If two Headers probably refer to the same file (e.g. Verbatim(foo.h) and diff --git clang-tools-extra/include-cleaner/lib/HTMLReport.cpp clang-tools-extra/include-cleaner/lib/HTMLReport.cpp index bbe8bc230c6e..92c7c554ca50 100644 --- clang-tools-extra/include-cleaner/lib/HTMLReport.cpp +++ clang-tools-extra/include-cleaner/lib/HTMLReport.cpp @@ -21,6 +21,7 @@ #include "clang/Basic/SourceManager.h" #include "clang/Lex/HeaderSearch.h" #include "clang/Lex/Lexer.h" +#include "clang/Lex/Preprocessor.h" #include "clang/Tooling/Inclusions/StandardLibrary.h" #include "llvm/Support/ScopedPrinter.h" #include "llvm/Support/raw_ostream.h" @@ -135,7 +136,7 @@ class Reporter { llvm::raw_ostream &OS; const ASTContext &Ctx; const SourceManager &SM; - const HeaderSearch &HS; + const Preprocessor &PP; const include_cleaner::Includes &Includes; const PragmaIncludes *PI; FileID MainFile; @@ -170,9 +171,9 @@ class Reporter { void fillTarget(Ref &R) { // Duplicates logic from walkUsed(), which doesn't expose SymbolLocations. 
- for (auto &Loc : locateSymbol(R.Sym)) + for (auto &Loc : locateSymbol(R.Sym, Ctx.getLangOpts())) R.Locations.push_back(Loc); - R.Headers = headersForSymbol(R.Sym, SM, PI); + R.Headers = headersForSymbol(R.Sym, PP, PI); for (const auto &H : R.Headers) { R.Includes.append(Includes.match(H)); @@ -189,14 +190,15 @@ class Reporter { R.Includes.end()); if (!R.Headers.empty()) - R.Insert = spellHeader({R.Headers.front(), HS, MainFE}); + R.Insert = + spellHeader({R.Headers.front(), PP.getHeaderSearchInfo(), MainFE}); } public: - Reporter(llvm::raw_ostream &OS, ASTContext &Ctx, const HeaderSearch &HS, + Reporter(llvm::raw_ostream &OS, ASTContext &Ctx, const Preprocessor &PP, const include_cleaner::Includes &Includes, const PragmaIncludes *PI, FileID MainFile) - : OS(OS), Ctx(Ctx), SM(Ctx.getSourceManager()), HS(HS), + : OS(OS), Ctx(Ctx), SM(Ctx.getSourceManager()), PP(PP), Includes(Includes), PI(PI), MainFile(MainFile), MainFE(SM.getFileEntryForID(MainFile)) {} @@ -498,9 +500,9 @@ private: void writeHTMLReport(FileID File, const include_cleaner::Includes &Includes, llvm::ArrayRef<Decl *> Roots, llvm::ArrayRef<SymbolReference> MacroRefs, ASTContext &Ctx, - const HeaderSearch &HS, PragmaIncludes *PI, + const Preprocessor &PP, PragmaIncludes *PI, llvm::raw_ostream &OS) { - Reporter R(OS, Ctx, HS, Includes, PI, File); + Reporter R(OS, Ctx, PP, Includes, PI, File); const auto& SM = Ctx.getSourceManager(); for (Decl *Root : Roots) walkAST(*Root, [&](SourceLocation Loc, const NamedDecl &D, RefType T) { diff --git clang-tools-extra/include-cleaner/lib/LocateSymbol.cpp clang-tools-extra/include-cleaner/lib/LocateSymbol.cpp index 78e783a62eb2..b7433305152f 100644 --- clang-tools-extra/include-cleaner/lib/LocateSymbol.cpp +++ clang-tools-extra/include-cleaner/lib/LocateSymbol.cpp @@ -54,20 +54,24 @@ std::vector<Hinted<SymbolLocation>> locateDecl(const Decl &D) { return Result; } -std::vector<Hinted<SymbolLocation>> locateMacro(const Macro &M) { +std::vector<Hinted<SymbolLocation>> locateMacro(const Macro &M, + const tooling::stdlib::Lang L) { // FIXME: Should we also provide physical locations? - if (auto SS = tooling::stdlib::Symbol::named("", M.Name->getName())) + if (auto SS = tooling::stdlib::Symbol::named("", M.Name->getName(), L)) return {{*SS, Hints::CompleteSymbol}}; return {{M.Definition, Hints::CompleteSymbol}}; } } // namespace -std::vector<Hinted<SymbolLocation>> locateSymbol(const Symbol &S) { +std::vector<Hinted<SymbolLocation>> locateSymbol(const Symbol &S, + const LangOptions &LO) { + const auto L = !LO.CPlusPlus && LO.C99 ? 
tooling::stdlib::Lang::C + : tooling::stdlib::Lang::CXX; switch (S.kind()) { case Symbol::Declaration: return locateDecl(S.declaration()); case Symbol::Macro: - return locateMacro(S.macro()); + return locateMacro(S.macro(), L); } llvm_unreachable("Unknown Symbol::Kind enum"); } diff --git clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp index f85dbc0e0c31..1d9458ffc4d3 100644 --- clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp +++ clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp @@ -216,10 +216,9 @@ private: ++Errors; return; } - writeHTMLReport( - AST.Ctx->getSourceManager().getMainFileID(), PP.Includes, AST.Roots, - PP.MacroReferences, *AST.Ctx, - getCompilerInstance().getPreprocessor().getHeaderSearchInfo(), &PI, OS); + writeHTMLReport(AST.Ctx->getSourceManager().getMainFileID(), PP.Includes, + AST.Roots, PP.MacroReferences, *AST.Ctx, + getCompilerInstance().getPreprocessor(), &PI, OS); } }; class ActionFactory : public tooling::FrontendActionFactory { diff --git clang-tools-extra/include-cleaner/unittests/FindHeadersTest.cpp clang-tools-extra/include-cleaner/unittests/FindHeadersTest.cpp index 84e02e1d0d62..0ac243937e6e 100644 --- clang-tools-extra/include-cleaner/unittests/FindHeadersTest.cpp +++ clang-tools-extra/include-cleaner/unittests/FindHeadersTest.cpp @@ -306,7 +306,7 @@ protected: if (!V.Out) ADD_FAILURE() << "Couldn't find any decls named " << Name << "."; assert(V.Out); - return headersForSymbol(*V.Out, AST->sourceManager(), &PI); + return headersForSymbol(*V.Out, AST->preprocessor(), &PI); } llvm::SmallVector<Header> headersForFoo() { return headersFor("foo"); } }; @@ -611,13 +611,12 @@ TEST_F(HeadersForSymbolTest, AmbiguousStdSymbolsUsingShadow) { Visitor V; V.TraverseDecl(AST->context().getTranslationUnitDecl()); ASSERT_TRUE(V.Out) << "Couldn't find a DeclRefExpr!"; - EXPECT_THAT(headersForSymbol(*(V.Out->getFoundDecl()), - AST->sourceManager(), &PI), - UnorderedElementsAre( - Header(*tooling::stdlib::Header::named("<cstdio>")))); + EXPECT_THAT( + headersForSymbol(*(V.Out->getFoundDecl()), AST->preprocessor(), &PI), + UnorderedElementsAre( + Header(*tooling::stdlib::Header::named("<cstdio>")))); } - TEST_F(HeadersForSymbolTest, StandardHeaders) { Inputs.Code = R"cpp( #include "stdlib_internal.h" @@ -636,6 +635,30 @@ TEST_F(HeadersForSymbolTest, StandardHeaders) { tooling::stdlib::Header::named("<assert.h>"))); } +TEST_F(HeadersForSymbolTest, StdlibLangForMacros) { + Inputs.Code = R"cpp( + #define EOF 0 + void foo() { EOF; } + )cpp"; + { + buildAST(); + const Macro Eof{AST->preprocessor().getIdentifierInfo("EOF"), {}}; + EXPECT_THAT( + headersForSymbol(Eof, AST->preprocessor(), nullptr), + UnorderedElementsAre(tooling::stdlib::Header::named("<cstdio>"), + tooling::stdlib::Header::named("<stdio.h>"))); + } + + { + Inputs.ExtraArgs.push_back("-xc"); + buildAST(); + const Macro Eof{AST->preprocessor().getIdentifierInfo("EOF"), {}}; + EXPECT_THAT(headersForSymbol(Eof, AST->preprocessor(), nullptr), + UnorderedElementsAre(tooling::stdlib::Header::named( + "<stdio.h>", tooling::stdlib::Lang::C))); + } +} + TEST_F(HeadersForSymbolTest, ExporterNoNameMatch) { Inputs.Code = R"cpp( #include "exporter/foo.h" diff --git clang-tools-extra/include-cleaner/unittests/LocateSymbolTest.cpp clang-tools-extra/include-cleaner/unittests/LocateSymbolTest.cpp index 756757cfd0f0..1e7baf142a75 100644 --- clang-tools-extra/include-cleaner/unittests/LocateSymbolTest.cpp +++ 
clang-tools-extra/include-cleaner/unittests/LocateSymbolTest.cpp @@ -11,6 +11,7 @@ #include "clang/AST/Decl.h" #include "clang/AST/DeclBase.h" #include "clang/AST/RecursiveASTVisitor.h" +#include "clang/Basic/LangOptions.h" #include "clang/Basic/SourceLocation.h" #include "clang/Lex/Preprocessor.h" #include "clang/Testing/TestAST.h" @@ -96,6 +97,8 @@ public: Results.emplace_back(SM.getComposedLoc(FID, Offset)); return Results; } + + const LangOptions &langOpts() { return AST.preprocessor().getLangOpts(); } }; TEST(LocateSymbol, Decl) { @@ -110,7 +113,7 @@ TEST(LocateSymbol, Decl) { for (auto &Case : Cases) { SCOPED_TRACE(Case); LocateExample Test(Case); - EXPECT_THAT(locateSymbol(Test.findDecl("foo")), + EXPECT_THAT(locateSymbol(Test.findDecl("foo"), Test.langOpts()), ElementsAreArray(Test.points())); } } @@ -119,12 +122,12 @@ TEST(LocateSymbol, Stdlib) { { LocateExample Test("namespace std { struct vector; }"); EXPECT_THAT( - locateSymbol(Test.findDecl("vector")), + locateSymbol(Test.findDecl("vector"), Test.langOpts()), ElementsAre(*tooling::stdlib::Symbol::named("std::", "vector"))); } { LocateExample Test("#define assert(x)\nvoid foo() { assert(true); }"); - EXPECT_THAT(locateSymbol(Test.findMacro("assert")), + EXPECT_THAT(locateSymbol(Test.findMacro("assert"), Test.langOpts()), ElementsAre(*tooling::stdlib::Symbol::named("", "assert"))); } } @@ -132,7 +135,7 @@ TEST(LocateSymbol, Stdlib) { TEST(LocateSymbol, Macros) { // Make sure we preserve the last one. LocateExample Test("#define FOO\n#undef FOO\n#define ^FOO"); - EXPECT_THAT(locateSymbol(Test.findMacro("FOO")), + EXPECT_THAT(locateSymbol(Test.findMacro("FOO"), Test.langOpts()), ElementsAreArray(Test.points())); } @@ -143,7 +146,7 @@ TEST(LocateSymbol, CompleteSymbolHint) { { // stdlib symbols are always complete. LocateExample Test("namespace std { struct vector; }"); - EXPECT_THAT(locateSymbol(Test.findDecl("vector")), + EXPECT_THAT(locateSymbol(Test.findDecl("vector"), Test.langOpts()), ElementsAre(HintedSymbol( *tooling::stdlib::Symbol::named("std::", "vector"), Hints::CompleteSymbol))); @@ -151,7 +154,7 @@ TEST(LocateSymbol, CompleteSymbolHint) { { // macros are always complete. LocateExample Test("#define ^FOO"); - EXPECT_THAT(locateSymbol(Test.findMacro("FOO")), + EXPECT_THAT(locateSymbol(Test.findMacro("FOO"), Test.langOpts()), ElementsAre(HintedSymbol(Test.points().front(), Hints::CompleteSymbol))); } @@ -165,7 +168,7 @@ TEST(LocateSymbol, CompleteSymbolHint) { for (auto &Case : Cases) { SCOPED_TRACE(Case); LocateExample Test(Case); - EXPECT_THAT(locateSymbol(Test.findDecl("foo")), + EXPECT_THAT(locateSymbol(Test.findDecl("foo"), Test.langOpts()), ElementsAre(HintedSymbol(Test.points().front(), Hints::None), HintedSymbol(Test.points().back(), Hints::CompleteSymbol))); @@ -181,7 +184,7 @@ TEST(LocateSymbol, CompleteSymbolHint) { for (auto &Case : Cases) { SCOPED_TRACE(Case); LocateExample Test(Case); - EXPECT_THAT(locateSymbol(Test.findDecl("foo")), + EXPECT_THAT(locateSymbol(Test.findDecl("foo"), Test.langOpts()), Each(Field(&Hinted<SymbolLocation>::Hint, Eq(Hints::CompleteSymbol)))); } diff --git clang/docs/ClangFormatStyleOptions.rst clang/docs/ClangFormatStyleOptions.rst index 0edf7af72c24..30a2325949f4 100644 --- clang/docs/ClangFormatStyleOptions.rst +++ clang/docs/ClangFormatStyleOptions.rst @@ -4213,6 +4213,21 @@ the configuration (without a prefix: ``Auto``). plop(); plop(); } } +.. 
_IndentExportBlock: + +**IndentExportBlock** (``Boolean``) :versionbadge:`clang-format 20` :ref:`¶ <IndentExportBlock>` + If ``true``, clang-format will indent the body of an ``export { ... }`` + block. This doesn't affect the formatting of anything else related to + exported declarations. + + .. code-block:: c++ + + true: false: + export { vs. export { + void foo(); void foo(); + void bar(); void bar(); + } } + .. _IndentExternBlock: + +**IndentExternBlock** (``IndentExternBlockStyle``) :versionbadge:`clang-format 11` :ref:`¶ <IndentExternBlock>` diff --git clang/docs/ClangOffloadBundler.rst clang/docs/ClangOffloadBundler.rst index 3c241027d405..bceb4060992f 100644 --- clang/docs/ClangOffloadBundler.rst +++ clang/docs/ClangOffloadBundler.rst @@ -542,3 +542,5 @@ The compressed offload bundle begins with a header followed by the compressed bi - **Compressed Data**: The actual compressed binary data follows the header. Its size can be inferred from the total size of the file minus the header size. + + .. note:: Version 3 of the format is under development. It uses 64-bit fields for Total File Size and Uncompressed Binary Size to support files larger than 4GB. To experiment with version 3, set the environment variable ``COMPRESSED_BUNDLE_FORMAT_VERSION=3``. This support is experimental and not recommended for production use. \ No newline at end of file diff --git clang/docs/ClangTransformerTutorial.rst clang/docs/ClangTransformerTutorial.rst index b07b83f80f17..e9b701203300 100644 --- clang/docs/ClangTransformerTutorial.rst +++ clang/docs/ClangTransformerTutorial.rst @@ -70,7 +70,7 @@ can express this as a Transformer rewrite rule: .. code-block:: c++ - makeRule(functionDecl(hasName("MkX").bind("fun"), + makeRule(functionDecl(hasName("MkX")).bind("fun"), noopEdit(node("fun")), cat("The name ``MkX`` is not allowed for functions; please rename")); diff --git clang/docs/LanguageExtensions.rst clang/docs/LanguageExtensions.rst index 2eb0777dbdc6..bbeeefe82282 100644 --- clang/docs/LanguageExtensions.rst +++ clang/docs/LanguageExtensions.rst @@ -434,6 +434,114 @@ __datasizeof ``__datasizeof`` behaves like ``sizeof``, except that it returns the size of the type ignoring tail padding. +_BitInt, _ExtInt +---------------- + +Clang supports the C23 ``_BitInt(N)`` feature as an extension in older C modes +and in C++. This type was previously implemented in Clang with the same +semantics, but spelled ``_ExtInt(N)``. This spelling has been deprecated in +favor of the standard type. + +Note: the ABI for ``_BitInt(N)`` is still in the process of being stabilized, +so this type should not yet be used in interfaces that require ABI stability. + +C keywords supported in all language modes +------------------------------------------ + +Clang supports ``_Alignas``, ``_Alignof``, ``_Atomic``, ``_Complex``, +``_Generic``, ``_Imaginary``, ``_Noreturn``, ``_Static_assert``, +``_Thread_local``, and ``_Float16`` in all language modes with the C semantics. + +__alignof, __alignof__ +---------------------- + +``__alignof`` and ``__alignof__`` return, in contrast to ``_Alignof`` and +``alignof``, the preferred alignment of a type. This may be larger than the +required alignment for improved performance. + +__extension__ +------------- + +``__extension__`` suppresses extension diagnostics in the statement it is +prepended to. + +__auto_type +----------- + +``__auto_type`` behaves the same as ``auto`` in C++11 but is available in all +language modes.
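Since ``__auto_type`` mirrors C++11 ``auto``, a minimal sketch of the documented behavior (the macro and variable names are illustrative only; the ``MAX`` macro additionally relies on the GNU statement-expression extension, which Clang also accepts):

    // Each operand is evaluated exactly once, with no need to spell out
    // the operand type.
    #define MAX(a, b) ({ __auto_type _a = (a); __auto_type _b = (b); \
                         _a > _b ? _a : _b; })

    double f() {
      __auto_type i = 42;  // deduced as int, as C++11 auto would deduce
      __auto_type d = 3.5; // deduced as double
      return MAX(d, (double)i);
    }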
+ +__imag, __imag__ +---------------- + +``__imag`` and ``__imag__`` can be used to get the imaginary part of a complex +value. + +__real, __real__ +---------------- + +``__real`` and ``__real__`` can be used to get the real part of a complex value. + +__asm, __asm__ +-------------- + +``__asm`` and ``__asm__`` are alternate spellings for ``asm``, but available in +all language modes. + +__complex, __complex__ +---------------------- + +``__complex`` and ``__complex__`` are alternate spellings for ``_Complex``. + +__const, __const__, __volatile, __volatile__, __restrict, __restrict__ +---------------------------------------------------------------------- + +These are alternate spellings for their non-underscore counterparts, but are +available in all language modes. + +__decltype +---------- + +``__decltype`` is an alternate spelling for ``decltype``, but is also available +in C++ modes before C++11. + +__inline, __inline__ +-------------------- + +``__inline`` and ``__inline__`` are alternate spellings for ``inline``, but are +available in all language modes. + +__nullptr +--------- + +``__nullptr`` is an alternate spelling for ``nullptr``, but is also available in +C++ modes prior to C++11. Note that it's currently not available in C despite +C23 having support for ``nullptr``. + +__signed, __signed__ +-------------------- + +``__signed`` and ``__signed__`` are alternate spellings for ``signed``. +``__unsigned`` and ``__unsigned__`` are **not** supported. + +__typeof, __typeof__, __typeof_unqual, __typeof_unqual__ +-------------------------------------------------------- + +``__typeof`` and ``__typeof__`` are alternate spellings for ``typeof``, but are +available in all language modes. These spellings result in the type of the operand, +retaining all qualifiers. + +``__typeof_unqual`` and ``__typeof_unqual__`` are alternate spellings for the +C23 ``typeof_unqual`` type specifier, but are available in all language modes. +These spellings result in the type of the operand, stripping all qualifiers. + +__char16_t, __char32_t +---------------------- + +``__char16_t`` and ``__char32_t`` are alternate spellings for ``char16_t`` and +``char32_t`` respectively, but are also available in C++ modes before C++11. +They are only supported in C++. ``__char8_t`` is not available. + .. FIXME: This should list all the keyword extensions @@ -1697,7 +1805,7 @@ The following type trait primitives are supported by Clang. Those traits marked * ``__is_referenceable`` (C++, GNU, Microsoft, Embarcadero): Returns true if a type is referenceable, and false otherwise. A referenceable type is a type that's either an object type, a reference type, or an unqualified - function type. + function type. This trait is deprecated and will be removed in Clang 21. * ``__is_rvalue_reference`` (C++, Embarcadero) * ``__is_same`` (C++, Embarcadero) * ``__is_same_as`` (GCC): Synonym for ``__is_same``. @@ -5785,17 +5893,6 @@ Examples are: # 60 "" 2 // return to "main.c" # 1 "/usr/ancient/header.h" 1 4 // Enter an implicit extern "C" header -Extended Integer Types -====================== - -Clang supports the C23 ``_BitInt(N)`` feature as an extension in older C modes -and in C++. This type was previously implemented in Clang with the same -semantics, but spelled ``_ExtInt(N)``. This spelling has been deprecated in -favor of the standard type. - -Note: the ABI for ``_BitInt(N)`` is still in the process of being stabilized, -so this type should not yet be used in interfaces that require ABI stability.
-
 Intrinsics Support within Constant Expressions
 ==============================================

diff --git clang/docs/ReleaseNotes.rst clang/docs/ReleaseNotes.rst
index aa1c02d04f7c..41d217b500a4 100644
--- clang/docs/ReleaseNotes.rst
+++ clang/docs/ReleaseNotes.rst
@@ -93,6 +93,11 @@ C++ Specific Potentially Breaking Changes
   few users and can be written as ``__is_same(__remove_cv(T), decltype(nullptr))``,
   which GCC supports as well.

+- The type trait builtin ``__is_referenceable`` has been deprecated, since it has
+  very few users and all the type traits that could benefit from it in the
+  standard library already have their own bespoke builtins. It will be removed in
+  Clang 21.
+
 - Clang will now correctly diagnose as ill-formed a constant expression where an
   enum without a fixed underlying type is set to a value outside the range of the
   enumeration's values.
@@ -310,7 +315,7 @@ C++23 Feature Support
 - Extend lifetime of temporaries in mem-default-init for P2718R0. Clang now fully
   supports `P2718R0 Lifetime extension in range-based for loops <https://wg21.link/P2718R0>`_.
- 
+
 - ``__cpp_explicit_this_parameter`` is now defined. (#GH82780)

 C++20 Feature Support
@@ -655,6 +660,8 @@ Improvements to Clang's diagnostics
 - Don't emit bogus dangling diagnostics when ``[[gsl::Owner]]`` and
   `[[clang::lifetimebound]]` are used together (#GH108272).

+- Don't emit a bogus diagnostic about undefined behavior on ``reinterpret_cast<T>`` in non-instantiated template functions when it is not known whether the cast can actually lead to undefined behavior for ``T`` (#GH109430).
+
 - The ``-Wreturn-stack-address`` warning now also warns about addresses of
   local variables passed to function calls using the ``[[clang::musttail]]``
   attribute.
@@ -717,7 +724,7 @@ Improvements to Clang's diagnostics
 - Clang now diagnoses dangling references for C++20's parenthesized aggregate initialization (#101957).

-- Fixed a bug where Clang would not emit ``-Wunused-private-field`` warnings when an unrelated class 
+- Fixed a bug where Clang would not emit ``-Wunused-private-field`` warnings when an unrelated class
   defined a defaulted comparison operator (#GH116270).

   .. code-block:: c++
@@ -791,6 +798,8 @@ Improvements to Clang's diagnostics
   }

 - Diagnose invalid declarators in the declaration of constructors and destructors (#GH121706).
+- Fixed false positive warnings for non-std functions named ``infinity`` (#123231).
+

 Improvements to Clang's time-trace
 ----------------------------------
@@ -855,7 +864,7 @@ Bug Fixes to C++ Support
   module imports in those situations. (#GH60336)
 - Fix init-capture packs having a size of one before being instantiated. (#GH63677)
 - Clang now preserves the unexpanded flag in a lambda transform used for pack expansion. (#GH56852), (#GH85667),
-  (#GH99877).
+  (#GH99877), (#GH122417).
 - Fixed a bug when diagnosing ambiguous explicit specializations of constrained member functions.
 - Fixed an assertion failure when selecting a function from an overload set that includes a
   specialization of a conversion function template.
@@ -937,7 +946,7 @@ Bug Fixes to C++ Support
 - Fixed an assertion failure caused by invalid default argument substitutions in non-defining
   friend declarations. (#GH113324)
 - Fix a crash caused by incorrect argument position in merging deduced template arguments. (#GH113659)
-- Fixed a parser crash when using pack indexing as a nested name specifier.
+- Fixed a parser crash when using pack indexing as a nested name specifier. (#GH119072)
 - Fixed a null pointer dereference issue when heuristically computing ``sizeof...(pack)`` expressions. (#GH81436)
 - Fixed an assertion failure caused by mangled names with invalid identifiers. (#GH112205)
 - Fixed an incorrect lambda scope of generic lambdas that caused Clang to crash when computing potential lambda
@@ -956,6 +965,7 @@ Bug Fixes to C++ Support
 - Fixed a crash caused by the incorrect construction of template arguments for CTAD alias guides when type
   constraints are applied. (#GH122134)
 - Fixed canonicalization of pack indexing types - Clang did not always recognize identical pack indexing. (#GH123033)
+- Fixed a nested lambda substitution issue for constraint evaluation. (#GH123441)

 Bug Fixes to AST Handling

@@ -1124,6 +1134,7 @@ RISC-V Support

 CUDA/HIP Language Changes
 ^^^^^^^^^^^^^^^^^^^^^^^^^
 - Fixed a bug about overriding a constexpr pure-virtual member function with a non-constexpr virtual member function which causes compilation failure when including standard C++ header `format`.
+- Added initial support for version 3 of the compressed offload bundle format, which uses 64-bit fields for Total File Size and Uncompressed Binary Size. This enables support for files larger than 4GB. The support is currently experimental and can be enabled by setting the environment variable ``COMPRESSED_BUNDLE_FORMAT_VERSION=3``.

 CUDA Support
 ^^^^^^^^^^^^
@@ -1224,6 +1235,7 @@ clang-format
 - Adds ``VariableTemplates`` option.
 - Adds support for bash globstar in ``.clang-format-ignore``.
 - Adds ``WrapNamespaceBodyWithEmptyLines`` option.
+- Adds the ``IndentExportBlock`` option.

 libclang
 --------

diff --git clang/include/clang/AST/DeclTemplate.h clang/include/clang/AST/DeclTemplate.h
index d3a466a8617b..8c2da97c07a3 100644
--- clang/include/clang/AST/DeclTemplate.h
+++ clang/include/clang/AST/DeclTemplate.h
@@ -367,12 +367,11 @@ public:
     if (!isSet())
       ValueOrInherited = InheritedFrom;
     else if ([[maybe_unused]] auto *D =
-                 ValueOrInherited.template dyn_cast<ParmDecl *>()) {
+                 dyn_cast<ParmDecl *>(ValueOrInherited)) {
       assert(C.isSameDefaultTemplateArgument(D, InheritedFrom));
       ValueOrInherited =
           new (allocateDefaultArgStorageChain(C)) Chain{InheritedFrom, get()};
-    } else if (auto *Inherited =
-                   ValueOrInherited.template dyn_cast<Chain *>()) {
+    } else if (auto *Inherited = dyn_cast<Chain *>(ValueOrInherited)) {
       assert(C.isSameDefaultTemplateArgument(Inherited->PrevDeclWithDefaultArg,
                                              InheritedFrom));
       Inherited->PrevDeclWithDefaultArg = InheritedFrom;
diff --git clang/include/clang/Basic/AttrDocs.td clang/include/clang/Basic/AttrDocs.td
index 5e66e752512d..56a817892bbb 100644
--- clang/include/clang/Basic/AttrDocs.td
+++ clang/include/clang/Basic/AttrDocs.td
@@ -121,11 +121,12 @@ def InitPriorityDocs : Documentation {
 In C++, the order in which global variables are initialized across translation
 units is unspecified, unlike the ordering within a single translation unit. The
 ``init_priority`` attribute allows you to specify a relative ordering for the
-initialization of objects declared at namespace scope in C++. The priority is
-given as an integer constant expression between 101 and 65535 (inclusive).
-Priorities outside of that range are reserved for use by the implementation. A
-lower value indicates a higher priority of initialization. Note that only the
-relative ordering of values is important. For example:
+initialization of objects declared at namespace scope in C++ within a single
+linked image on supported platforms.
+The priority is given as an integer constant
+expression between 101 and 65535 (inclusive). Priorities outside of that range are
+reserved for use by the implementation. A lower value indicates a higher priority
+of initialization. Note that only the relative ordering of values is important.
+For example:

 .. code-block:: c++

@@ -136,10 +137,16 @@ relative ordering of values is important. For example:

 ``Obj2`` will be initialized *before* ``Obj1`` despite the usual order of
 initialization being the opposite.

+Note that this attribute does not control the initialization order of objects
+across final linked image boundaries like shared objects and executables.
+
 On Windows, ``init_seg(compiler)`` is represented with a priority of 200 and
 ``init_seg(library)`` is represented with a priority of 400. ``init_seg(user)``
 uses the default 65535 priority.

+On MachO platforms, this attribute also does not control the order of
+initialization across translation units; it only affects the order within a
+single TU.
+
 This attribute is only supported for C++ and Objective-C++ and is ignored in
 other language modes. Currently, this attribute is not implemented on z/OS.
 }];
diff --git clang/include/clang/Basic/BuiltinsNVPTX.def clang/include/clang/Basic/BuiltinsNVPTX.def
index 969dd9e41ebf..37b4e6ff77fd 100644
--- clang/include/clang/Basic/BuiltinsNVPTX.def
+++ clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -28,7 +28,9 @@
 #pragma push_macro("SM_90")
 #pragma push_macro("SM_90a")
 #pragma push_macro("SM_100")
-#define SM_100 "sm_100"
+#pragma push_macro("SM_100a")
+#define SM_100a "sm_100a"
+#define SM_100 "sm_100|" SM_100a
 #define SM_90a "sm_90a"
 #define SM_90 "sm_90|" SM_90a "|" SM_100
 #define SM_89 "sm_89|" SM_90
@@ -1091,6 +1093,7 @@ TARGET_BUILTIN(__nvvm_getctarank_shared_cluster, "iv*3", "", AND(SM_90,PTX78))
 #pragma pop_macro("SM_90")
 #pragma pop_macro("SM_90a")
 #pragma pop_macro("SM_100")
+#pragma pop_macro("SM_100a")
 #pragma pop_macro("PTX42")
 #pragma pop_macro("PTX60")
 #pragma pop_macro("PTX61")
diff --git clang/include/clang/Basic/BuiltinsSystemZ.def clang/include/clang/Basic/BuiltinsSystemZ.def
index c564dd9e486b..ba94c1a130f9 100644
--- clang/include/clang/Basic/BuiltinsSystemZ.def
+++ clang/include/clang/Basic/BuiltinsSystemZ.def
@@ -286,6 +286,7 @@ TARGET_BUILTIN(__builtin_s390_vstrszf, "V16UcV4UiV4UiV16Uci*", "nc", "vector-enh
 TARGET_BUILTIN(__builtin_s390_vlbrh, "V8UsV8Us", "nc", "vector")
 TARGET_BUILTIN(__builtin_s390_vlbrf, "V4UiV4Ui", "nc", "vector")
 TARGET_BUILTIN(__builtin_s390_vlbrg, "V2ULLiV2ULLi", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vlbrq, "ULLLiULLLi", "nc", "vector")

 // NNP-assist facility intrinsics.
 TARGET_BUILTIN(__builtin_s390_vclfnhs, "V4fV8UsIi", "nc", "nnp-assist")
@@ -294,5 +295,44 @@ TARGET_BUILTIN(__builtin_s390_vcrnfs, "V8UsV4fV4fIi", "nc", "nnp-assist")
 TARGET_BUILTIN(__builtin_s390_vcfn, "V8UsV8UsIi", "nc", "nnp-assist")
 TARGET_BUILTIN(__builtin_s390_vcnf, "V8UsV8UsIi", "nc", "nnp-assist")

+// Miscellaneous instruction extensions facility 4 intrinsics.
+TARGET_BUILTIN(__builtin_s390_bdepg, "ULiULiULi", "nc", "miscellaneous-extensions-4")
+TARGET_BUILTIN(__builtin_s390_bextg, "ULiULiULi", "nc", "miscellaneous-extensions-4")
+
+// Vector-enhancements facility 3 intrinsics.
+TARGET_BUILTIN(__builtin_s390_vgemb, "V16UcV8Us", "nc", "vector-enhancements-3") +TARGET_BUILTIN(__builtin_s390_vgemh, "V8UsV16Uc", "nc", "vector-enhancements-3") +TARGET_BUILTIN(__builtin_s390_vgemf, "V4UiV16Uc", "nc", "vector-enhancements-3") +TARGET_BUILTIN(__builtin_s390_vgemg, "V2ULLiV16Uc", "nc", "vector-enhancements-3") +TARGET_BUILTIN(__builtin_s390_vgemq, "ULLLiV16Uc", "nc", "vector-enhancements-3") +TARGET_BUILTIN(__builtin_s390_vuplg, "SLLLiV2SLLi", "nc", "vector-enhancements-3") +TARGET_BUILTIN(__builtin_s390_vupllg, "ULLLiV2ULLi", "nc", "vector-enhancements-3") +TARGET_BUILTIN(__builtin_s390_vuphg, "SLLLiV2SLLi", "nc", "vector-enhancements-3") +TARGET_BUILTIN(__builtin_s390_vuplhg, "ULLLiV2ULLi", "nc", "vector-enhancements-3") +TARGET_BUILTIN(__builtin_s390_vavgq, "SLLLiSLLLiSLLLi", "nc", "vector-enhancements-3") +TARGET_BUILTIN(__builtin_s390_vavglq, "ULLLiULLLiULLLi", "nc", "vector-enhancements-3") +TARGET_BUILTIN(__builtin_s390_veval, "V16UcV16UcV16UcV16UcIi", "nc", "vector-enhancements-3") +TARGET_BUILTIN(__builtin_s390_vmahg, "V2SLLiV2SLLiV2SLLiV2SLLi", "nc", "vector-enhancements-3") +TARGET_BUILTIN(__builtin_s390_vmahq, "SLLLiSLLLiSLLLiSLLLi", "nc", "vector-enhancements-3") +TARGET_BUILTIN(__builtin_s390_vmalhg, "V2ULLiV2ULLiV2ULLiV2ULLi", "nc", "vector-enhancements-3") +TARGET_BUILTIN(__builtin_s390_vmalhq, "ULLLiULLLiULLLiULLLi", "nc", "vector-enhancements-3") +TARGET_BUILTIN(__builtin_s390_vmaeg, "SLLLiV2SLLiV2SLLiSLLLi", "nc", "vector-enhancements-3") +TARGET_BUILTIN(__builtin_s390_vmaleg, "ULLLiV2ULLiV2ULLiULLLi", "nc", "vector-enhancements-3") +TARGET_BUILTIN(__builtin_s390_vmaog, "SLLLiV2SLLiV2SLLiSLLLi", "nc", "vector-enhancements-3") +TARGET_BUILTIN(__builtin_s390_vmalog, "ULLLiV2ULLiV2ULLiULLLi", "nc", "vector-enhancements-3") +TARGET_BUILTIN(__builtin_s390_vmhg, "V2SLLiV2SLLiV2SLLi", "nc", "vector-enhancements-3") +TARGET_BUILTIN(__builtin_s390_vmhq, "SLLLiSLLLiSLLLi", "nc", "vector-enhancements-3") +TARGET_BUILTIN(__builtin_s390_vmlhg, "V2ULLiV2ULLiV2ULLi", "nc", "vector-enhancements-3") +TARGET_BUILTIN(__builtin_s390_vmlhq, "ULLLiULLLiULLLi", "nc", "vector-enhancements-3") +TARGET_BUILTIN(__builtin_s390_vmeg, "SLLLiV2SLLiV2SLLi", "nc", "vector-enhancements-3") +TARGET_BUILTIN(__builtin_s390_vmleg, "ULLLiV2ULLiV2ULLi", "nc", "vector-enhancements-3") +TARGET_BUILTIN(__builtin_s390_vmog, "SLLLiV2SLLiV2SLLi", "nc", "vector-enhancements-3") +TARGET_BUILTIN(__builtin_s390_vmlog, "ULLLiV2ULLiV2ULLi", "nc", "vector-enhancements-3") +TARGET_BUILTIN(__builtin_s390_vceqqs, "SLLLiULLLiULLLii*", "nc", "vector-enhancements-3") +TARGET_BUILTIN(__builtin_s390_vchqs, "SLLLiSLLLiSLLLii*", "nc", "vector-enhancements-3") +TARGET_BUILTIN(__builtin_s390_vchlqs, "SLLLiULLLiULLLii*", "nc", "vector-enhancements-3") +TARGET_BUILTIN(__builtin_s390_vclzq, "ULLLiULLLi", "nc", "vector-enhancements-3") +TARGET_BUILTIN(__builtin_s390_vctzq, "ULLLiULLLi", "nc", "vector-enhancements-3") + #undef BUILTIN #undef TARGET_BUILTIN diff --git clang/include/clang/Basic/BuiltinsX86.td clang/include/clang/Basic/BuiltinsX86.td index 18fc10eb85c0..a6c932967f52 100644 --- clang/include/clang/Basic/BuiltinsX86.td +++ clang/include/clang/Basic/BuiltinsX86.td @@ -4936,15 +4936,15 @@ let Features = "avx10.2-512,sm4", Attributes = [NoThrow, RequiredVectorWidth<512 } let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in { - def vminmaxnepbf16128 : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>, _Vector<8, __bf16>, _Constant int)">; + def vminmaxbf16128 : X86Builtin<"_Vector<8, 
__bf16>(_Vector<8, __bf16>, _Vector<8, __bf16>, _Constant int)">; } let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in { - def vminmaxnepbf16256 : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Vector<16, __bf16>, _Constant int)">; + def vminmaxbf16256 : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Vector<16, __bf16>, _Constant int)">; } let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in { - def vminmaxnepbf16512 : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Vector<32, __bf16>, _Constant int)">; + def vminmaxbf16512 : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Vector<32, __bf16>, _Constant int)">; } let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in { diff --git clang/include/clang/Basic/CodeGenOptions.def clang/include/clang/Basic/CodeGenOptions.def index 0f4ed13d5f3d..1ab8c7fb4d3c 100644 --- clang/include/clang/Basic/CodeGenOptions.def +++ clang/include/clang/Basic/CodeGenOptions.def @@ -413,9 +413,6 @@ CODEGENOPT(StrictReturn, 1, 1) /// Whether emit pseudo probes for sample pgo profile collection. CODEGENOPT(PseudoProbeForProfiling, 1, 0) -/// Whether 3-component vector type is preserved. -CODEGENOPT(PreserveVec3Type, 1, 0) - CODEGENOPT(NoPLT, 1, 0) /// Whether to emit all vtables diff --git clang/include/clang/Basic/Cuda.h clang/include/clang/Basic/Cuda.h index c2a4addf488d..1cdfc8178db8 100644 --- clang/include/clang/Basic/Cuda.h +++ clang/include/clang/Basic/Cuda.h @@ -44,9 +44,12 @@ enum class CudaVersion { CUDA_124, CUDA_125, CUDA_126, + CUDA_127, + CUDA_128, + CUDA_129, FULLY_SUPPORTED = CUDA_123, PARTIALLY_SUPPORTED = - CUDA_126, // Partially supported. Proceed with a warning. + CUDA_129, // Partially supported. Proceed with a warning. NEW = 10000, // Too new. Issue a warning, but allow using it. 
}; const char *CudaVersionToString(CudaVersion V); @@ -80,6 +83,7 @@ enum class OffloadArch { SM_90, SM_90a, SM_100, + SM_100a, GFX600, GFX601, GFX602, diff --git clang/include/clang/Basic/LangOptions.def clang/include/clang/Basic/LangOptions.def index 3b833240e5b6..a980be853d53 100644 --- clang/include/clang/Basic/LangOptions.def +++ clang/include/clang/Basic/LangOptions.def @@ -532,6 +532,8 @@ BENIGN_LANGOPT(CheckConstexprFunctionBodies, 1, 1, LANGOPT(BoundsSafety, 1, 0, "Bounds safety extension for C") +LANGOPT(PreserveVec3Type, 1, 0, "Preserve 3-component vector type") + #undef LANGOPT #undef COMPATIBLE_LANGOPT #undef BENIGN_LANGOPT diff --git clang/include/clang/Basic/arm_neon.td clang/include/clang/Basic/arm_neon.td index ef89fa4358df..ddc5391eb3fa 100644 --- clang/include/clang/Basic/arm_neon.td +++ clang/include/clang/Basic/arm_neon.td @@ -259,11 +259,6 @@ def OP_VCVT_F32_BF16_LO def OP_VCVT_F32_BF16_HI : Op<(call "vcvt_f32_bf16", (call "vget_high", $p0))>; -def OP_VCVT_BF16_F32_LO_A64 - : Op<(call "__a64_vcvtq_low_bf16", $p0)>; -def OP_VCVT_BF16_F32_A64 - : Op<(call "vget_low", (call "__a64_vcvtq_low_bf16", $p0))>; - def OP_VCVT_BF16_F32_A32 : Op<(call "__a32_vcvt_bf16", $p0)>; @@ -2061,10 +2056,9 @@ let ArchGuard = "!defined(__aarch64__) && !defined(__arm64ec__)", TargetGuard = } let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "bf16,neon" in { - def VCVT_LOW_BF16_F32_A64_INTERNAL : WInst<"__a64_vcvtq_low_bf16", "BQ", "Hf">; - def VCVT_LOW_BF16_F32_A64 : SOpInst<"vcvt_low_bf16", "BQ", "Qf", OP_VCVT_BF16_F32_LO_A64>; + def VCVT_LOW_BF16_F32_A64 : SInst<"vcvt_low_bf16", "BQ", "Qf">; def VCVT_HIGH_BF16_F32_A64 : SInst<"vcvt_high_bf16", "BBQ", "Qf">; - def VCVT_BF16_F32 : SOpInst<"vcvt_bf16", "BQ", "f", OP_VCVT_BF16_F32_A64>; + def VCVT_BF16_F32 : SInst<"vcvt_bf16", "BQ", "f">; def COPY_LANE_BF16 : IOpInst<"vcopy_lane", "..I.I", "b", OP_COPY_LN>; def COPYQ_LANE_BF16 : IOpInst<"vcopy_lane", "..IqI", "Qb", OP_COPY_LN>; diff --git clang/include/clang/Basic/arm_sve.td clang/include/clang/Basic/arm_sve.td index ac1c139b2094..e7001bac450e 100644 --- clang/include/clang/Basic/arm_sve.td +++ clang/include/clang/Basic/arm_sve.td @@ -2280,15 +2280,15 @@ let SVETargetGuard = "sve2p1", SMETargetGuard = "sme2" in { let SVETargetGuard = "sve2p1", SMETargetGuard = InvalidMode in { // ZIPQ1, ZIPQ2, UZPQ1, UZPQ2 - def SVZIPQ1 : SInst<"svzipq1[_{d}]", "ddd", "cUcsUsiUilUlbhfd", MergeNone, "aarch64_sve_zipq1", [], []>; - def SVZIPQ2 : SInst<"svzipq2[_{d}]", "ddd", "cUcsUsiUilUlbhfd", MergeNone, "aarch64_sve_zipq2", [], []>; - def SVUZPQ1 : SInst<"svuzpq1[_{d}]", "ddd", "cUcsUsiUilUlbhfd", MergeNone, "aarch64_sve_uzpq1", [], []>; - def SVUZPQ2 : SInst<"svuzpq2[_{d}]", "ddd", "cUcsUsiUilUlbhfd", MergeNone, "aarch64_sve_uzpq2", [], []>; + def SVZIPQ1 : SInst<"svzipq1[_{d}]", "ddd", "cUcsUsiUilUlbhfdm", MergeNone, "aarch64_sve_zipq1", [], []>; + def SVZIPQ2 : SInst<"svzipq2[_{d}]", "ddd", "cUcsUsiUilUlbhfdm", MergeNone, "aarch64_sve_zipq2", [], []>; + def SVUZPQ1 : SInst<"svuzpq1[_{d}]", "ddd", "cUcsUsiUilUlbhfdm", MergeNone, "aarch64_sve_uzpq1", [], []>; + def SVUZPQ2 : SInst<"svuzpq2[_{d}]", "ddd", "cUcsUsiUilUlbhfdm", MergeNone, "aarch64_sve_uzpq2", [], []>; // TBLQ, TBXQ - def SVTBLQ : SInst<"svtblq[_{d}]", "ddu", "cUcsUsiUilUlbhfd", MergeNone, "aarch64_sve_tblq">; - def SVTBXQ : SInst<"svtbxq[_{d}]", "dddu", "cUcsUsiUilUlbhfd", MergeNone, "aarch64_sve_tbxq">; + def SVTBLQ : SInst<"svtblq[_{d}]", "ddu", "cUcsUsiUilUlbhfdm", MergeNone, "aarch64_sve_tblq">; + def SVTBXQ : 
SInst<"svtbxq[_{d}]", "dddu", "cUcsUsiUilUlbhfdm", MergeNone, "aarch64_sve_tbxq">; // EXTQ - def EXTQ : SInst<"svextq[_{d}]", "dddk", "cUcsUsiUilUlbhfd", MergeNone, "aarch64_sve_extq", [], [ImmCheck<2, ImmCheckLaneIndex, 0>]>; + def EXTQ : SInst<"svextq[_{d}]", "dddk", "cUcsUsiUilUlbhfdm", MergeNone, "aarch64_sve_extq", [], [ImmCheck<2, ImmCheckLaneIndex, 0>]>; // PMOV // Move to Pred @@ -2314,7 +2314,7 @@ let SVETargetGuard = "sve2p1", SMETargetGuard = InvalidMode in { let SVETargetGuard = "sve2p1", SMETargetGuard = "sme2p1" in { // DUPQ - def SVDUP_LANEQ_B : SInst<"svdup_laneq[_{d}]", "ddi", "cUc", MergeNone, "aarch64_sve_dup_laneq", [VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_15>]>; + def SVDUP_LANEQ_B : SInst<"svdup_laneq[_{d}]", "ddi", "cUcm", MergeNone, "aarch64_sve_dup_laneq", [VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_15>]>; def SVDUP_LANEQ_H : SInst<"svdup_laneq[_{d}]", "ddi", "sUsh", MergeNone, "aarch64_sve_dup_laneq", [VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_7>]>; def SVDUP_LANEQ_S : SInst<"svdup_laneq[_{d}]", "ddi", "iUif", MergeNone, "aarch64_sve_dup_laneq", [VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_3>]>; def SVDUP_LANEQ_D : SInst<"svdup_laneq[_{d}]", "ddi", "lUld", MergeNone, "aarch64_sve_dup_laneq", [VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_1>]>; diff --git clang/include/clang/Driver/OffloadBundler.h clang/include/clang/Driver/OffloadBundler.h index 57ecbdcb7d04..31c11e25ecd9 100644 --- clang/include/clang/Driver/OffloadBundler.h +++ clang/include/clang/Driver/OffloadBundler.h @@ -39,6 +39,7 @@ public: bool Verbose = false; llvm::compression::Format CompressionFormat; int CompressionLevel; + uint16_t CompressedBundleVersion; unsigned BundleAlignment = 1; unsigned HostInputIndex = ~0u; @@ -100,36 +101,63 @@ struct OffloadTargetInfo { // - Version (2 bytes) // - Compression Method (2 bytes) - Uses the values from // llvm::compression::Format. -// - Total file size (4 bytes). Available in version 2 and above. -// - Uncompressed Size (4 bytes). +// - Total file size (4 bytes in V2, 8 bytes in V3). +// - Uncompressed Size (4 bytes in V1/V2, 8 bytes in V3). // - Truncated MD5 Hash (8 bytes). // - Compressed Data (variable length). 
- class CompressedOffloadBundle { private: static inline const size_t MagicSize = 4; static inline const size_t VersionFieldSize = sizeof(uint16_t); static inline const size_t MethodFieldSize = sizeof(uint16_t); - static inline const size_t FileSizeFieldSize = sizeof(uint32_t); - static inline const size_t UncompressedSizeFieldSize = sizeof(uint32_t); + // Legacy size fields for V1/V2 + static inline const size_t FileSizeFieldSizeV2 = sizeof(uint32_t); + static inline const size_t UncompressedSizeFieldSizeV2 = sizeof(uint32_t); + // New size fields for V3 + static inline const size_t FileSizeFieldSizeV3 = sizeof(uint64_t); + static inline const size_t UncompressedSizeFieldSizeV3 = sizeof(uint64_t); static inline const size_t HashFieldSize = sizeof(uint64_t); + + // Keep V1 header size for backward compatibility static inline const size_t V1HeaderSize = MagicSize + VersionFieldSize + MethodFieldSize + - UncompressedSizeFieldSize + HashFieldSize; + UncompressedSizeFieldSizeV2 + HashFieldSize; + + // Keep V2 header size for backward compatibility static inline const size_t V2HeaderSize = - MagicSize + VersionFieldSize + FileSizeFieldSize + MethodFieldSize + - UncompressedSizeFieldSize + HashFieldSize; + MagicSize + VersionFieldSize + FileSizeFieldSizeV2 + MethodFieldSize + + UncompressedSizeFieldSizeV2 + HashFieldSize; + + // Add V3 header size with 64-bit fields + static inline const size_t V3HeaderSize = + MagicSize + VersionFieldSize + FileSizeFieldSizeV3 + MethodFieldSize + + UncompressedSizeFieldSizeV3 + HashFieldSize; + static inline const llvm::StringRef MagicNumber = "CCOB"; - static inline const uint16_t Version = 2; public: + static inline const uint16_t DefaultVersion = 2; + + // Helper method to get header size based on version + static size_t getHeaderSize(uint16_t Version) { + switch (Version) { + case 1: + return V1HeaderSize; + case 2: + return V2HeaderSize; + case 3: + return V3HeaderSize; + default: + llvm_unreachable("Unsupported version"); + } + } + static llvm::Expected<std::unique_ptr<llvm::MemoryBuffer>> compress(llvm::compression::Params P, const llvm::MemoryBuffer &Input, - bool Verbose = false); + uint16_t Version, bool Verbose = false); static llvm::Expected<std::unique_ptr<llvm::MemoryBuffer>> decompress(const llvm::MemoryBuffer &Input, bool Verbose = false); }; - } // namespace clang #endif // LLVM_CLANG_DRIVER_OFFLOADBUNDLER_H diff --git clang/include/clang/Driver/Options.td clang/include/clang/Driver/Options.td index d38dd2b4e3cf..852051e772fc 100644 --- clang/include/clang/Driver/Options.td +++ clang/include/clang/Driver/Options.td @@ -3505,6 +3505,11 @@ def fno_struct_path_tbaa : Flag<["-"], "fno-struct-path-tbaa">, Group<f_Group>; def fno_strict_enums : Flag<["-"], "fno-strict-enums">, Group<f_Group>; def fno_strict_overflow : Flag<["-"], "fno-strict-overflow">, Group<f_Group>, Visibility<[ClangOption, FlangOption]>; +defm init_global_zero : BoolOptionWithoutMarshalling<"f", "init-global-zero", + PosFlag<SetTrue, [], [FlangOption, FC1Option], + "Zero initialize globals without default initialization (default)">, + NegFlag<SetFalse, [], [FlangOption, FC1Option], + "Do not zero initialize globals without default initialization">>; def fno_pointer_tbaa : Flag<["-"], "fno-pointer-tbaa">, Group<f_Group>; def fno_temp_file : Flag<["-"], "fno-temp-file">, Group<f_Group>, Visibility<[ClangOption, CC1Option, CLOption, DXCOption]>, HelpText< @@ -8240,10 +8245,6 @@ def fhlsl_strict_availability : Flag<["-"], "fhlsl-strict-availability">, Group<hlsl_Group>, 
MarshallingInfoFlag<LangOpts<"HLSLStrictAvailability">>; -def fpreserve_vec3_type : Flag<["-"], "fpreserve-vec3-type">, - HelpText<"Preserve 3-component vector type">, - MarshallingInfoFlag<CodeGenOpts<"PreserveVec3Type">>, - ImpliedByAnyOf<[hlsl.KeyPath]>; def fwchar_type_EQ : Joined<["-"], "fwchar-type=">, HelpText<"Select underlying type for wchar_t">, Values<"char,short,int">, diff --git clang/include/clang/Format/Format.h clang/include/clang/Format/Format.h index 7c2afd4d94ab..874376cb2310 100644 --- clang/include/clang/Format/Format.h +++ clang/include/clang/Format/Format.h @@ -2819,6 +2819,19 @@ struct FormatStyle { /// \version 10 bool IndentGotoLabels; + /// If ``true``, clang-format will indent the body of an ``export { ... }`` + /// block. This doesn't affect the formatting of anything else related to + /// exported declarations. + /// \code + /// true: false: + /// export { vs. export { + /// void foo(); void foo(); + /// void bar(); void bar(); + /// } } + /// \endcode + /// \version 20 + bool IndentExportBlock; + /// Indents extern blocks enum IndentExternBlockStyle : int8_t { /// Backwards compatible with AfterExternBlock's indenting. @@ -5266,6 +5279,7 @@ struct FormatStyle { IndentAccessModifiers == R.IndentAccessModifiers && IndentCaseBlocks == R.IndentCaseBlocks && IndentCaseLabels == R.IndentCaseLabels && + IndentExportBlock == R.IndentExportBlock && IndentExternBlock == R.IndentExternBlock && IndentGotoLabels == R.IndentGotoLabels && IndentPPDirectives == R.IndentPPDirectives && diff --git clang-tools-extra/clangd/HeuristicResolver.h clang/include/clang/Sema/HeuristicResolver.h similarity index 93% rename from clang-tools-extra/clangd/HeuristicResolver.h rename to clang/include/clang/Sema/HeuristicResolver.h index c130e0677e86..3760003aab89 100644 --- clang-tools-extra/clangd/HeuristicResolver.h +++ clang/include/clang/Sema/HeuristicResolver.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_HEURISTICRESOLVER_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_HEURISTICRESOLVER_H +#ifndef LLVM_CLANG_SEMA_HEURISTICRESOLVER_H +#define LLVM_CLANG_SEMA_HEURISTICRESOLVER_H #include "clang/AST/Decl.h" #include <vector> @@ -24,8 +24,6 @@ class NamedDecl; class Type; class UnresolvedUsingValueDecl; -namespace clangd { - // This class handles heuristic resolution of declarations and types in template // code. // @@ -68,19 +66,18 @@ public: // Try to heuristically resolve a dependent nested name specifier // to the type it likely denotes. Note that *dependent* name specifiers always // denote types, not namespaces. - const Type * + QualType resolveNestedNameSpecifierToType(const NestedNameSpecifier *NNS) const; // Given the type T of a dependent expression that appears of the LHS of a // "->", heuristically find a corresponding pointee type in whose scope we // could look up the name appearing on the RHS. - const Type *getPointeeType(const Type *T) const; + const QualType getPointeeType(QualType T) const; private: ASTContext &Ctx; }; -} // namespace clangd } // namespace clang #endif diff --git clang/include/clang/Sema/Overload.h clang/include/clang/Sema/Overload.h index 58fa64c80a16..176a2a8d2a35 100644 --- clang/include/clang/Sema/Overload.h +++ clang/include/clang/Sema/Overload.h @@ -898,7 +898,8 @@ class Sema; ConversionFixItGenerator Fix; /// Viable - True to indicate that this overload candidate is viable. 
- bool Viable : 1; + LLVM_PREFERRED_TYPE(bool) + unsigned Viable : 1; /// Whether this candidate is the best viable function, or tied for being /// the best viable function. @@ -907,12 +908,14 @@ class Sema; /// was part of the ambiguity kernel: the minimal non-empty set of viable /// candidates such that all elements of the ambiguity kernel are better /// than all viable candidates not in the ambiguity kernel. - bool Best : 1; + LLVM_PREFERRED_TYPE(bool) + unsigned Best : 1; /// IsSurrogate - True to indicate that this candidate is a /// surrogate for a conversion to a function pointer or reference /// (C++ [over.call.object]). - bool IsSurrogate : 1; + LLVM_PREFERRED_TYPE(bool) + unsigned IsSurrogate : 1; /// IgnoreObjectArgument - True to indicate that the first /// argument's conversion, which for this function represents the @@ -921,12 +924,15 @@ class Sema; /// implicit object argument is just a placeholder) or a /// non-static member function when the call doesn't have an /// object argument. - bool IgnoreObjectArgument : 1; + LLVM_PREFERRED_TYPE(bool) + unsigned IgnoreObjectArgument : 1; - bool TookAddressOfOverload : 1; + LLVM_PREFERRED_TYPE(bool) + unsigned TookAddressOfOverload : 1; /// True if the candidate was found using ADL. - CallExpr::ADLCallKind IsADLCandidate : 1; + LLVM_PREFERRED_TYPE(CallExpr::ADLCallKind) + unsigned IsADLCandidate : 1; /// Whether this is a rewritten candidate, and if so, of what kind? LLVM_PREFERRED_TYPE(OverloadCandidateRewriteKind) @@ -999,7 +1005,8 @@ class Sema; friend class OverloadCandidateSet; OverloadCandidate() : IsSurrogate(false), IgnoreObjectArgument(false), - TookAddressOfOverload(false), IsADLCandidate(CallExpr::NotADL), + TookAddressOfOverload(false), + IsADLCandidate(llvm::to_underlying(CallExpr::NotADL)), RewriteKind(CRK_None) {} }; diff --git clang/include/clang/Sema/Sema.h clang/include/clang/Sema/Sema.h index a41f16f6dc8c..9fa33d6ca76b 100644 --- clang/include/clang/Sema/Sema.h +++ clang/include/clang/Sema/Sema.h @@ -13841,6 +13841,13 @@ private: LocalInstantiationScope &Scope, const MultiLevelTemplateArgumentList &TemplateArgs); + /// Introduce the instantiated captures of the lambda into the local + /// instantiation scope. + bool addInstantiatedCapturesToScope( + FunctionDecl *Function, const FunctionDecl *PatternDecl, + LocalInstantiationScope &Scope, + const MultiLevelTemplateArgumentList &TemplateArgs); + int ParsingClassDepth = 0; class SavePendingParsedClassStateRAII { @@ -14521,16 +14528,9 @@ private: // The current stack of constraint satisfactions, so we can exit-early. llvm::SmallVector<SatisfactionStackEntryTy, 10> SatisfactionStack; - /// Introduce the instantiated captures of the lambda into the local - /// instantiation scope. - bool addInstantiatedCapturesToScope( - FunctionDecl *Function, const FunctionDecl *PatternDecl, - LocalInstantiationScope &Scope, - const MultiLevelTemplateArgumentList &TemplateArgs); - - /// Used by SetupConstraintCheckingTemplateArgumentsAndScope to recursively(in - /// the case of lambdas) set up the LocalInstantiationScope of the current - /// function. + /// Used by SetupConstraintCheckingTemplateArgumentsAndScope to set up the + /// LocalInstantiationScope of the current non-lambda function. For lambdas, + /// use LambdaScopeForCallOperatorInstantiationRAII. 
bool SetupConstraintScope(FunctionDecl *FD, std::optional<ArrayRef<TemplateArgument>> TemplateArgs, diff --git clang/lib/AST/ByteCode/ByteCodeEmitter.cpp clang/lib/AST/ByteCode/ByteCodeEmitter.cpp index 3f2bc46664a4..19e2416c4c94 100644 --- clang/lib/AST/ByteCode/ByteCodeEmitter.cpp +++ clang/lib/AST/ByteCode/ByteCodeEmitter.cpp @@ -332,6 +332,12 @@ void emit(Program &P, std::vector<std::byte> &Code, const IntegralAP<true> &Val, emitSerialized(Code, Val, Success); } +template <> +void emit(Program &P, std::vector<std::byte> &Code, const FixedPoint &Val, + bool &Success) { + emitSerialized(Code, Val, Success); +} + template <typename... Tys> bool ByteCodeEmitter::emitOp(Opcode Op, const Tys &...Args, const SourceInfo &SI) { diff --git clang/lib/AST/ByteCode/Compiler.cpp clang/lib/AST/ByteCode/Compiler.cpp index 3ef2b0858e66..66ab27bdd13d 100644 --- clang/lib/AST/ByteCode/Compiler.cpp +++ clang/lib/AST/ByteCode/Compiler.cpp @@ -253,6 +253,9 @@ bool Compiler<Emitter>::VisitCastExpr(const CastExpr *CE) { case CK_UncheckedDerivedToBase: case CK_DerivedToBase: { + if (DiscardResult) + return this->discard(SubExpr); + if (!this->delegate(SubExpr)) return false; @@ -282,6 +285,9 @@ bool Compiler<Emitter>::VisitCastExpr(const CastExpr *CE) { } case CK_BaseToDerived: { + if (DiscardResult) + return this->discard(SubExpr); + if (!this->delegate(SubExpr)) return false; @@ -689,20 +695,18 @@ bool Compiler<Emitter>::VisitCastExpr(const CastExpr *CE) { if (!this->visit(SubExpr)) return false; - auto Sem = Ctx.getASTContext().getFixedPointSemantics(CE->getType()); - uint32_t I; - std::memcpy(&I, &Sem, sizeof(Sem)); - return this->emitCastIntegralFixedPoint(classifyPrim(SubExpr->getType()), I, - CE); + auto Sem = + Ctx.getASTContext().getFixedPointSemantics(CE->getType()).toOpaqueInt(); + return this->emitCastIntegralFixedPoint(classifyPrim(SubExpr->getType()), + Sem, CE); } case CK_FloatingToFixedPoint: { if (!this->visit(SubExpr)) return false; - auto Sem = Ctx.getASTContext().getFixedPointSemantics(CE->getType()); - uint32_t I; - std::memcpy(&I, &Sem, sizeof(Sem)); - return this->emitCastFloatingFixedPoint(I, CE); + auto Sem = + Ctx.getASTContext().getFixedPointSemantics(CE->getType()).toOpaqueInt(); + return this->emitCastFloatingFixedPoint(Sem, CE); } case CK_FixedPointToFloating: { if (!this->visit(SubExpr)) @@ -718,10 +722,9 @@ bool Compiler<Emitter>::VisitCastExpr(const CastExpr *CE) { case CK_FixedPointCast: { if (!this->visit(SubExpr)) return false; - auto Sem = Ctx.getASTContext().getFixedPointSemantics(CE->getType()); - uint32_t I; - std::memcpy(&I, &Sem, sizeof(Sem)); - return this->emitCastFixedPoint(I, CE); + auto Sem = + Ctx.getASTContext().getFixedPointSemantics(CE->getType()).toOpaqueInt(); + return this->emitCastFixedPoint(Sem, CE); } case CK_ToVoid: @@ -1522,28 +1525,29 @@ template <class Emitter> bool Compiler<Emitter>::VisitFixedPointBinOp(const BinaryOperator *E) { const Expr *LHS = E->getLHS(); const Expr *RHS = E->getRHS(); + const ASTContext &ASTCtx = Ctx.getASTContext(); assert(LHS->getType()->isFixedPointType() || RHS->getType()->isFixedPointType()); - auto LHSSema = Ctx.getASTContext().getFixedPointSemantics(LHS->getType()); - auto RHSSema = Ctx.getASTContext().getFixedPointSemantics(RHS->getType()); + auto LHSSema = ASTCtx.getFixedPointSemantics(LHS->getType()); + auto LHSSemaInt = LHSSema.toOpaqueInt(); + auto RHSSema = ASTCtx.getFixedPointSemantics(RHS->getType()); + auto RHSSemaInt = RHSSema.toOpaqueInt(); if (!this->visit(LHS)) return false; if 
(!LHS->getType()->isFixedPointType()) { - uint32_t I; - std::memcpy(&I, &LHSSema, sizeof(llvm::FixedPointSemantics)); - if (!this->emitCastIntegralFixedPoint(classifyPrim(LHS->getType()), I, E)) + if (!this->emitCastIntegralFixedPoint(classifyPrim(LHS->getType()), + LHSSemaInt, E)) return false; } if (!this->visit(RHS)) return false; if (!RHS->getType()->isFixedPointType()) { - uint32_t I; - std::memcpy(&I, &RHSSema, sizeof(llvm::FixedPointSemantics)); - if (!this->emitCastIntegralFixedPoint(classifyPrim(RHS->getType()), I, E)) + if (!this->emitCastIntegralFixedPoint(classifyPrim(RHS->getType()), + RHSSemaInt, E)) return false; } @@ -1551,13 +1555,10 @@ bool Compiler<Emitter>::VisitFixedPointBinOp(const BinaryOperator *E) { auto ConvertResult = [&](bool R) -> bool { if (!R) return false; - auto ResultSema = Ctx.getASTContext().getFixedPointSemantics(E->getType()); - auto CommonSema = LHSSema.getCommonSemantics(RHSSema); - if (ResultSema != CommonSema) { - uint32_t I; - std::memcpy(&I, &ResultSema, sizeof(ResultSema)); - return this->emitCastFixedPoint(I, E); - } + auto ResultSema = ASTCtx.getFixedPointSemantics(E->getType()).toOpaqueInt(); + auto CommonSema = LHSSema.getCommonSemantics(RHSSema).toOpaqueInt(); + if (ResultSema != CommonSema) + return this->emitCastFixedPoint(ResultSema, E); return true; }; @@ -4984,6 +4985,15 @@ bool Compiler<Emitter>::visitDeclStmt(const DeclStmt *DS) { return false; if (!this->visitVarDecl(VD)) return false; + + // Register decomposition decl holding vars. + if (const auto *DD = dyn_cast<DecompositionDecl>(VD)) { + for (auto *BD : DD->bindings()) + if (auto *KD = BD->getHoldingVar()) { + if (!this->visitVarDecl(KD)) + return false; + } + } } return true; @@ -6194,60 +6204,67 @@ bool Compiler<Emitter>::visitDeclRef(const ValueDecl *D, const Expr *E) { return revisit(VD); } - if (D != InitializingDecl) { - // Try to lazily visit (or emit dummy pointers for) declarations - // we haven't seen yet. - if (Ctx.getLangOpts().CPlusPlus) { - if (const auto *VD = dyn_cast<VarDecl>(D)) { - const auto typeShouldBeVisited = [&](QualType T) -> bool { - if (T.isConstant(Ctx.getASTContext())) - return true; - return T->isReferenceType(); - }; + // Avoid infinite recursion. + if (D == InitializingDecl) + return this->emitDummyPtr(D, E); - // DecompositionDecls are just proxies for us. - if (isa<DecompositionDecl>(VD)) - return revisit(VD); - - if ((VD->hasGlobalStorage() || VD->isStaticDataMember()) && - typeShouldBeVisited(VD->getType())) { - if (const Expr *Init = VD->getAnyInitializer(); - Init && !Init->isValueDependent()) { - // Whether or not the evaluation is successul doesn't really matter - // here -- we will create a global variable in any case, and that - // will have the state of initializer evaluation attached. - APValue V; - SmallVector<PartialDiagnosticAt> Notes; - (void)Init->EvaluateAsInitializer(V, Ctx.getASTContext(), VD, Notes, - true); - return this->visitDeclRef(D, E); - } - return revisit(VD); - } + // Try to lazily visit (or emit dummy pointers for) declarations + // we haven't seen yet. + // For C. + if (!Ctx.getLangOpts().CPlusPlus) { + if (const auto *VD = dyn_cast<VarDecl>(D); + VD && VD->getAnyInitializer() && + VD->getType().isConstant(Ctx.getASTContext()) && !VD->isWeak()) + return revisit(VD); + return this->emitDummyPtr(D, E); + } + + // ... and C++. 
+  const auto *VD = dyn_cast<VarDecl>(D);
+  if (!VD)
+    return this->emitDummyPtr(D, E);

-      // FIXME: The evaluateValue() check here is a little ridiculous, since
-      // it will ultimately call into Context::evaluateAsInitializer(). In
-      // other words, we're evaluating the initializer, just to know if we can
-      // evaluate the initializer.
-      if (VD->isLocalVarDecl() && typeShouldBeVisited(VD->getType()) &&
-          VD->getInit() && !VD->getInit()->isValueDependent()) {
+  const auto typeShouldBeVisited = [&](QualType T) -> bool {
+    if (T.isConstant(Ctx.getASTContext()))
+      return true;
+    return T->isReferenceType();
+  };

-        if (VD->evaluateValue())
-          return revisit(VD);
+  // DecompositionDecls are just proxies for us.
+  if (isa<DecompositionDecl>(VD))
+    return revisit(VD);
+
+  if ((VD->hasGlobalStorage() || VD->isStaticDataMember()) &&
+      typeShouldBeVisited(VD->getType())) {
+    if (const Expr *Init = VD->getAnyInitializer();
+        Init && !Init->isValueDependent()) {
+      // Whether or not the evaluation is successful doesn't really matter
+      // here -- we will create a global variable in any case, and that
+      // will have the state of initializer evaluation attached.
+      APValue V;
+      SmallVector<PartialDiagnosticAt> Notes;
+      (void)Init->EvaluateAsInitializer(V, Ctx.getASTContext(), VD, Notes,
+                                        true);
+      return this->visitDeclRef(D, E);
+    }
+    return revisit(VD);
+  }
+
+  // FIXME: The evaluateValue() check here is a little ridiculous, since
+  // it will ultimately call into Context::evaluateAsInitializer(). In
+  // other words, we're evaluating the initializer, just to know if we can
+  // evaluate the initializer.
+  if (VD->isLocalVarDecl() && typeShouldBeVisited(VD->getType()) &&
+      VD->getInit() && !VD->getInit()->isValueDependent()) {
+
+    if (VD->evaluateValue())
+      return revisit(VD);

-        if (!D->getType()->isReferenceType())
-          return this->emitDummyPtr(D, E);
+    if (!D->getType()->isReferenceType())
+      return this->emitDummyPtr(D, E);

-        return this->emitInvalidDeclRef(cast<DeclRefExpr>(E),
-                                        /*InitializerFailed=*/true, E);
-      }
-    }
-  } else {
-    if (const auto *VD = dyn_cast<VarDecl>(D);
-        VD && VD->getAnyInitializer() &&
-        VD->getType().isConstant(Ctx.getASTContext()) && !VD->isWeak())
-      return revisit(VD);
-  }
+    return this->emitInvalidDeclRef(cast<DeclRefExpr>(E),
+                                    /*InitializerFailed=*/true, E);
   }

   return this->emitDummyPtr(D, E);
diff --git clang/lib/AST/ByteCode/Disasm.cpp clang/lib/AST/ByteCode/Disasm.cpp
index 1aba778eaf7b..3c55c884a350 100644
--- clang/lib/AST/ByteCode/Disasm.cpp
+++ clang/lib/AST/ByteCode/Disasm.cpp
@@ -62,6 +62,12 @@ inline IntegralAP<true> ReadArg<IntegralAP<true>>(Program &P, CodePtr &OpPC) {
   return I;
 }

+template <> inline FixedPoint ReadArg<FixedPoint>(Program &P, CodePtr &OpPC) {
+  FixedPoint I = FixedPoint::deserialize(*OpPC);
+  OpPC += align(I.bytesToSerialize());
+  return I;
+}
+
 LLVM_DUMP_METHOD void Function::dump() const { dump(llvm::errs()); }

 LLVM_DUMP_METHOD void Function::dump(llvm::raw_ostream &OS) const {
diff --git clang/lib/AST/ByteCode/FixedPoint.h clang/lib/AST/ByteCode/FixedPoint.h
index ab8d6d7f02b5..fcb3c79cc109 100644
--- clang/lib/AST/ByteCode/FixedPoint.h
+++ clang/lib/AST/ByteCode/FixedPoint.h
@@ -91,6 +91,32 @@ public:
     return ComparisonCategoryResult::Greater;
   }

+  size_t bytesToSerialize() const {
+    return sizeof(uint32_t) + (V.getValue().getBitWidth() / CHAR_BIT);
+  }
+
+  void serialize(std::byte *Buff) const {
+    // Semantics followed by APInt.
+ uint32_t SemI = V.getSemantics().toOpaqueInt(); + std::memcpy(Buff, &SemI, sizeof(SemI)); + + llvm::APInt API = V.getValue(); + llvm::StoreIntToMemory(API, (uint8_t *)(Buff + sizeof(SemI)), + bitWidth() / 8); + } + + static FixedPoint deserialize(const std::byte *Buff) { + auto Sem = llvm::FixedPointSemantics::getFromOpaqueInt( + *reinterpret_cast<const uint32_t *>(Buff)); + unsigned BitWidth = Sem.getWidth(); + APInt I(BitWidth, 0ull, !Sem.isSigned()); + llvm::LoadIntFromMemory( + I, reinterpret_cast<const uint8_t *>(Buff + sizeof(uint32_t)), + BitWidth / CHAR_BIT); + + return FixedPoint(I, Sem); + } + static bool neg(const FixedPoint &A, FixedPoint *R) { bool Overflow = false; *R = FixedPoint(A.V.negate(&Overflow)); diff --git clang/lib/AST/ByteCode/Interp.cpp clang/lib/AST/ByteCode/Interp.cpp index cb0ce886f668..c765ebf5d618 100644 --- clang/lib/AST/ByteCode/Interp.cpp +++ clang/lib/AST/ByteCode/Interp.cpp @@ -321,7 +321,7 @@ bool CheckLive(InterpState &S, CodePtr OpPC, const Pointer &Ptr, if (Ptr.isDynamic()) { S.FFDiag(Src, diag::note_constexpr_access_deleted_object) << AK; - } else { + } else if (!S.checkingPotentialConstantExpression()) { bool IsTemp = Ptr.isTemporary(); S.FFDiag(Src, diag::note_constexpr_lifetime_ended, 1) << AK << !IsTemp; @@ -416,9 +416,11 @@ bool CheckRange(InterpState &S, CodePtr OpPC, const Pointer &Ptr, AccessKinds AK) { if (!Ptr.isOnePastEnd()) return true; - const SourceInfo &Loc = S.Current->getSource(OpPC); - S.FFDiag(Loc, diag::note_constexpr_access_past_end) - << AK << S.Current->getRange(OpPC); + if (S.getLangOpts().CPlusPlus) { + const SourceInfo &Loc = S.Current->getSource(OpPC); + S.FFDiag(Loc, diag::note_constexpr_access_past_end) + << AK << S.Current->getRange(OpPC); + } return false; } @@ -538,7 +540,7 @@ bool CheckInitialized(InterpState &S, CodePtr OpPC, const Pointer &Ptr, return true; if (const auto *VD = Ptr.getDeclDesc()->asVarDecl(); - VD && VD->hasGlobalStorage()) { + VD && (VD->isConstexpr() || VD->hasGlobalStorage())) { const SourceInfo &Loc = S.Current->getSource(OpPC); if (VD->getAnyInitializer()) { S.FFDiag(Loc, diag::note_constexpr_var_init_non_constant, 1) << VD; diff --git clang/lib/AST/ByteCode/Interp.h clang/lib/AST/ByteCode/Interp.h index 93a91976a31b..063970afec9e 100644 --- clang/lib/AST/ByteCode/Interp.h +++ clang/lib/AST/ByteCode/Interp.h @@ -2141,9 +2141,8 @@ inline bool CastFP(InterpState &S, CodePtr OpPC, const llvm::fltSemantics *Sem, } inline bool CastFixedPoint(InterpState &S, CodePtr OpPC, uint32_t FPS) { - FixedPointSemantics TargetSemantics(0, 0, false, false, false); - std::memcpy(&TargetSemantics, &FPS, sizeof(TargetSemantics)); - + FixedPointSemantics TargetSemantics = + FixedPointSemantics::getFromOpaqueInt(FPS); const auto &Source = S.Stk.pop<FixedPoint>(); bool Overflow; @@ -2271,8 +2270,7 @@ static inline bool CastIntegralFixedPoint(InterpState &S, CodePtr OpPC, uint32_t FPS) { const T &Int = S.Stk.pop<T>(); - FixedPointSemantics Sem(0, 0, false, false, false); - std::memcpy(&Sem, &FPS, sizeof(Sem)); + FixedPointSemantics Sem = FixedPointSemantics::getFromOpaqueInt(FPS); bool Overflow; FixedPoint Result = FixedPoint::from(Int.toAPSInt(), Sem, &Overflow); @@ -2288,8 +2286,7 @@ static inline bool CastFloatingFixedPoint(InterpState &S, CodePtr OpPC, uint32_t FPS) { const auto &Float = S.Stk.pop<Floating>(); - FixedPointSemantics Sem(0, 0, false, false, false); - std::memcpy(&Sem, &FPS, sizeof(Sem)); + FixedPointSemantics Sem = FixedPointSemantics::getFromOpaqueInt(FPS); bool Overflow; FixedPoint Result = 
FixedPoint::from(Float.getAPFloat(), Sem, &Overflow); @@ -2696,6 +2693,10 @@ template <PrimType Name, class T = typename PrimConv<Name>::T> inline bool GetIntPtr(InterpState &S, CodePtr OpPC, const Descriptor *Desc) { const T &IntVal = S.Stk.pop<T>(); + if (Desc) + S.CCEDiag(S.Current->getSource(OpPC), diag::note_constexpr_invalid_cast) + << 2 << S.getLangOpts().CPlusPlus; + S.Stk.push<Pointer>(static_cast<uint64_t>(IntVal), Desc); return true; } @@ -3073,6 +3074,13 @@ inline IntegralAP<true> ReadArg<IntegralAP<true>>(InterpState &S, return I; } +template <> +inline FixedPoint ReadArg<FixedPoint>(InterpState &S, CodePtr &OpPC) { + FixedPoint FP = FixedPoint::deserialize(*OpPC); + OpPC += align(FP.bytesToSerialize()); + return FP; +} + } // namespace interp } // namespace clang diff --git clang/lib/AST/ByteCode/Program.cpp clang/lib/AST/ByteCode/Program.cpp index c98a3506b0a9..7d8862d606ba 100644 --- clang/lib/AST/ByteCode/Program.cpp +++ clang/lib/AST/ByteCode/Program.cpp @@ -155,7 +155,7 @@ unsigned Program::getOrCreateDummy(const DeclTy &D) { QualType QT; bool IsWeak = false; - if (const auto *E = D.dyn_cast<const Expr *>()) { + if (const auto *E = dyn_cast<const Expr *>(D)) { QT = E->getType(); } else { const ValueDecl *VD = cast<ValueDecl>(cast<const Decl *>(D)); diff --git clang/lib/AST/Decl.cpp clang/lib/AST/Decl.cpp index 30341b046f95..f641a72ed264 100644 --- clang/lib/AST/Decl.cpp +++ clang/lib/AST/Decl.cpp @@ -2399,7 +2399,7 @@ Expr *VarDecl::getInit() { if (!hasInit()) return nullptr; - if (auto *S = Init.dyn_cast<Stmt *>()) + if (auto *S = dyn_cast<Stmt *>(Init)) return cast<Expr>(S); auto *Eval = getEvaluatedStmt(); diff --git clang/lib/Analysis/UnsafeBufferUsage.cpp clang/lib/Analysis/UnsafeBufferUsage.cpp index a9aff39df647..c064aa30e8ae 100644 --- clang/lib/Analysis/UnsafeBufferUsage.cpp +++ clang/lib/Analysis/UnsafeBufferUsage.cpp @@ -453,8 +453,13 @@ AST_MATCHER(ArraySubscriptExpr, isSafeArraySubscript) { return false; } - if (const auto *IdxLit = dyn_cast<IntegerLiteral>(Node.getIdx())) { - const APInt ArrIdx = IdxLit->getValue(); + Expr::EvalResult EVResult; + const Expr *IndexExpr = Node.getIdx(); + if (!IndexExpr->isValueDependent() && + IndexExpr->EvaluateAsInt(EVResult, Finder->getASTContext())) { + llvm::APSInt ArrIdx = EVResult.Val.getInt(); + // FIXME: ArrIdx.isNegative() we could immediately emit an error as that's a + // bug if (ArrIdx.isNonNegative() && ArrIdx.getLimitedValue() < limit) return true; } diff --git clang/lib/Basic/Cuda.cpp clang/lib/Basic/Cuda.cpp index d56609a2a8f2..b1461429d4f5 100644 --- clang/lib/Basic/Cuda.cpp +++ clang/lib/Basic/Cuda.cpp @@ -44,6 +44,9 @@ static const CudaVersionMapEntry CudaNameVersionMap[] = { CUDA_ENTRY(12, 4), CUDA_ENTRY(12, 5), CUDA_ENTRY(12, 6), + CUDA_ENTRY(12, 7), + CUDA_ENTRY(12, 8), + CUDA_ENTRY(12, 9), {"", CudaVersion::NEW, llvm::VersionTuple(std::numeric_limits<int>::max())}, {"unknown", CudaVersion::UNKNOWN, {}} // End of list tombstone. }; @@ -98,6 +101,7 @@ static const OffloadArchToStringMap arch_names[] = { SM(90), // Hopper SM(90a), // Hopper SM(100), // Blackwell + SM(100a), // Blackwell GFX(600), // gfx600 GFX(601), // gfx601 GFX(602), // gfx602 @@ -227,8 +231,8 @@ CudaVersion MinVersionForOffloadArch(OffloadArch A) { case OffloadArch::SM_90a: return CudaVersion::CUDA_120; case OffloadArch::SM_100: - return CudaVersion::NEW; // TODO: use specific CUDA version once it's - // public. 
+ case OffloadArch::SM_100a: + return CudaVersion::CUDA_127; default: llvm_unreachable("invalid enum"); } diff --git clang/lib/Basic/LangOptions.cpp clang/lib/Basic/LangOptions.cpp index 94caf6a3897b..e3037ec819ad 100644 --- clang/lib/Basic/LangOptions.cpp +++ clang/lib/Basic/LangOptions.cpp @@ -208,6 +208,8 @@ void LangOptions::setLangDefaults(LangOptions &Opts, Language Lang, // OpenCL and HLSL have half keyword Opts.Half = Opts.OpenCL || Opts.HLSL; + + Opts.PreserveVec3Type = Opts.HLSL; } FPOptions FPOptions::defaultWithoutTrailingStorage(const LangOptions &LO) { diff --git clang/lib/Basic/Targets.cpp clang/lib/Basic/Targets.cpp index fad3de217d81..281aebdb1c35 100644 --- clang/lib/Basic/Targets.cpp +++ clang/lib/Basic/Targets.cpp @@ -743,9 +743,6 @@ std::unique_ptr<TargetInfo> AllocateTarget(const llvm::Triple &Triple, case llvm::Triple::Linux: return std::make_unique<LinuxTargetInfo<LoongArch32TargetInfo>>(Triple, Opts); - case llvm::Triple::FreeBSD: - return std::make_unique<FreeBSDTargetInfo<LoongArch32TargetInfo>>(Triple, - Opts); default: return std::make_unique<LoongArch32TargetInfo>(Triple, Opts); } diff --git clang/lib/Basic/Targets/NVPTX.cpp clang/lib/Basic/Targets/NVPTX.cpp index dbc3fec36576..56efad90cb7c 100644 --- clang/lib/Basic/Targets/NVPTX.cpp +++ clang/lib/Basic/Targets/NVPTX.cpp @@ -285,6 +285,7 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts, case OffloadArch::SM_90a: return "900"; case OffloadArch::SM_100: + case OffloadArch::SM_100a: return "1000"; } llvm_unreachable("unhandled OffloadArch"); @@ -292,6 +293,8 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__CUDA_ARCH__", CUDAArchCode); if (GPU == OffloadArch::SM_90a) Builder.defineMacro("__CUDA_ARCH_FEAT_SM90_ALL", "1"); + if (GPU == OffloadArch::SM_100a) + Builder.defineMacro("__CUDA_ARCH_FEAT_SM100_ALL", "1"); } } diff --git clang/lib/Basic/Targets/OSTargets.h clang/lib/Basic/Targets/OSTargets.h index ba9acc8b2a05..991efd2bde01 100644 --- clang/lib/Basic/Targets/OSTargets.h +++ clang/lib/Basic/Targets/OSTargets.h @@ -250,11 +250,8 @@ public: case llvm::Triple::arm: this->MCountName = "__mcount"; break; - case llvm::Triple::riscv32: - case llvm::Triple::riscv64: - break; - case llvm::Triple::loongarch32: case llvm::Triple::loongarch64: + case llvm::Triple::riscv64: break; } } diff --git clang/lib/Basic/Targets/SystemZ.cpp clang/lib/Basic/Targets/SystemZ.cpp index 06f08db2eadd..c836d110d26d 100644 --- clang/lib/Basic/Targets/SystemZ.cpp +++ clang/lib/Basic/Targets/SystemZ.cpp @@ -105,6 +105,7 @@ static constexpr ISANameRevision ISARevisions[] = { {{"arch12"}, 12}, {{"z14"}, 12}, {{"arch13"}, 13}, {{"z15"}, 13}, {{"arch14"}, 14}, {{"z16"}, 14}, + {{"arch15"}, 15}, }; int SystemZTargetInfo::getISARevision(StringRef Name) const { @@ -133,6 +134,7 @@ bool SystemZTargetInfo::hasFeature(StringRef Feature) const { .Case("arch12", ISARevision >= 12) .Case("arch13", ISARevision >= 13) .Case("arch14", ISARevision >= 14) + .Case("arch15", ISARevision >= 15) .Case("htm", HasTransactionalExecution) .Case("vx", HasVector) .Default(false); @@ -167,7 +169,7 @@ void SystemZTargetInfo::getTargetDefines(const LangOptions &Opts, if (HasVector) Builder.defineMacro("__VX__"); if (Opts.ZVector) - Builder.defineMacro("__VEC__", "10304"); + Builder.defineMacro("__VEC__", "10305"); } ArrayRef<Builtin::Info> SystemZTargetInfo::getTargetBuiltins() const { diff --git clang/lib/Basic/Targets/SystemZ.h clang/lib/Basic/Targets/SystemZ.h index e6405f174f66..d05948586c46 100644 --- 
clang/lib/Basic/Targets/SystemZ.h +++ clang/lib/Basic/Targets/SystemZ.h @@ -186,6 +186,10 @@ public: Features["vector-enhancements-2"] = true; if (ISARevision >= 14) Features["nnp-assist"] = true; + if (ISARevision >= 15) { + Features["miscellaneous-extensions-4"] = true; + Features["vector-enhancements-3"] = true; + } return TargetInfo::initFeatureMap(Features, Diags, CPU, FeaturesVec); } diff --git clang/lib/CodeGen/ABIInfo.cpp clang/lib/CodeGen/ABIInfo.cpp index 642bca9e8b76..cda8a494f6c2 100644 --- clang/lib/CodeGen/ABIInfo.cpp +++ clang/lib/CodeGen/ABIInfo.cpp @@ -236,6 +236,14 @@ void ABIInfo::appendAttributeMangling(StringRef AttrStr, } } +llvm::FixedVectorType * +ABIInfo::getOptimalVectorMemoryType(llvm::FixedVectorType *T, + const LangOptions &Opt) const { + if (T->getNumElements() == 3 && !Opt.PreserveVec3Type) + return llvm::FixedVectorType::get(T->getElementType(), 4); + return T; +} + // Pin the vtable to this file. SwiftABIInfo::~SwiftABIInfo() = default; diff --git clang/lib/CodeGen/ABIInfo.h clang/lib/CodeGen/ABIInfo.h index b8a8de57e5b9..213e7879c316 100644 --- clang/lib/CodeGen/ABIInfo.h +++ clang/lib/CodeGen/ABIInfo.h @@ -20,6 +20,7 @@ class Value; class LLVMContext; class DataLayout; class Type; +class FixedVectorType; } // namespace llvm namespace clang { @@ -123,6 +124,13 @@ public: raw_ostream &Out) const; virtual void appendAttributeMangling(StringRef AttrStr, raw_ostream &Out) const; + + /// Returns the optimal vector memory type based on the given vector type. For + /// example, on certain targets, a vector with 3 elements might be promoted to + /// one with 4 elements to improve performance. + virtual llvm::FixedVectorType * + getOptimalVectorMemoryType(llvm::FixedVectorType *T, + const LangOptions &Opt) const; }; /// Target specific hooks for defining how a type should be passed or returned diff --git clang/lib/CodeGen/CGBuiltin.cpp clang/lib/CodeGen/CGBuiltin.cpp index b80833fd9188..c26b81306fb7 100644 --- clang/lib/CodeGen/CGBuiltin.cpp +++ clang/lib/CodeGen/CGBuiltin.cpp @@ -68,6 +68,7 @@ #include "llvm/TargetParser/RISCVISAInfo.h" #include "llvm/TargetParser/RISCVTargetParser.h" #include "llvm/TargetParser/X86TargetParser.h" +#include <numeric> #include <optional> #include <utility> @@ -7307,7 +7308,6 @@ static const ARMVectorIntrinsicInfo ARMSIMDIntrinsicMap [] = { }; static const ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = { - NEONMAP1(__a64_vcvtq_low_bf16_f32, aarch64_neon_bfcvtn, 0), NEONMAP0(splat_lane_v), NEONMAP0(splat_laneq_v), NEONMAP0(splatq_lane_v), @@ -7407,7 +7407,8 @@ static const ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = { NEONMAP0(vcvtq_f16_s16), NEONMAP0(vcvtq_f16_u16), NEONMAP0(vcvtq_f32_v), - NEONMAP1(vcvtq_high_bf16_f32, aarch64_neon_bfcvtn2, 0), + NEONMAP0(vcvtq_high_bf16_f32), + NEONMAP0(vcvtq_low_bf16_f32), NEONMAP1(vcvtq_n_f16_s16, aarch64_neon_vcvtfxs2fp, 0), NEONMAP1(vcvtq_n_f16_u16, aarch64_neon_vcvtfxu2fp, 0), NEONMAP2(vcvtq_n_f32_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0), @@ -7616,7 +7617,7 @@ static const ARMVectorIntrinsicInfo AArch64SISDIntrinsicMap[] = { NEONMAP1(vcvtd_n_u64_f64, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType), NEONMAP1(vcvtd_s64_f64, aarch64_neon_fcvtzs, AddRetType | Add1ArgType), NEONMAP1(vcvtd_u64_f64, aarch64_neon_fcvtzu, AddRetType | Add1ArgType), - NEONMAP1(vcvth_bf16_f32, aarch64_neon_bfcvt, 0), + NEONMAP0(vcvth_bf16_f32), NEONMAP1(vcvtmd_s64_f64, aarch64_neon_fcvtms, AddRetType | Add1ArgType), NEONMAP1(vcvtmd_u64_f64, aarch64_neon_fcvtmu, AddRetType | Add1ArgType), 
NEONMAP1(vcvtms_s32_f32, aarch64_neon_fcvtms, AddRetType | Add1ArgType), @@ -12083,6 +12084,12 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, return ConstantInt::get(Builder.getInt32Ty(), 0); } + if (BuiltinID == NEON::BI__builtin_neon_vcvth_bf16_f32) + return Builder.CreateFPTrunc( + Builder.CreateBitCast(EmitScalarExpr(E->getArg(0)), + Builder.getFloatTy()), + Builder.getBFloatTy()); + // Handle MSVC intrinsics before argument evaluation to prevent double // evaluation. if (std::optional<MSVCIntrin> MsvcIntId = @@ -12808,6 +12815,35 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)), "vgetq_lane"); } + case NEON::BI__builtin_neon_vcvt_bf16_f32: { + llvm::Type *V4F32 = FixedVectorType::get(Builder.getFloatTy(), 4); + llvm::Type *V4BF16 = FixedVectorType::get(Builder.getBFloatTy(), 4); + return Builder.CreateFPTrunc(Builder.CreateBitCast(Ops[0], V4F32), V4BF16); + } + case NEON::BI__builtin_neon_vcvtq_low_bf16_f32: { + SmallVector<int, 16> ConcatMask(8); + std::iota(ConcatMask.begin(), ConcatMask.end(), 0); + llvm::Type *V4F32 = FixedVectorType::get(Builder.getFloatTy(), 4); + llvm::Type *V4BF16 = FixedVectorType::get(Builder.getBFloatTy(), 4); + llvm::Value *Trunc = + Builder.CreateFPTrunc(Builder.CreateBitCast(Ops[0], V4F32), V4BF16); + return Builder.CreateShuffleVector( + Trunc, ConstantAggregateZero::get(V4BF16), ConcatMask); + } + case NEON::BI__builtin_neon_vcvtq_high_bf16_f32: { + SmallVector<int, 16> ConcatMask(8); + std::iota(ConcatMask.begin(), ConcatMask.end(), 0); + SmallVector<int, 16> LoMask(4); + std::iota(LoMask.begin(), LoMask.end(), 0); + llvm::Type *V4F32 = FixedVectorType::get(Builder.getFloatTy(), 4); + llvm::Type *V4BF16 = FixedVectorType::get(Builder.getBFloatTy(), 4); + llvm::Type *V8BF16 = FixedVectorType::get(Builder.getBFloatTy(), 8); + llvm::Value *Inactive = Builder.CreateShuffleVector( + Builder.CreateBitCast(Ops[0], V8BF16), LoMask); + llvm::Value *Trunc = + Builder.CreateFPTrunc(Builder.CreateBitCast(Ops[1], V4F32), V4BF16); + return Builder.CreateShuffleVector(Inactive, Trunc, ConcatMask); + } case clang::AArch64::BI_InterlockedAdd: case clang::AArch64::BI_InterlockedAdd64: { @@ -20601,7 +20637,8 @@ Value *CodeGenFunction::EmitSystemZBuiltinExpr(unsigned BuiltinID, case SystemZ::BI__builtin_s390_vclzb: case SystemZ::BI__builtin_s390_vclzh: case SystemZ::BI__builtin_s390_vclzf: - case SystemZ::BI__builtin_s390_vclzg: { + case SystemZ::BI__builtin_s390_vclzg: + case SystemZ::BI__builtin_s390_vclzq: { llvm::Type *ResultType = ConvertType(E->getType()); Value *X = EmitScalarExpr(E->getArg(0)); Value *Undef = ConstantInt::get(Builder.getInt1Ty(), false); @@ -20612,7 +20649,8 @@ Value *CodeGenFunction::EmitSystemZBuiltinExpr(unsigned BuiltinID, case SystemZ::BI__builtin_s390_vctzb: case SystemZ::BI__builtin_s390_vctzh: case SystemZ::BI__builtin_s390_vctzf: - case SystemZ::BI__builtin_s390_vctzg: { + case SystemZ::BI__builtin_s390_vctzg: + case SystemZ::BI__builtin_s390_vctzq: { llvm::Type *ResultType = ConvertType(E->getType()); Value *X = EmitScalarExpr(E->getArg(0)); Value *Undef = ConstantInt::get(Builder.getInt1Ty(), false); @@ -20856,7 +20894,8 @@ Value *CodeGenFunction::EmitSystemZBuiltinExpr(unsigned BuiltinID, case SystemZ::BI__builtin_s390_vlbrh: case SystemZ::BI__builtin_s390_vlbrf: - case SystemZ::BI__builtin_s390_vlbrg: { + case SystemZ::BI__builtin_s390_vlbrg: + case SystemZ::BI__builtin_s390_vlbrq: { llvm::Type *ResultType = 
ConvertType(E->getType()); Value *X = EmitScalarExpr(E->getArg(0)); Function *F = CGM.getIntrinsic(Intrinsic::bswap, ResultType); @@ -20881,16 +20920,19 @@ Value *CodeGenFunction::EmitSystemZBuiltinExpr(unsigned BuiltinID, INTRINSIC_WITH_CC(s390_vceqhs); INTRINSIC_WITH_CC(s390_vceqfs); INTRINSIC_WITH_CC(s390_vceqgs); + INTRINSIC_WITH_CC(s390_vceqqs); INTRINSIC_WITH_CC(s390_vchbs); INTRINSIC_WITH_CC(s390_vchhs); INTRINSIC_WITH_CC(s390_vchfs); INTRINSIC_WITH_CC(s390_vchgs); + INTRINSIC_WITH_CC(s390_vchqs); INTRINSIC_WITH_CC(s390_vchlbs); INTRINSIC_WITH_CC(s390_vchlhs); INTRINSIC_WITH_CC(s390_vchlfs); INTRINSIC_WITH_CC(s390_vchlgs); + INTRINSIC_WITH_CC(s390_vchlqs); INTRINSIC_WITH_CC(s390_vfaebs); INTRINSIC_WITH_CC(s390_vfaehs); diff --git clang/lib/CodeGen/CGDebugInfo.cpp clang/lib/CodeGen/CGDebugInfo.cpp index f88f56c98186..6cbcaf038441 100644 --- clang/lib/CodeGen/CGDebugInfo.cpp +++ clang/lib/CodeGen/CGDebugInfo.cpp @@ -2016,13 +2016,15 @@ llvm::DISubroutineType *CGDebugInfo::getOrCreateInstanceMethodType( // First element is always return type. For 'void' functions it is NULL. Elts.push_back(Args[0]); - // "this" pointer is always first argument. - // ThisPtr may be null if the member function has an explicit 'this' - // parameter. - if (!ThisPtr.isNull()) { + const bool HasExplicitObjectParameter = ThisPtr.isNull(); + + // "this" pointer is always first argument. For explicit "this" + // parameters, it will already be in Args[1]. + if (!HasExplicitObjectParameter) { llvm::DIType *ThisPtrType = getOrCreateType(ThisPtr, Unit); TypeCache[ThisPtr.getAsOpaquePtr()].reset(ThisPtrType); - ThisPtrType = DBuilder.createObjectPointerType(ThisPtrType); + ThisPtrType = + DBuilder.createObjectPointerType(ThisPtrType, /*Implicit=*/true); Elts.push_back(ThisPtrType); } @@ -2030,6 +2032,13 @@ llvm::DISubroutineType *CGDebugInfo::getOrCreateInstanceMethodType( for (unsigned i = 1, e = Args.size(); i != e; ++i) Elts.push_back(Args[i]); + // Attach FlagObjectPointer to the explicit "this" parameter. + if (HasExplicitObjectParameter) { + assert(Elts.size() >= 2 && Args.size() >= 2 && + "Expected at least return type and object parameter."); + Elts[1] = DBuilder.createObjectPointerType(Args[1], /*Implicit=*/false); + } + llvm::DITypeRefArray EltTypeArray = DBuilder.getOrCreateTypeArray(Elts); return DBuilder.createSubroutineType(EltTypeArray, OriginalFunc->getFlags(), @@ -5118,7 +5127,7 @@ llvm::DIType *CGDebugInfo::CreateSelfType(const QualType &QualTy, llvm::DIType *CachedTy = getTypeOrNull(QualTy); if (CachedTy) Ty = CachedTy; - return DBuilder.createObjectPointerType(Ty); + return DBuilder.createObjectPointerType(Ty, /*Implicit=*/true); } void CGDebugInfo::EmitDeclareOfBlockDeclRefVariable( diff --git clang/lib/CodeGen/CGExpr.cpp clang/lib/CodeGen/CGExpr.cpp index 9a9a8c7f6eae..054f8d1eadb8 100644 --- clang/lib/CodeGen/CGExpr.cpp +++ clang/lib/CodeGen/CGExpr.cpp @@ -46,6 +46,7 @@ #include "llvm/Support/xxhash.h" #include "llvm/Transforms/Utils/SanitizerStats.h" +#include <numeric> #include <optional> #include <string> @@ -2002,20 +2003,19 @@ llvm::Value *CodeGenFunction::EmitLoadOfScalar(Address Addr, bool Volatile, return EmitFromMemory(V, Ty); } - // Handle vectors of size 3 like size 4 for better performance. 
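// NOTE (CGDebugInfo hunk above): for a C++23 explicit object parameter
// ("deducing this") there is no implicit this slot, so the object parameter
// already sits in Args[1]; the change marks that entry as the object pointer
// with Implicit=false instead of synthesizing one. Illustrative source:
//
//   struct S {
//     void f(this S &self);  // "self" becomes the object pointer in DWARF
//   };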
- const llvm::Type *EltTy = Addr.getElementType(); - const auto *VTy = cast<llvm::FixedVectorType>(EltTy); - - if (!CGM.getCodeGenOpts().PreserveVec3Type && VTy->getNumElements() == 3) { - - llvm::VectorType *vec4Ty = - llvm::FixedVectorType::get(VTy->getElementType(), 4); - Address Cast = Addr.withElementType(vec4Ty); - // Now load value. - llvm::Value *V = Builder.CreateLoad(Cast, Volatile, "loadVec4"); - - // Shuffle vector to get vec3. - V = Builder.CreateShuffleVector(V, ArrayRef<int>{0, 1, 2}, "extractVec"); + // Handles vectors of sizes that are likely to be expanded to a larger size + // to optimize performance. + auto *VTy = cast<llvm::FixedVectorType>(Addr.getElementType()); + auto *NewVecTy = + CGM.getABIInfo().getOptimalVectorMemoryType(VTy, getLangOpts()); + + if (VTy != NewVecTy) { + Address Cast = Addr.withElementType(NewVecTy); + llvm::Value *V = Builder.CreateLoad(Cast, Volatile, "loadVecN"); + unsigned OldNumElements = VTy->getNumElements(); + SmallVector<int, 16> Mask(OldNumElements); + std::iota(Mask.begin(), Mask.end(), 0); + V = Builder.CreateShuffleVector(V, Mask, "extractVec"); return EmitFromMemory(V, Ty); } } @@ -2145,21 +2145,21 @@ void CodeGenFunction::EmitStoreOfScalar(llvm::Value *Value, Address Addr, Addr = Addr.withPointer(Builder.CreateThreadLocalAddress(GV), NotKnownNonNull); + // Handles vectors of sizes that are likely to be expanded to a larger size + // to optimize performance. llvm::Type *SrcTy = Value->getType(); if (const auto *ClangVecTy = Ty->getAs<VectorType>()) { - auto *VecTy = dyn_cast<llvm::FixedVectorType>(SrcTy); - if (!CGM.getCodeGenOpts().PreserveVec3Type) { - // Handle vec3 special. - if (VecTy && !ClangVecTy->isExtVectorBoolType() && - cast<llvm::FixedVectorType>(VecTy)->getNumElements() == 3) { - // Our source is a vec3, do a shuffle vector to make it a vec4. 
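// NOTE: The rewritten load path above no longer hard-codes the {0, 1, 2}
// vec3 mask; it loads through whatever memory type the ABI reports, then
// shuffles back down to the original element count. For a <3 x float> load
// widened to <4 x float>, the mask construction yields:
//
//   SmallVector<int, 16> Mask(3);
//   std::iota(Mask.begin(), Mask.end(), 0);   // Mask == {0, 1, 2}
//
// which generalizes to any narrow-to-wide pairing a target may return.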
- Value = Builder.CreateShuffleVector(Value, ArrayRef<int>{0, 1, 2, -1}, - "extractVec"); - SrcTy = llvm::FixedVectorType::get(VecTy->getElementType(), 4); + if (auto *VecTy = dyn_cast<llvm::FixedVectorType>(SrcTy)) { + auto *NewVecTy = + CGM.getABIInfo().getOptimalVectorMemoryType(VecTy, getLangOpts()); + if (!ClangVecTy->isExtVectorBoolType() && VecTy != NewVecTy) { + SmallVector<int, 16> Mask(NewVecTy->getNumElements(), -1); + std::iota(Mask.begin(), Mask.begin() + VecTy->getNumElements(), 0); + Value = Builder.CreateShuffleVector(Value, Mask, "extractVec"); + SrcTy = NewVecTy; } - if (Addr.getElementType() != SrcTy) { + if (Addr.getElementType() != SrcTy) Addr = Addr.withElementType(SrcTy); - } } } diff --git clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index 87c3635ed3f7..c13928f61a74 100644 --- clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -2277,6 +2277,7 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) { case OffloadArch::SM_90: case OffloadArch::SM_90a: case OffloadArch::SM_100: + case OffloadArch::SM_100a: case OffloadArch::GFX600: case OffloadArch::GFX601: case OffloadArch::GFX602: diff --git clang/lib/CodeGen/CodeGenModule.cpp clang/lib/CodeGen/CodeGenModule.cpp index dfb51b11e1d8..eb8d3ceeeba4 100644 --- clang/lib/CodeGen/CodeGenModule.cpp +++ clang/lib/CodeGen/CodeGenModule.cpp @@ -120,6 +120,8 @@ createTargetCodeGenInfo(CodeGenModule &CGM) { case llvm::Triple::mipsel: if (Triple.getOS() == llvm::Triple::NaCl) return createPNaClTargetCodeGenInfo(CGM); + else if (Triple.getOS() == llvm::Triple::Win32) + return createWindowsMIPSTargetCodeGenInfo(CGM, /*IsOS32=*/true); return createMIPSTargetCodeGenInfo(CGM, /*IsOS32=*/true); case llvm::Triple::mips64: diff --git clang/lib/CodeGen/TargetInfo.h clang/lib/CodeGen/TargetInfo.h index ab3142bdea68..4a66683a3b91 100644 --- clang/lib/CodeGen/TargetInfo.h +++ clang/lib/CodeGen/TargetInfo.h @@ -522,6 +522,9 @@ createM68kTargetCodeGenInfo(CodeGenModule &CGM); std::unique_ptr<TargetCodeGenInfo> createMIPSTargetCodeGenInfo(CodeGenModule &CGM, bool IsOS32); +std::unique_ptr<TargetCodeGenInfo> +createWindowsMIPSTargetCodeGenInfo(CodeGenModule &CGM, bool IsOS32); + std::unique_ptr<TargetCodeGenInfo> createMSP430TargetCodeGenInfo(CodeGenModule &CGM); diff --git clang/lib/CodeGen/Targets/AMDGPU.cpp clang/lib/CodeGen/Targets/AMDGPU.cpp index fa07e68c5583..788eac5f2823 100644 --- clang/lib/CodeGen/Targets/AMDGPU.cpp +++ clang/lib/CodeGen/Targets/AMDGPU.cpp @@ -52,6 +52,17 @@ public: void computeInfo(CGFunctionInfo &FI) const override; RValue EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, QualType Ty, AggValueSlot Slot) const override; + + llvm::FixedVectorType * + getOptimalVectorMemoryType(llvm::FixedVectorType *T, + const LangOptions &Opt) const override { + // We have legal instructions for 96-bit so 3x32 can be supported. + // FIXME: This check should be a subtarget feature as technically SI doesn't + // support it. 
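// NOTE: The store path mirrors the load path, but pads the value with undef
// lanes (-1 in a shuffle mask) up to the memory type before storing. For a
// <3 x float> value stored as <4 x float>:
//
//   SmallVector<int, 16> Mask(4, -1);              // {-1, -1, -1, -1}
//   std::iota(Mask.begin(), Mask.begin() + 3, 0);  // { 0,  1,  2, -1}
//
// The AMDGPU override whose body continues just below then opts out of the
// widening for true 96-bit vectors, since the target has legal 96-bit loads
// and stores (modulo the SI caveat in the FIXME).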
+ if (T->getNumElements() == 3 && getDataLayout().getTypeSizeInBits(T) == 96) + return T; + return DefaultABIInfo::getOptimalVectorMemoryType(T, Opt); + } }; bool AMDGPUABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const { diff --git clang/lib/CodeGen/Targets/Mips.cpp clang/lib/CodeGen/Targets/Mips.cpp index 06d9b6d4a576..771a85c84b35 100644 --- clang/lib/CodeGen/Targets/Mips.cpp +++ clang/lib/CodeGen/Targets/Mips.cpp @@ -105,6 +105,23 @@ public: return SizeOfUnwindException; } }; + +class WindowsMIPSTargetCodeGenInfo : public MIPSTargetCodeGenInfo { +public: + WindowsMIPSTargetCodeGenInfo(CodeGenTypes &CGT, bool IsO32) + : MIPSTargetCodeGenInfo(CGT, IsO32) {} + + void getDependentLibraryOption(llvm::StringRef Lib, + llvm::SmallString<24> &Opt) const override { + Opt = "/DEFAULTLIB:"; + Opt += qualifyWindowsLibrary(Lib); + } + + void getDetectMismatchOption(llvm::StringRef Name, llvm::StringRef Value, + llvm::SmallString<32> &Opt) const override { + Opt = "/FAILIFMISMATCH:\"" + Name.str() + "=" + Value.str() + "\""; + } +}; } void MipsABIInfo::CoerceToIntArgs( @@ -436,3 +453,8 @@ std::unique_ptr<TargetCodeGenInfo> CodeGen::createMIPSTargetCodeGenInfo(CodeGenModule &CGM, bool IsOS32) { return std::make_unique<MIPSTargetCodeGenInfo>(CGM.getTypes(), IsOS32); } + +std::unique_ptr<TargetCodeGenInfo> +CodeGen::createWindowsMIPSTargetCodeGenInfo(CodeGenModule &CGM, bool IsOS32) { + return std::make_unique<WindowsMIPSTargetCodeGenInfo>(CGM.getTypes(), IsOS32); +} diff --git clang/lib/Driver/OffloadBundler.cpp clang/lib/Driver/OffloadBundler.cpp index 2d6bdff0393b..12d763e5c65b 100644 --- clang/lib/Driver/OffloadBundler.cpp +++ clang/lib/Driver/OffloadBundler.cpp @@ -935,7 +935,8 @@ CreateFileHandler(MemoryBuffer &FirstInput, "'" + FilesType + "': invalid file type specified"); } -OffloadBundlerConfig::OffloadBundlerConfig() { +OffloadBundlerConfig::OffloadBundlerConfig() + : CompressedBundleVersion(CompressedOffloadBundle::DefaultVersion) { if (llvm::compression::zstd::isAvailable()) { CompressionFormat = llvm::compression::Format::Zstd; // Compression level 3 is usually sufficient for zstd since long distance @@ -951,16 +952,13 @@ OffloadBundlerConfig::OffloadBundlerConfig() { llvm::sys::Process::GetEnv("OFFLOAD_BUNDLER_IGNORE_ENV_VAR"); if (IgnoreEnvVarOpt.has_value() && IgnoreEnvVarOpt.value() == "1") return; - auto VerboseEnvVarOpt = llvm::sys::Process::GetEnv("OFFLOAD_BUNDLER_VERBOSE"); if (VerboseEnvVarOpt.has_value()) Verbose = VerboseEnvVarOpt.value() == "1"; - auto CompressEnvVarOpt = llvm::sys::Process::GetEnv("OFFLOAD_BUNDLER_COMPRESS"); if (CompressEnvVarOpt.has_value()) Compress = CompressEnvVarOpt.value() == "1"; - auto CompressionLevelEnvVarOpt = llvm::sys::Process::GetEnv("OFFLOAD_BUNDLER_COMPRESSION_LEVEL"); if (CompressionLevelEnvVarOpt.has_value()) { @@ -973,6 +971,26 @@ OffloadBundlerConfig::OffloadBundlerConfig() { << "Warning: Invalid value for OFFLOAD_BUNDLER_COMPRESSION_LEVEL: " << CompressionLevelStr.str() << ". Ignoring it.\n"; } + auto CompressedBundleFormatVersionOpt = + llvm::sys::Process::GetEnv("COMPRESSED_BUNDLE_FORMAT_VERSION"); + if (CompressedBundleFormatVersionOpt.has_value()) { + llvm::StringRef VersionStr = CompressedBundleFormatVersionOpt.value(); + uint16_t Version; + if (!VersionStr.getAsInteger(10, Version)) { + if (Version >= 2 && Version <= 3) + CompressedBundleVersion = Version; + else + llvm::errs() + << "Warning: Invalid value for COMPRESSED_BUNDLE_FORMAT_VERSION: " + << VersionStr.str() + << ". Valid values are 2 or 3. 
Using default version " + << CompressedBundleVersion << ".\n"; + } else + llvm::errs() + << "Warning: Invalid value for COMPRESSED_BUNDLE_FORMAT_VERSION: " + << VersionStr.str() << ". Using default version " + << CompressedBundleVersion << ".\n"; + } } // Utility function to format numbers with commas @@ -989,12 +1007,11 @@ static std::string formatWithCommas(unsigned long long Value) { llvm::Expected<std::unique_ptr<llvm::MemoryBuffer>> CompressedOffloadBundle::compress(llvm::compression::Params P, const llvm::MemoryBuffer &Input, - bool Verbose) { + uint16_t Version, bool Verbose) { if (!llvm::compression::zstd::isAvailable() && !llvm::compression::zlib::isAvailable()) return createStringError(llvm::inconvertibleErrorCode(), "Compression not supported"); - llvm::Timer HashTimer("Hash Calculation Timer", "Hash calculation time", *ClangOffloadBundlerTimerGroup); if (Verbose) @@ -1011,7 +1028,6 @@ CompressedOffloadBundle::compress(llvm::compression::Params P, auto BufferUint8 = llvm::ArrayRef<uint8_t>( reinterpret_cast<const uint8_t *>(Input.getBuffer().data()), Input.getBuffer().size()); - llvm::Timer CompressTimer("Compression Timer", "Compression time", *ClangOffloadBundlerTimerGroup); if (Verbose) @@ -1021,11 +1037,31 @@ CompressedOffloadBundle::compress(llvm::compression::Params P, CompressTimer.stopTimer(); uint16_t CompressionMethod = static_cast<uint16_t>(P.format); - uint32_t UncompressedSize = Input.getBuffer().size(); - uint32_t TotalFileSize = MagicNumber.size() + sizeof(TotalFileSize) + - sizeof(Version) + sizeof(CompressionMethod) + - sizeof(UncompressedSize) + sizeof(TruncatedHash) + - CompressedBuffer.size(); + + // Store sizes in 64-bit variables first + uint64_t UncompressedSize64 = Input.getBuffer().size(); + uint64_t TotalFileSize64; + + // Calculate total file size based on version + if (Version == 2) { + // For V2, ensure the sizes don't exceed 32-bit limit + if (UncompressedSize64 > std::numeric_limits<uint32_t>::max()) + return createStringError(llvm::inconvertibleErrorCode(), + "Uncompressed size exceeds version 2 limit"); + if ((MagicNumber.size() + sizeof(uint32_t) + sizeof(Version) + + sizeof(CompressionMethod) + sizeof(uint32_t) + sizeof(TruncatedHash) + + CompressedBuffer.size()) > std::numeric_limits<uint32_t>::max()) + return createStringError(llvm::inconvertibleErrorCode(), + "Total file size exceeds version 2 limit"); + + TotalFileSize64 = MagicNumber.size() + sizeof(uint32_t) + sizeof(Version) + + sizeof(CompressionMethod) + sizeof(uint32_t) + + sizeof(TruncatedHash) + CompressedBuffer.size(); + } else { // Version 3 + TotalFileSize64 = MagicNumber.size() + sizeof(uint64_t) + sizeof(Version) + + sizeof(CompressionMethod) + sizeof(uint64_t) + + sizeof(TruncatedHash) + CompressedBuffer.size(); + } SmallVector<char, 0> FinalBuffer; llvm::raw_svector_ostream OS(FinalBuffer); @@ -1033,10 +1069,22 @@ CompressedOffloadBundle::compress(llvm::compression::Params P, OS.write(reinterpret_cast<const char *>(&Version), sizeof(Version)); OS.write(reinterpret_cast<const char *>(&CompressionMethod), sizeof(CompressionMethod)); - OS.write(reinterpret_cast<const char *>(&TotalFileSize), - sizeof(TotalFileSize)); - OS.write(reinterpret_cast<const char *>(&UncompressedSize), - sizeof(UncompressedSize)); + + // Write size fields according to version + if (Version == 2) { + uint32_t TotalFileSize32 = static_cast<uint32_t>(TotalFileSize64); + uint32_t UncompressedSize32 = static_cast<uint32_t>(UncompressedSize64); + OS.write(reinterpret_cast<const char *>(&TotalFileSize32), + 
sizeof(TotalFileSize32)); + OS.write(reinterpret_cast<const char *>(&UncompressedSize32), + sizeof(UncompressedSize32)); + } else { // Version 3 + OS.write(reinterpret_cast<const char *>(&TotalFileSize64), + sizeof(TotalFileSize64)); + OS.write(reinterpret_cast<const char *>(&UncompressedSize64), + sizeof(UncompressedSize64)); + } + OS.write(reinterpret_cast<const char *>(&TruncatedHash), sizeof(TruncatedHash)); OS.write(reinterpret_cast<const char *>(CompressedBuffer.data()), @@ -1046,18 +1094,17 @@ CompressedOffloadBundle::compress(llvm::compression::Params P, auto MethodUsed = P.format == llvm::compression::Format::Zstd ? "zstd" : "zlib"; double CompressionRate = - static_cast<double>(UncompressedSize) / CompressedBuffer.size(); + static_cast<double>(UncompressedSize64) / CompressedBuffer.size(); double CompressionTimeSeconds = CompressTimer.getTotalTime().getWallTime(); double CompressionSpeedMBs = - (UncompressedSize / (1024.0 * 1024.0)) / CompressionTimeSeconds; - + (UncompressedSize64 / (1024.0 * 1024.0)) / CompressionTimeSeconds; llvm::errs() << "Compressed bundle format version: " << Version << "\n" << "Total file size (including headers): " - << formatWithCommas(TotalFileSize) << " bytes\n" + << formatWithCommas(TotalFileSize64) << " bytes\n" << "Compression method used: " << MethodUsed << "\n" << "Compression level: " << P.level << "\n" << "Binary size before compression: " - << formatWithCommas(UncompressedSize) << " bytes\n" + << formatWithCommas(UncompressedSize64) << " bytes\n" << "Binary size after compression: " << formatWithCommas(CompressedBuffer.size()) << " bytes\n" << "Compression rate: " @@ -1069,6 +1116,7 @@ CompressedOffloadBundle::compress(llvm::compression::Params P, << "Truncated MD5 hash: " << llvm::format_hex(TruncatedHash, 16) << "\n"; } + return llvm::MemoryBuffer::getMemBufferCopy( llvm::StringRef(FinalBuffer.data(), FinalBuffer.size())); } @@ -1076,9 +1124,9 @@ CompressedOffloadBundle::compress(llvm::compression::Params P, llvm::Expected<std::unique_ptr<llvm::MemoryBuffer>> CompressedOffloadBundle::decompress(const llvm::MemoryBuffer &Input, bool Verbose) { - StringRef Blob = Input.getBuffer(); + // Check minimum header size (using V1 as it's the smallest) if (Blob.size() < V1HeaderSize) return llvm::MemoryBuffer::getMemBufferCopy(Blob); @@ -1091,31 +1139,56 @@ CompressedOffloadBundle::decompress(const llvm::MemoryBuffer &Input, size_t CurrentOffset = MagicSize; + // Read version uint16_t ThisVersion; memcpy(&ThisVersion, Blob.data() + CurrentOffset, sizeof(uint16_t)); CurrentOffset += VersionFieldSize; + // Verify header size based on version + if (ThisVersion >= 2 && ThisVersion <= 3) { + size_t RequiredSize = (ThisVersion == 2) ? 
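// NOTE: Version 3 widens both size fields from uint32_t to uint64_t so that
// bundles larger than 4 GiB become representable; version 2 keeps 32-bit
// fields and now rejects inputs that would overflow them. Assuming the
// 4-byte magic, the fixed header sizes referenced just below work out to:
//
//   V2: 4 (magic) + 2 (version) + 2 (method) + 4 (total) + 4 (uncompressed)
//       + 8 (hash) = 24 bytes
//   V3: 4 (magic) + 2 (version) + 2 (method) + 8 (total) + 8 (uncompressed)
//       + 8 (hash) = 32 bytes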
V2HeaderSize : V3HeaderSize; + if (Blob.size() < RequiredSize) + return createStringError(inconvertibleErrorCode(), + "Compressed bundle header size too small"); + } + + // Read compression method uint16_t CompressionMethod; memcpy(&CompressionMethod, Blob.data() + CurrentOffset, sizeof(uint16_t)); CurrentOffset += MethodFieldSize; - uint32_t TotalFileSize; + // Read total file size (version 2+) + uint64_t TotalFileSize = 0; if (ThisVersion >= 2) { - if (Blob.size() < V2HeaderSize) - return createStringError(inconvertibleErrorCode(), - "Compressed bundle header size too small"); - memcpy(&TotalFileSize, Blob.data() + CurrentOffset, sizeof(uint32_t)); - CurrentOffset += FileSizeFieldSize; + if (ThisVersion == 2) { + uint32_t TotalFileSize32; + memcpy(&TotalFileSize32, Blob.data() + CurrentOffset, sizeof(uint32_t)); + TotalFileSize = TotalFileSize32; + CurrentOffset += FileSizeFieldSizeV2; + } else { // Version 3 + memcpy(&TotalFileSize, Blob.data() + CurrentOffset, sizeof(uint64_t)); + CurrentOffset += FileSizeFieldSizeV3; + } } - uint32_t UncompressedSize; - memcpy(&UncompressedSize, Blob.data() + CurrentOffset, sizeof(uint32_t)); - CurrentOffset += UncompressedSizeFieldSize; + // Read uncompressed size + uint64_t UncompressedSize = 0; + if (ThisVersion <= 2) { + uint32_t UncompressedSize32; + memcpy(&UncompressedSize32, Blob.data() + CurrentOffset, sizeof(uint32_t)); + UncompressedSize = UncompressedSize32; + CurrentOffset += UncompressedSizeFieldSizeV2; + } else { // Version 3 + memcpy(&UncompressedSize, Blob.data() + CurrentOffset, sizeof(uint64_t)); + CurrentOffset += UncompressedSizeFieldSizeV3; + } + // Read hash uint64_t StoredHash; memcpy(&StoredHash, Blob.data() + CurrentOffset, sizeof(uint64_t)); CurrentOffset += HashFieldSize; + // Determine compression format llvm::compression::Format CompressionFormat; if (CompressionMethod == static_cast<uint16_t>(llvm::compression::Format::Zlib)) @@ -1381,7 +1454,8 @@ Error OffloadBundler::BundleFiles() { auto CompressionResult = CompressedOffloadBundle::compress( {BundlerConfig.CompressionFormat, BundlerConfig.CompressionLevel, /*zstdEnableLdm=*/true}, - *BufferMemory, BundlerConfig.Verbose); + *BufferMemory, BundlerConfig.CompressedBundleVersion, + BundlerConfig.Verbose); if (auto Error = CompressionResult.takeError()) return Error; diff --git clang/lib/Driver/ToolChains/Cuda.cpp clang/lib/Driver/ToolChains/Cuda.cpp index 8967115bcc73..27e1969dabe5 100644 --- clang/lib/Driver/ToolChains/Cuda.cpp +++ clang/lib/Driver/ToolChains/Cuda.cpp @@ -89,6 +89,12 @@ CudaVersion getCudaVersion(uint32_t raw_version) { return CudaVersion::CUDA_125; if (raw_version < 12070) return CudaVersion::CUDA_126; + if (raw_version < 12080) + return CudaVersion::CUDA_127; + if (raw_version < 12090) + return CudaVersion::CUDA_128; + if (raw_version < 12100) + return CudaVersion::CUDA_129; return CudaVersion::NEW; } @@ -682,6 +688,9 @@ void NVPTX::getNVPTXTargetFeatures(const Driver &D, const llvm::Triple &Triple, case CudaVersion::CUDA_##CUDA_VER: \ PtxFeature = "+ptx" #PTX_VER; \ break; + CASE_CUDA_VERSION(129, 87); + CASE_CUDA_VERSION(128, 87); + CASE_CUDA_VERSION(127, 86); CASE_CUDA_VERSION(126, 85); CASE_CUDA_VERSION(125, 85); CASE_CUDA_VERSION(124, 84); diff --git clang/lib/Driver/ToolChains/Flang.cpp clang/lib/Driver/ToolChains/Flang.cpp index 86ed25badfa2..9c1fd28a3a8a 100644 --- clang/lib/Driver/ToolChains/Flang.cpp +++ clang/lib/Driver/ToolChains/Flang.cpp @@ -155,8 +155,10 @@ void Flang::addCodegenOptions(const ArgList &Args, 
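// NOTE (Cuda.cpp hunk above): the new entries follow the existing bucketing
// scheme, where a raw driver version is matched against an exclusive upper
// bound. Worked example: raw_version 12080 fails the "< 12080" test but
// passes "< 12090", so it maps to CUDA_128, which CASE_CUDA_VERSION(128, 87)
// then lowers to the "+ptx87" target feature.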
options::OPT_flang_deprecated_no_hlfir, options::OPT_fno_ppc_native_vec_elem_order, options::OPT_fppc_native_vec_elem_order, - options::OPT_ftime_report, options::OPT_ftime_report_EQ, - options::OPT_funroll_loops, options::OPT_fno_unroll_loops}); + options::OPT_finit_global_zero, + options::OPT_fno_init_global_zero, options::OPT_ftime_report, + options::OPT_ftime_report_EQ, options::OPT_funroll_loops, + options::OPT_fno_unroll_loops}); } void Flang::addPicOptions(const ArgList &Args, ArgStringList &CmdArgs) const { diff --git clang/lib/Driver/ToolChains/FreeBSD.cpp clang/lib/Driver/ToolChains/FreeBSD.cpp index 88a27e319282..a6d859f0ebfe 100644 --- clang/lib/Driver/ToolChains/FreeBSD.cpp +++ clang/lib/Driver/ToolChains/FreeBSD.cpp @@ -213,10 +213,6 @@ void freebsd::Linker::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("-m"); CmdArgs.push_back("elf64lriscv"); break; - case llvm::Triple::loongarch32: - CmdArgs.push_back("-m"); - CmdArgs.push_back("elf32loongarch"); - break; case llvm::Triple::loongarch64: CmdArgs.push_back("-m"); CmdArgs.push_back("elf64loongarch"); diff --git clang/lib/Format/ContinuationIndenter.cpp clang/lib/Format/ContinuationIndenter.cpp index 554b55fa75c9..c311deaa17bb 100644 --- clang/lib/Format/ContinuationIndenter.cpp +++ clang/lib/Format/ContinuationIndenter.cpp @@ -148,6 +148,7 @@ static bool startsNextOperand(const FormatToken &Current) { static bool mustBreakBinaryOperation(const FormatToken &Current, const FormatStyle &Style) { return Style.BreakBinaryOperations != FormatStyle::BBO_Never && + Current.CanBreakBefore && (Style.BreakBeforeBinaryOperators == FormatStyle::BOS_None ? startsNextOperand : isAlignableBinaryOperator)(Current); diff --git clang/lib/Format/Format.cpp clang/lib/Format/Format.cpp index fc60c5ec5eeb..c25d9bf7c225 100644 --- clang/lib/Format/Format.cpp +++ clang/lib/Format/Format.cpp @@ -1051,6 +1051,7 @@ template <> struct MappingTraits<FormatStyle> { IO.mapOptional("IndentAccessModifiers", Style.IndentAccessModifiers); IO.mapOptional("IndentCaseBlocks", Style.IndentCaseBlocks); IO.mapOptional("IndentCaseLabels", Style.IndentCaseLabels); + IO.mapOptional("IndentExportBlock", Style.IndentExportBlock); IO.mapOptional("IndentExternBlock", Style.IndentExternBlock); IO.mapOptional("IndentGotoLabels", Style.IndentGotoLabels); IO.mapOptional("IndentPPDirectives", Style.IndentPPDirectives); @@ -1565,6 +1566,7 @@ FormatStyle getLLVMStyle(FormatStyle::LanguageKind Language) { LLVMStyle.IndentAccessModifiers = false; LLVMStyle.IndentCaseBlocks = false; LLVMStyle.IndentCaseLabels = false; + LLVMStyle.IndentExportBlock = true; LLVMStyle.IndentExternBlock = FormatStyle::IEBS_AfterExternBlock; LLVMStyle.IndentGotoLabels = true; LLVMStyle.IndentPPDirectives = FormatStyle::PPDIS_None; diff --git clang/lib/Format/TokenAnnotator.h clang/lib/Format/TokenAnnotator.h index 16e920e8ad8a..6aea310a56d6 100644 --- clang/lib/Format/TokenAnnotator.h +++ clang/lib/Format/TokenAnnotator.h @@ -154,6 +154,11 @@ public: startsWith(tok::kw_export, tok::kw_namespace); } + /// \c true if this line starts a C++ export block. + bool startsWithExportBlock() const { + return startsWith(tok::kw_export, tok::l_brace); + } + FormatToken *getFirstNonComment() const { assert(First); return First->is(tok::comment) ? 
First->getNextNonComment() : First; diff --git clang/lib/Format/UnwrappedLineFormatter.cpp clang/lib/Format/UnwrappedLineFormatter.cpp index cee84fb1191a..46545aa1f4c0 100644 --- clang/lib/Format/UnwrappedLineFormatter.cpp +++ clang/lib/Format/UnwrappedLineFormatter.cpp @@ -432,8 +432,9 @@ private: // Try to merge a control statement block with left brace unwrapped. if (TheLine->Last->is(tok::l_brace) && FirstNonComment != TheLine->Last && - FirstNonComment->isOneOf(tok::kw_if, tok::kw_while, tok::kw_for, - TT_ForEachMacro)) { + (FirstNonComment->isOneOf(tok::kw_if, tok::kw_while, tok::kw_for, + TT_ForEachMacro) || + TheLine->startsWithExportBlock())) { return Style.AllowShortBlocksOnASingleLine != FormatStyle::SBS_Never ? tryMergeSimpleBlock(I, E, Limit) : 0; @@ -832,7 +833,8 @@ private: if (IsCtrlStmt(Line) || Line.First->isOneOf(tok::kw_try, tok::kw___try, tok::kw_catch, tok::kw___finally, tok::r_brace, - Keywords.kw___except)) { + Keywords.kw___except) || + Line.startsWithExportBlock()) { if (IsSplitBlock) return 0; // Don't merge when we can't except the case when diff --git clang/lib/Format/UnwrappedLineParser.cpp clang/lib/Format/UnwrappedLineParser.cpp index 317717241c17..425832913634 100644 --- clang/lib/Format/UnwrappedLineParser.cpp +++ clang/lib/Format/UnwrappedLineParser.cpp @@ -503,14 +503,14 @@ void UnwrappedLineParser::calculateBraceTypes(bool ExpectClassBody) { auto *NextTok = Tokens->getNextNonComment(); if (!Line->InMacroBody && !Style.isTableGen()) { - // Skip PPDirective lines and comments. + // Skip PPDirective lines (except macro definitions) and comments. while (NextTok->is(tok::hash)) { NextTok = Tokens->getNextToken(); - if (NextTok->is(tok::pp_not_keyword)) + if (NextTok->isOneOf(tok::pp_not_keyword, tok::pp_define)) break; do { NextTok = Tokens->getNextToken(); - } while (!NextTok->HasUnescapedNewline && NextTok->isNot(tok::eof)); + } while (NextTok->NewlinesBefore == 0 && NextTok->isNot(tok::eof)); while (NextTok->is(tok::comment)) NextTok = Tokens->getNextToken(); @@ -1625,6 +1625,10 @@ void UnwrappedLineParser::parseStructuralElement( parseNamespace(); return; } + if (FormatTok->is(tok::l_brace)) { + parseCppExportBlock(); + return; + } if (FormatTok->is(Keywords.kw_import) && parseModuleImport()) return; } @@ -3105,6 +3109,26 @@ void UnwrappedLineParser::parseTryCatch() { addUnwrappedLine(); } +void UnwrappedLineParser::parseNamespaceOrExportBlock(unsigned AddLevels) { + bool ManageWhitesmithsBraces = + AddLevels == 0u && Style.BreakBeforeBraces == FormatStyle::BS_Whitesmiths; + + // If we're in Whitesmiths mode, indent the brace if we're not indenting + // the whole block. + if (ManageWhitesmithsBraces) + ++Line->Level; + + // Munch the semicolon after the block. This is more common than one would + // think. Putting the semicolon into its own line is very ugly. + parseBlock(/*MustBeDeclaration=*/true, AddLevels, /*MunchSemi=*/true, + /*KeepBraces=*/true, /*IfKind=*/nullptr, ManageWhitesmithsBraces); + + addUnwrappedLine(AddLevels > 0 ? LineLevel::Remove : LineLevel::Keep); + + if (ManageWhitesmithsBraces) + --Line->Level; +} + void UnwrappedLineParser::parseNamespace() { assert(FormatTok->isOneOf(tok::kw_namespace, TT_NamespaceMacro) && "'namespace' expected"); @@ -3137,29 +3161,15 @@ void UnwrappedLineParser::parseNamespace() { DeclarationScopeStack.size() > 1) ? 
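// NOTE: Export blocks reuse the namespace machinery: parseNamespace() now
// delegates to parseNamespaceOrExportBlock(), and parseCppExportBlock()
// (defined just below) passes AddLevels=1 only when IndentExportBlock is
// set. With the new option, which defaults to true in the LLVM style:
//
//   export {            // IndentExportBlock: true
//     int f();
//   }
//
//   export {            // IndentExportBlock: false
//   int f();
//   }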
1u : 0u; - bool ManageWhitesmithsBraces = - AddLevels == 0u && - Style.BreakBeforeBraces == FormatStyle::BS_Whitesmiths; - - // If we're in Whitesmiths mode, indent the brace if we're not indenting - // the whole block. - if (ManageWhitesmithsBraces) - ++Line->Level; - - // Munch the semicolon after a namespace. This is more common than one would - // think. Putting the semicolon into its own line is very ugly. - parseBlock(/*MustBeDeclaration=*/true, AddLevels, /*MunchSemi=*/true, - /*KeepBraces=*/true, /*IfKind=*/nullptr, - ManageWhitesmithsBraces); - - addUnwrappedLine(AddLevels > 0 ? LineLevel::Remove : LineLevel::Keep); - - if (ManageWhitesmithsBraces) - --Line->Level; + parseNamespaceOrExportBlock(AddLevels); } // FIXME: Add error handling. } +void UnwrappedLineParser::parseCppExportBlock() { + parseNamespaceOrExportBlock(/*AddLevels=*/Style.IndentExportBlock ? 1 : 0); +} + void UnwrappedLineParser::parseNew() { assert(FormatTok->is(tok::kw_new) && "'new' expected"); nextToken(); diff --git clang/lib/Format/UnwrappedLineParser.h clang/lib/Format/UnwrappedLineParser.h index 8160d5e84186..08bff2748eb8 100644 --- clang/lib/Format/UnwrappedLineParser.h +++ clang/lib/Format/UnwrappedLineParser.h @@ -171,6 +171,8 @@ private: void parseRequiresClause(FormatToken *RequiresToken); void parseRequiresExpression(FormatToken *RequiresToken); void parseConstraintExpression(); + void parseCppExportBlock(); + void parseNamespaceOrExportBlock(unsigned AddLevels); void parseJavaEnumBody(); // Parses a record (aka class) as a top level element. If ParseAsExpr is true, // parses the record as a child block, i.e. if the class declaration is an diff --git clang/lib/Frontend/Rewrite/RewriteModernObjC.cpp clang/lib/Frontend/Rewrite/RewriteModernObjC.cpp index fc65559e9d4a..8f275536b98a 100644 --- clang/lib/Frontend/Rewrite/RewriteModernObjC.cpp +++ clang/lib/Frontend/Rewrite/RewriteModernObjC.cpp @@ -3800,8 +3800,8 @@ QualType RewriteModernObjC::GetGroupRecordTypeForObjCIvarBitfield(ObjCIvarDecl * const ObjCInterfaceDecl *CDecl = IV->getContainingInterface(); unsigned GroupNo = ObjCIvarBitfieldGroupNo(IV); std::pair<const ObjCInterfaceDecl*, unsigned> tuple = std::make_pair(CDecl, GroupNo); - if (GroupRecordType.count(tuple)) - return GroupRecordType[tuple]; + if (auto It = GroupRecordType.find(tuple); It != GroupRecordType.end()) + return It->second; SmallVector<ObjCIvarDecl *, 8> IVars; for (const ObjCIvarDecl *IVD = CDecl->all_declared_ivar_begin(); diff --git clang/lib/Headers/avx10_2_512minmaxintrin.h clang/lib/Headers/avx10_2_512minmaxintrin.h index e175365d11df..fbc7fbadbc6b 100644 --- clang/lib/Headers/avx10_2_512minmaxintrin.h +++ clang/lib/Headers/avx10_2_512minmaxintrin.h @@ -14,22 +14,22 @@ #ifndef __AVX10_2_512MINMAXINTRIN_H #define __AVX10_2_512MINMAXINTRIN_H -#define _mm512_minmaxne_pbh(A, B, C) \ - ((__m512bh)__builtin_ia32_vminmaxnepbf16512( \ - (__v32bf)(__m512bh)(A), (__v32bf)(__m512bh)(A), (int)(C))) +#define _mm512_minmax_pbh(A, B, C) \ + ((__m512bh)__builtin_ia32_vminmaxbf16512((__v32bf)(__m512bh)(A), \ + (__v32bf)(__m512bh)(A), (int)(C))) -#define _mm512_mask_minmaxne_pbh(W, U, A, B, C) \ +#define _mm512_mask_minmax_pbh(W, U, A, B, C) \ ((__m512bh)__builtin_ia32_selectpbf_512( \ (__mmask32)(U), \ - (__v32bf)_mm512_minmaxne_pbh((__v32bf)(__m512bh)(A), \ - (__v32bf)(__m512bh)(B), (int)(C)), \ + (__v32bf)_mm512_minmax_pbh((__v32bf)(__m512bh)(A), \ + (__v32bf)(__m512bh)(B), (int)(C)), \ (__v32bf)(__m512bh)(W))) -#define _mm512_maskz_minmaxne_pbh(U, A, B, C) \ +#define 
_mm512_maskz_minmax_pbh(U, A, B, C) \ ((__m512bh)__builtin_ia32_selectpbf_512( \ (__mmask32)(U), \ - (__v32bf)_mm512_minmaxne_pbh((__v32bf)(__m512bh)(A), \ - (__v32bf)(__m512bh)(B), (int)(C)), \ + (__v32bf)_mm512_minmax_pbh((__v32bf)(__m512bh)(A), \ + (__v32bf)(__m512bh)(B), (int)(C)), \ (__v32bf) __builtin_bit_cast(__m512bh, _mm512_setzero_ps()))) #define _mm512_minmax_pd(A, B, C) \ diff --git clang/lib/Headers/avx10_2minmaxintrin.h clang/lib/Headers/avx10_2minmaxintrin.h index a9367e742465..8164d49d89f1 100644 --- clang/lib/Headers/avx10_2minmaxintrin.h +++ clang/lib/Headers/avx10_2minmaxintrin.h @@ -14,40 +14,40 @@ #ifndef __AVX10_2MINMAXINTRIN_H #define __AVX10_2MINMAXINTRIN_H -#define _mm_minmaxne_pbh(A, B, C) \ - ((__m128bh)__builtin_ia32_vminmaxnepbf16128( \ - (__m128bh)(__v8bf)(A), (__m128bh)(__v8bf)(B), (int)(C))) +#define _mm_minmax_pbh(A, B, C) \ + ((__m128bh)__builtin_ia32_vminmaxbf16128((__m128bh)(__v8bf)(A), \ + (__m128bh)(__v8bf)(B), (int)(C))) -#define _mm_mask_minmaxne_pbh(W, U, A, B, C) \ +#define _mm_mask_minmax_pbh(W, U, A, B, C) \ ((__m128bh)__builtin_ia32_selectpbf_128( \ (__mmask8)(U), \ - (__v8bf)_mm_minmaxne_pbh((__m128bh)(__v8bf)(A), (__m128bh)(__v8bf)(B), \ - (int)(C)), \ + (__v8bf)_mm_minmax_pbh((__m128bh)(__v8bf)(A), (__m128bh)(__v8bf)(B), \ + (int)(C)), \ (__v8bf)(W))) -#define _mm_maskz_minmaxne_pbh(U, A, B, C) \ +#define _mm_maskz_minmax_pbh(U, A, B, C) \ ((__m128bh)__builtin_ia32_selectpbf_128( \ (__mmask8)(U), \ - (__v8bf)_mm_minmaxne_pbh((__m128bh)(__v8bf)(A), (__m128bh)(__v8bf)(B), \ - (int)(C)), \ + (__v8bf)_mm_minmax_pbh((__m128bh)(__v8bf)(A), (__m128bh)(__v8bf)(B), \ + (int)(C)), \ (__v8bf) __builtin_bit_cast(__m128bh, _mm_setzero_ps()))) -#define _mm256_minmaxne_pbh(A, B, C) \ - ((__m256bh)__builtin_ia32_vminmaxnepbf16256( \ - (__m256bh)(__v16bf)(A), (__m256bh)(__v16bf)(B), (int)(C))) +#define _mm256_minmax_pbh(A, B, C) \ + ((__m256bh)__builtin_ia32_vminmaxbf16256((__m256bh)(__v16bf)(A), \ + (__m256bh)(__v16bf)(B), (int)(C))) -#define _mm256_mask_minmaxne_pbh(W, U, A, B, C) \ +#define _mm256_mask_minmax_pbh(W, U, A, B, C) \ ((__m256bh)__builtin_ia32_selectpbf_256( \ (__mmask16)(U), \ - (__v16bf)_mm256_minmaxne_pbh((__m256bh)(__v16bf)(A), \ - (__m256bh)(__v16bf)(B), (int)(C)), \ + (__v16bf)_mm256_minmax_pbh((__m256bh)(__v16bf)(A), \ + (__m256bh)(__v16bf)(B), (int)(C)), \ (__v16bf)(W))) -#define _mm256_maskz_minmaxne_pbh(U, A, B, C) \ +#define _mm256_maskz_minmax_pbh(U, A, B, C) \ ((__m256bh)__builtin_ia32_selectpbf_256( \ (__mmask16)(U), \ - (__v16bf)_mm256_minmaxne_pbh((__m256bh)(__v16bf)(A), \ - (__m256bh)(__v16bf)(B), (int)(C)), \ + (__v16bf)_mm256_minmax_pbh((__m256bh)(__v16bf)(A), \ + (__m256bh)(__v16bf)(B), (int)(C)), \ (__v16bf) __builtin_bit_cast(__m256bh, _mm256_setzero_ps()))) #define _mm_minmax_pd(A, B, C) \ diff --git clang/lib/Headers/vecintrin.h clang/lib/Headers/vecintrin.h index c842edd6756f..a14c39f9f731 100644 --- clang/lib/Headers/vecintrin.h +++ clang/lib/Headers/vecintrin.h @@ -468,6 +468,27 @@ vec_perm(__vector __bool long long __a, __vector __bool long long __b, (__vector unsigned char)__a, (__vector unsigned char)__b, __c); } +static inline __ATTRS_o_ai __vector signed __int128 +vec_perm(__vector signed __int128 __a, __vector signed __int128 __b, + __vector unsigned char __c) { + return (__vector signed __int128)__builtin_s390_vperm( + (__vector unsigned char)__a, (__vector unsigned char)__b, __c); +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_perm(__vector unsigned __int128 __a, __vector unsigned __int128 
__b, + __vector unsigned char __c) { + return (__vector unsigned __int128)__builtin_s390_vperm( + (__vector unsigned char)__a, (__vector unsigned char)__b, __c); +} + +static inline __ATTRS_o_ai __vector __bool __int128 +vec_perm(__vector __bool __int128 __a, __vector __bool __int128 __b, + __vector unsigned char __c) { + return (__vector __bool __int128)__builtin_s390_vperm( + (__vector unsigned char)__a, (__vector unsigned char)__b, __c); +} + #if __ARCH__ >= 12 static inline __ATTRS_o_ai __vector float vec_perm(__vector float __a, __vector float __b, @@ -514,9 +535,19 @@ vec_permi(__vector double __a, __vector double __b, int __c) (__vector unsigned long long)(Y), \ (((Z) & 2) << 1) | ((Z) & 1))) +/*-- vec_bperm --------------------------------------------------------------*/ + +#if __ARCH__ >= 12 +static inline __ATTRS_ai __vector unsigned long long +vec_bperm(__vector unsigned __int128 __a, __vector unsigned char __b) { + return __builtin_s390_vbperm((__vector unsigned char)__a, __b); +} +#endif + /*-- vec_bperm_u128 ---------------------------------------------------------*/ #if __ARCH__ >= 12 +// This prototype is deprecated. static inline __ATTRS_ai __vector unsigned long long vec_bperm_u128(__vector unsigned char __a, __vector unsigned char __b) { return __builtin_s390_vbperm(__a, __b); @@ -558,6 +589,18 @@ vec_revb(__vector unsigned long long __vec) { return __builtin_s390_vlbrg(__vec); } +static inline __ATTRS_o_ai __vector signed __int128 +vec_revb(__vector signed __int128 __vec) { + return (__vector signed __int128) + __builtin_s390_vlbrq((unsigned __int128)__vec); +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_revb(__vector unsigned __int128 __vec) { + return (__vector unsigned __int128) + __builtin_s390_vlbrq((unsigned __int128)__vec); +} + #if __ARCH__ >= 12 static inline __ATTRS_o_ai __vector float vec_revb(__vector float __vec) { @@ -820,6 +863,46 @@ vec_sel(__vector unsigned long long __a, __vector unsigned long long __b, (~(__vector unsigned long long)__c & __a)); } +static inline __ATTRS_o_ai __vector signed __int128 +vec_sel(__vector signed __int128 __a, __vector signed __int128 __b, + __vector unsigned __int128 __c) { + return (((__vector signed __int128)__c & __b) | + (~(__vector signed __int128)__c & __a)); +} + +static inline __ATTRS_o_ai __vector signed __int128 +vec_sel(__vector signed __int128 __a, __vector signed __int128 __b, + __vector __bool __int128 __c) { + return (((__vector signed __int128)__c & __b) | + (~(__vector signed __int128)__c & __a)); +} + +static inline __ATTRS_o_ai __vector __bool __int128 +vec_sel(__vector __bool __int128 __a, __vector __bool __int128 __b, + __vector unsigned __int128 __c) { + return (((__vector __bool __int128)__c & __b) | + (~(__vector __bool __int128)__c & __a)); +} + +static inline __ATTRS_o_ai __vector __bool __int128 +vec_sel(__vector __bool __int128 __a, __vector __bool __int128 __b, + __vector __bool __int128 __c) { + return (__c & __b) | (~__c & __a); +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_sel(__vector unsigned __int128 __a, __vector unsigned __int128 __b, + __vector unsigned __int128 __c) { + return (__c & __b) | (~__c & __a); +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_sel(__vector unsigned __int128 __a, __vector unsigned __int128 __b, + __vector __bool __int128 __c) { + return (((__vector unsigned __int128)__c & __b) | + (~(__vector unsigned __int128)__c & __a)); +} + #if __ARCH__ >= 12 static inline __ATTRS_o_ai __vector float vec_sel(__vector 
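// NOTE: This patch fills in the 128-bit element variants across the header;
// most overloads (vec_perm, vec_sel, vec_revb, vec_splats, the comparisons)
// are unconditional, while others (e.g. vec_unpackh/vec_unpackl and
// vec_gen_element_masks_*) require __ARCH__ >= 15. A hedged usage sketch of
// the new overloads, using only functions declared in this header:
//
//   __vector unsigned __int128 a = vec_splats((unsigned __int128)1);
//   __vector unsigned __int128 b = vec_splats((unsigned __int128)2);
//   __vector __bool __int128 m = vec_cmpeq(a, b);    // all-false mask here
//   __vector unsigned __int128 r = vec_sel(a, b, m); // == a, mask is zero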
float __a, __vector float __b, __vector unsigned int __c) { @@ -1078,6 +1161,22 @@ vec_xl(long __offset, const unsigned long long *__ptr) { return V; } +static inline __ATTRS_o_ai __vector signed __int128 +vec_xl(long __offset, const signed __int128 *__ptr) { + __vector signed __int128 V; + __builtin_memcpy(&V, ((const char *)__ptr + __offset), + sizeof(__vector signed __int128)); + return V; +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_xl(long __offset, const unsigned __int128 *__ptr) { + __vector unsigned __int128 V; + __builtin_memcpy(&V, ((const char *)__ptr + __offset), + sizeof(__vector unsigned __int128)); + return V; +} + #if __ARCH__ >= 12 static inline __ATTRS_o_ai __vector float vec_xl(long __offset, const float *__ptr) { @@ -1294,6 +1393,22 @@ vec_xst(__vector unsigned long long __vec, long __offset, sizeof(__vector unsigned long long)); } +static inline __ATTRS_o_ai void +vec_xst(__vector signed __int128 __vec, long __offset, + signed __int128 *__ptr) { + __vector signed __int128 V = __vec; + __builtin_memcpy(((char *)__ptr + __offset), &V, + sizeof(__vector signed __int128)); +} + +static inline __ATTRS_o_ai void +vec_xst(__vector unsigned __int128 __vec, long __offset, + unsigned __int128 *__ptr) { + __vector unsigned __int128 V = __vec; + __builtin_memcpy(((char *)__ptr + __offset), &V, + sizeof(__vector unsigned __int128)); +} + #if __ARCH__ >= 12 static inline __ATTRS_o_ai void vec_xst(__vector float __vec, long __offset, float *__ptr) { @@ -1465,6 +1580,14 @@ extern __ATTRS_o __vector unsigned long long vec_load_bndry(const unsigned long long *__ptr, unsigned short __len) __constant_pow2_range(__len, 64, 4096); +extern __ATTRS_o __vector signed __int128 +vec_load_bndry(const signed __int128 *__ptr, unsigned short __len) + __constant_pow2_range(__len, 64, 4096); + +extern __ATTRS_o __vector unsigned __int128 +vec_load_bndry(const unsigned __int128 *__ptr, unsigned short __len) + __constant_pow2_range(__len, 64, 4096); + #if __ARCH__ >= 12 extern __ATTRS_o __vector float vec_load_bndry(const float *__ptr, unsigned short __len) @@ -1496,43 +1619,51 @@ vec_load_len(const unsigned char *__ptr, unsigned int __len) { return (__vector unsigned char)__builtin_s390_vll(__len, __ptr); } +// This prototype is deprecated. static inline __ATTRS_o_ai __vector signed short vec_load_len(const signed short *__ptr, unsigned int __len) { return (__vector signed short)__builtin_s390_vll(__len, __ptr); } +// This prototype is deprecated. static inline __ATTRS_o_ai __vector unsigned short vec_load_len(const unsigned short *__ptr, unsigned int __len) { return (__vector unsigned short)__builtin_s390_vll(__len, __ptr); } +// This prototype is deprecated. static inline __ATTRS_o_ai __vector signed int vec_load_len(const signed int *__ptr, unsigned int __len) { return (__vector signed int)__builtin_s390_vll(__len, __ptr); } +// This prototype is deprecated. static inline __ATTRS_o_ai __vector unsigned int vec_load_len(const unsigned int *__ptr, unsigned int __len) { return (__vector unsigned int)__builtin_s390_vll(__len, __ptr); } +// This prototype is deprecated. static inline __ATTRS_o_ai __vector signed long long vec_load_len(const signed long long *__ptr, unsigned int __len) { return (__vector signed long long)__builtin_s390_vll(__len, __ptr); } +// This prototype is deprecated. 
static inline __ATTRS_o_ai __vector unsigned long long vec_load_len(const unsigned long long *__ptr, unsigned int __len) { return (__vector unsigned long long)__builtin_s390_vll(__len, __ptr); } #if __ARCH__ >= 12 +// This prototype is deprecated. static inline __ATTRS_o_ai __vector float vec_load_len(const float *__ptr, unsigned int __len) { return (__vector float)__builtin_s390_vll(__len, __ptr); } #endif +// This prototype is deprecated. static inline __ATTRS_o_ai __vector double vec_load_len(const double *__ptr, unsigned int __len) { return (__vector double)__builtin_s390_vll(__len, __ptr); @@ -1541,7 +1672,12 @@ vec_load_len(const double *__ptr, unsigned int __len) { /*-- vec_load_len_r ---------------------------------------------------------*/ #if __ARCH__ >= 12 -static inline __ATTRS_ai __vector unsigned char +static inline __ATTRS_o_ai __vector signed char +vec_load_len_r(const signed char *__ptr, unsigned int __len) { + return (__vector signed char)__builtin_s390_vlrlr(__len, __ptr); +} + +static inline __ATTRS_o_ai __vector unsigned char vec_load_len_r(const unsigned char *__ptr, unsigned int __len) { return (__vector unsigned char)__builtin_s390_vlrlr(__len, __ptr); } @@ -1561,36 +1697,42 @@ vec_store_len(__vector unsigned char __vec, unsigned char *__ptr, __builtin_s390_vstl((__vector signed char)__vec, __len, __ptr); } +// This prototype is deprecated. static inline __ATTRS_o_ai void vec_store_len(__vector signed short __vec, signed short *__ptr, unsigned int __len) { __builtin_s390_vstl((__vector signed char)__vec, __len, __ptr); } +// This prototype is deprecated. static inline __ATTRS_o_ai void vec_store_len(__vector unsigned short __vec, unsigned short *__ptr, unsigned int __len) { __builtin_s390_vstl((__vector signed char)__vec, __len, __ptr); } +// This prototype is deprecated. static inline __ATTRS_o_ai void vec_store_len(__vector signed int __vec, signed int *__ptr, unsigned int __len) { __builtin_s390_vstl((__vector signed char)__vec, __len, __ptr); } +// This prototype is deprecated. static inline __ATTRS_o_ai void vec_store_len(__vector unsigned int __vec, unsigned int *__ptr, unsigned int __len) { __builtin_s390_vstl((__vector signed char)__vec, __len, __ptr); } +// This prototype is deprecated. static inline __ATTRS_o_ai void vec_store_len(__vector signed long long __vec, signed long long *__ptr, unsigned int __len) { __builtin_s390_vstl((__vector signed char)__vec, __len, __ptr); } +// This prototype is deprecated. static inline __ATTRS_o_ai void vec_store_len(__vector unsigned long long __vec, unsigned long long *__ptr, unsigned int __len) { @@ -1598,6 +1740,7 @@ vec_store_len(__vector unsigned long long __vec, unsigned long long *__ptr, } #if __ARCH__ >= 12 +// This prototype is deprecated. static inline __ATTRS_o_ai void vec_store_len(__vector float __vec, float *__ptr, unsigned int __len) { @@ -1605,6 +1748,7 @@ vec_store_len(__vector float __vec, float *__ptr, } #endif +// This prototype is deprecated. 
static inline __ATTRS_o_ai void vec_store_len(__vector double __vec, double *__ptr, unsigned int __len) { @@ -1614,7 +1758,13 @@ vec_store_len(__vector double __vec, double *__ptr, /*-- vec_store_len_r --------------------------------------------------------*/ #if __ARCH__ >= 12 -static inline __ATTRS_ai void +static inline __ATTRS_o_ai void +vec_store_len_r(__vector signed char __vec, signed char *__ptr, + unsigned int __len) { + __builtin_s390_vstrlr(__vec, __len, __ptr); +} + +static inline __ATTRS_o_ai void vec_store_len_r(__vector unsigned char __vec, unsigned char *__ptr, unsigned int __len) { __builtin_s390_vstrlr((__vector signed char)__vec, __len, __ptr); @@ -1711,6 +1861,35 @@ vec_genmasks_64(unsigned char __first, unsigned char __last) return (__vector unsigned long long)__value; } +/*-- vec_gen_element_masks_* ------------------------------------------------*/ + +#if __ARCH__ >= 15 +static inline __ATTRS_ai __vector unsigned char +vec_gen_element_masks_8(__vector unsigned short __mask) { + return __builtin_s390_vgemb(__mask); +} + +static inline __ATTRS_ai __vector unsigned short +vec_gen_element_masks_16(__vector unsigned char __mask) { + return __builtin_s390_vgemh(__mask); +} + +static inline __ATTRS_ai __vector unsigned int +vec_gen_element_masks_32(__vector unsigned char __mask) { + return __builtin_s390_vgemf(__mask); +} + +static inline __ATTRS_ai __vector unsigned long long +vec_gen_element_masks_64(__vector unsigned char __mask) { + return __builtin_s390_vgemg(__mask); +} + +static inline __ATTRS_ai __vector unsigned __int128 +vec_gen_element_masks_128(__vector unsigned char __mask) { + return (__vector unsigned __int128)__builtin_s390_vgemq(__mask); +} +#endif + /*-- vec_splat --------------------------------------------------------------*/ static inline __ATTRS_o_ai __vector signed char @@ -1894,6 +2073,16 @@ vec_splats(unsigned long long __scalar) { return (__vector unsigned long long)__scalar; } +static inline __ATTRS_o_ai __vector signed __int128 +vec_splats(signed __int128 __scalar) { + return (__vector signed __int128)__scalar; +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_splats(unsigned __int128 __scalar) { + return (__vector unsigned __int128)__scalar; +} + #if __ARCH__ >= 12 static inline __ATTRS_o_ai __vector float vec_splats(float __scalar) { @@ -2166,6 +2355,27 @@ vec_pack(__vector unsigned long long __a, __vector unsigned long long __b) { return (__vector unsigned int)(__ac[1], __ac[3], __bc[1], __bc[3]); } +static inline __ATTRS_o_ai __vector signed long long +vec_pack(__vector signed __int128 __a, __vector signed __int128 __b) { + __vector signed long long __ac = (__vector signed long long)__a; + __vector signed long long __bc = (__vector signed long long)__b; + return (__vector signed long long)(__ac[1], __bc[1]); +} + +static inline __ATTRS_o_ai __vector __bool long long +vec_pack(__vector __bool __int128 __a, __vector __bool __int128 __b) { + __vector __bool long long __ac = (__vector __bool long long)__a; + __vector __bool long long __bc = (__vector __bool long long)__b; + return (__vector __bool long long)(__ac[1], __bc[1]); +} + +static inline __ATTRS_o_ai __vector unsigned long long +vec_pack(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { + __vector unsigned long long __ac = (__vector unsigned long long)__a; + __vector unsigned long long __bc = (__vector unsigned long long)__b; + return (__vector unsigned long long)(__ac[1], __bc[1]); +} + /*-- vec_packs 
--------------------------------------------------------------*/ static inline __ATTRS_o_ai __vector signed char @@ -2344,6 +2554,24 @@ vec_unpackh(__vector unsigned int __a) { return __builtin_s390_vuplhf(__a); } +#if __ARCH__ >= 15 +static inline __ATTRS_o_ai __vector signed __int128 +vec_unpackh(__vector signed long long __a) { + return (__vector signed __int128)__builtin_s390_vuphg(__a); +} + +static inline __ATTRS_o_ai __vector __bool __int128 +vec_unpackh(__vector __bool long long __a) { + return ((__vector __bool __int128) + __builtin_s390_vuphg((__vector signed long long)__a)); +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_unpackh(__vector unsigned long long __a) { + return (__vector unsigned __int128)__builtin_s390_vuplhg(__a); +} +#endif + /*-- vec_unpackl ------------------------------------------------------------*/ static inline __ATTRS_o_ai __vector signed short @@ -2394,6 +2622,24 @@ vec_unpackl(__vector unsigned int __a) { return __builtin_s390_vupllf(__a); } +#if __ARCH__ >= 15 +static inline __ATTRS_o_ai __vector signed __int128 +vec_unpackl(__vector signed long long __a) { + return (__vector signed __int128)__builtin_s390_vuplg(__a); +} + +static inline __ATTRS_o_ai __vector __bool __int128 +vec_unpackl(__vector __bool long long __a) { + return ((__vector __bool __int128) + __builtin_s390_vuplg((__vector signed long long)__a)); +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_unpackl(__vector unsigned long long __a) { + return (__vector unsigned __int128)__builtin_s390_vupllg(__a); +} +#endif + /*-- vec_cmpeq --------------------------------------------------------------*/ static inline __ATTRS_o_ai __vector __bool char @@ -2456,6 +2702,21 @@ vec_cmpeq(__vector unsigned long long __a, __vector unsigned long long __b) { return (__vector __bool long long)(__a == __b); } +static inline __ATTRS_o_ai __vector __bool __int128 +vec_cmpeq(__vector __bool __int128 __a, __vector __bool __int128 __b) { + return (__vector __bool __int128)(__a == __b); +} + +static inline __ATTRS_o_ai __vector __bool __int128 +vec_cmpeq(__vector signed __int128 __a, __vector signed __int128 __b) { + return (__vector __bool __int128)(__a == __b); +} + +static inline __ATTRS_o_ai __vector __bool __int128 +vec_cmpeq(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { + return (__vector __bool __int128)(__a == __b); +} + #if __ARCH__ >= 12 static inline __ATTRS_o_ai __vector __bool int vec_cmpeq(__vector float __a, __vector float __b) { @@ -2510,6 +2771,16 @@ vec_cmpge(__vector unsigned long long __a, __vector unsigned long long __b) { return (__vector __bool long long)(__a >= __b); } +static inline __ATTRS_o_ai __vector __bool __int128 +vec_cmpge(__vector signed __int128 __a, __vector signed __int128 __b) { + return (__vector __bool __int128)(__a >= __b); +} + +static inline __ATTRS_o_ai __vector __bool __int128 +vec_cmpge(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { + return (__vector __bool __int128)(__a >= __b); +} + #if __ARCH__ >= 12 static inline __ATTRS_o_ai __vector __bool int vec_cmpge(__vector float __a, __vector float __b) { @@ -2564,6 +2835,16 @@ vec_cmpgt(__vector unsigned long long __a, __vector unsigned long long __b) { return (__vector __bool long long)(__a > __b); } +static inline __ATTRS_o_ai __vector __bool __int128 +vec_cmpgt(__vector signed __int128 __a, __vector signed __int128 __b) { + return (__vector __bool __int128)(__a > __b); +} + +static inline __ATTRS_o_ai __vector __bool __int128 +vec_cmpgt(__vector 
unsigned __int128 __a, __vector unsigned __int128 __b) { + return (__vector __bool __int128)(__a > __b); +} + #if __ARCH__ >= 12 static inline __ATTRS_o_ai __vector __bool int vec_cmpgt(__vector float __a, __vector float __b) { @@ -2618,6 +2899,16 @@ vec_cmple(__vector unsigned long long __a, __vector unsigned long long __b) { return (__vector __bool long long)(__a <= __b); } +static inline __ATTRS_o_ai __vector __bool __int128 +vec_cmple(__vector signed __int128 __a, __vector signed __int128 __b) { + return (__vector __bool __int128)(__a <= __b); +} + +static inline __ATTRS_o_ai __vector __bool __int128 +vec_cmple(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { + return (__vector __bool __int128)(__a <= __b); +} + #if __ARCH__ >= 12 static inline __ATTRS_o_ai __vector __bool int vec_cmple(__vector float __a, __vector float __b) { @@ -2672,6 +2963,16 @@ vec_cmplt(__vector unsigned long long __a, __vector unsigned long long __b) { return (__vector __bool long long)(__a < __b); } +static inline __ATTRS_o_ai __vector __bool __int128 +vec_cmplt(__vector signed __int128 __a, __vector signed __int128 __b) { + return (__vector __bool __int128)(__a < __b); +} + +static inline __ATTRS_o_ai __vector __bool __int128 +vec_cmplt(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { + return (__vector __bool __int128)(__a < __b); +} + #if __ARCH__ >= 12 static inline __ATTRS_o_ai __vector __bool int vec_cmplt(__vector float __a, __vector float __b) { @@ -2914,6 +3215,29 @@ vec_all_eq(__vector __bool long long __a, __vector __bool long long __b) { return __cc == 0; } +#if __ARCH__ >= 15 +static inline __ATTRS_o_ai int +vec_all_eq(__vector signed __int128 __a, __vector signed __int128 __b) { + int __cc; + __builtin_s390_vceqqs((unsigned __int128)__a, (unsigned __int128)__b, &__cc); + return __cc == 0; +} + +static inline __ATTRS_o_ai int +vec_all_eq(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { + int __cc; + __builtin_s390_vceqqs((unsigned __int128)__a, (unsigned __int128)__b, &__cc); + return __cc == 0; +} + +static inline __ATTRS_o_ai int +vec_all_eq(__vector __bool __int128 __a, __vector __bool __int128 __b) { + int __cc; + __builtin_s390_vceqqs((unsigned __int128)__a, (unsigned __int128)__b, &__cc); + return __cc == 0; +} +#endif + #if __ARCH__ >= 12 static inline __ATTRS_o_ai int vec_all_eq(__vector float __a, __vector float __b) { @@ -3161,6 +3485,29 @@ vec_all_ne(__vector __bool long long __a, __vector __bool long long __b) { return __cc == 3; } +#if __ARCH__ >= 15 +static inline __ATTRS_o_ai int +vec_all_ne(__vector signed __int128 __a, __vector signed __int128 __b) { + int __cc; + __builtin_s390_vceqqs((unsigned __int128)__a, (unsigned __int128)__b, &__cc); + return __cc == 3; +} + +static inline __ATTRS_o_ai int +vec_all_ne(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { + int __cc; + __builtin_s390_vceqqs((unsigned __int128)__a, (unsigned __int128)__b, &__cc); + return __cc == 3; +} + +static inline __ATTRS_o_ai int +vec_all_ne(__vector __bool __int128 __a, __vector __bool __int128 __b) { + int __cc; + __builtin_s390_vceqqs((unsigned __int128)__a, (unsigned __int128)__b, &__cc); + return __cc == 3; +} +#endif + #if __ARCH__ >= 12 static inline __ATTRS_o_ai int vec_all_ne(__vector float __a, __vector float __b) { @@ -3399,6 +3746,22 @@ vec_all_ge(__vector __bool long long __a, __vector __bool long long __b) { return __cc == 3; } +#if __ARCH__ >= 15 +static inline __ATTRS_o_ai int +vec_all_ge(__vector signed __int128 __a, 
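// NOTE: All of the new 128-bit predicates use the standard SystemZ vector
// condition-code idiom: the vceqqs/vchqs/vchlqs builtins set CC to 0 when
// the relation holds for every element, 3 when it holds for none, and 1 for
// a mix. Hence "all" tests __cc == 0 (or == 3 for a negated/swapped form)
// and "any" tests __cc <= 1 (or != 3 / != 0). Note also how >= and < are
// derived by swapping the operands of the greater-than builtin, as in
// vec_all_ge:
//
//   __builtin_s390_vchqs(__b, __a, &__cc);  // per-element "__b > __a"
//   return __cc == 3;                       // no lane has b > a => a >= b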
__vector signed __int128 __b) { + int __cc; + __builtin_s390_vchqs((signed __int128)__b, (signed __int128)__a, &__cc); + return __cc == 3; +} + +static inline __ATTRS_o_ai int +vec_all_ge(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { + int __cc; + __builtin_s390_vchlqs((unsigned __int128)__b, (unsigned __int128)__a, &__cc); + return __cc == 3; +} +#endif + #if __ARCH__ >= 12 static inline __ATTRS_o_ai int vec_all_ge(__vector float __a, __vector float __b) { @@ -3637,6 +4000,22 @@ vec_all_gt(__vector __bool long long __a, __vector __bool long long __b) { return __cc == 0; } +#if __ARCH__ >= 15 +static inline __ATTRS_o_ai int +vec_all_gt(__vector signed __int128 __a, __vector signed __int128 __b) { + int __cc; + __builtin_s390_vchqs((signed __int128)__a, (signed __int128)__b, &__cc); + return __cc == 0; +} + +static inline __ATTRS_o_ai int +vec_all_gt(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { + int __cc; + __builtin_s390_vchlqs((unsigned __int128)__a, (unsigned __int128)__b, &__cc); + return __cc == 0; +} +#endif + #if __ARCH__ >= 12 static inline __ATTRS_o_ai int vec_all_gt(__vector float __a, __vector float __b) { @@ -3875,6 +4254,22 @@ vec_all_le(__vector __bool long long __a, __vector __bool long long __b) { return __cc == 3; } +#if __ARCH__ >= 15 +static inline __ATTRS_o_ai int +vec_all_le(__vector signed __int128 __a, __vector signed __int128 __b) { + int __cc; + __builtin_s390_vchqs((signed __int128)__a, (signed __int128)__b, &__cc); + return __cc == 3; +} + +static inline __ATTRS_o_ai int +vec_all_le(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { + int __cc; + __builtin_s390_vchlqs((unsigned __int128)__a, (unsigned __int128)__b, &__cc); + return __cc == 3; +} +#endif + #if __ARCH__ >= 12 static inline __ATTRS_o_ai int vec_all_le(__vector float __a, __vector float __b) { @@ -4113,6 +4508,22 @@ vec_all_lt(__vector __bool long long __a, __vector __bool long long __b) { return __cc == 0; } +#if __ARCH__ >= 15 +static inline __ATTRS_o_ai int +vec_all_lt(__vector signed __int128 __a, __vector signed __int128 __b) { + int __cc; + __builtin_s390_vchqs((signed __int128)__b, (signed __int128)__a, &__cc); + return __cc == 0; +} + +static inline __ATTRS_o_ai int +vec_all_lt(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { + int __cc; + __builtin_s390_vchlqs((unsigned __int128)__b, (unsigned __int128)__a, &__cc); + return __cc == 0; +} +#endif + #if __ARCH__ >= 12 static inline __ATTRS_o_ai int vec_all_lt(__vector float __a, __vector float __b) { @@ -4467,6 +4878,29 @@ vec_any_eq(__vector __bool long long __a, __vector __bool long long __b) { return __cc <= 1; } +#if __ARCH__ >= 15 +static inline __ATTRS_o_ai int +vec_any_eq(__vector signed __int128 __a, __vector signed __int128 __b) { + int __cc; + __builtin_s390_vceqqs((unsigned __int128)__a, (unsigned __int128)__b, &__cc); + return __cc <= 1; +} + +static inline __ATTRS_o_ai int +vec_any_eq(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { + int __cc; + __builtin_s390_vceqqs((unsigned __int128)__a, (unsigned __int128)__b, &__cc); + return __cc <= 1; +} + +static inline __ATTRS_o_ai int +vec_any_eq(__vector __bool __int128 __a, __vector __bool __int128 __b) { + int __cc; + __builtin_s390_vceqqs((unsigned __int128)__a, (unsigned __int128)__b, &__cc); + return __cc <= 1; +} +#endif + #if __ARCH__ >= 12 static inline __ATTRS_o_ai int vec_any_eq(__vector float __a, __vector float __b) { @@ -4713,28 +5147,51 @@ vec_any_ne(__vector __bool long long __a, 
__vector __bool long long __b) { return __cc != 0; } -#if __ARCH__ >= 12 +#if __ARCH__ >= 15 static inline __ATTRS_o_ai int -vec_any_ne(__vector float __a, __vector float __b) { +vec_any_ne(__vector signed __int128 __a, __vector signed __int128 __b) { int __cc; - __builtin_s390_vfcesbs(__a, __b, &__cc); + __builtin_s390_vceqqs((unsigned __int128)__a, (unsigned __int128)__b, &__cc); return __cc != 0; } -#endif static inline __ATTRS_o_ai int -vec_any_ne(__vector double __a, __vector double __b) { +vec_any_ne(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { int __cc; - __builtin_s390_vfcedbs(__a, __b, &__cc); + __builtin_s390_vceqqs((unsigned __int128)__a, (unsigned __int128)__b, &__cc); return __cc != 0; } -/*-- vec_any_ge -------------------------------------------------------------*/ - static inline __ATTRS_o_ai int -vec_any_ge(__vector signed char __a, __vector signed char __b) { +vec_any_ne(__vector __bool __int128 __a, __vector __bool __int128 __b) { int __cc; - __builtin_s390_vchbs(__b, __a, &__cc); + __builtin_s390_vceqqs((unsigned __int128)__a, (unsigned __int128)__b, &__cc); + return __cc != 0; +} +#endif + +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_any_ne(__vector float __a, __vector float __b) { + int __cc; + __builtin_s390_vfcesbs(__a, __b, &__cc); + return __cc != 0; +} +#endif + +static inline __ATTRS_o_ai int +vec_any_ne(__vector double __a, __vector double __b) { + int __cc; + __builtin_s390_vfcedbs(__a, __b, &__cc); + return __cc != 0; +} + +/*-- vec_any_ge -------------------------------------------------------------*/ + +static inline __ATTRS_o_ai int +vec_any_ge(__vector signed char __a, __vector signed char __b) { + int __cc; + __builtin_s390_vchbs(__b, __a, &__cc); return __cc != 0; } @@ -4951,6 +5408,22 @@ vec_any_ge(__vector __bool long long __a, __vector __bool long long __b) { return __cc != 0; } +#if __ARCH__ >= 15 +static inline __ATTRS_o_ai int +vec_any_ge(__vector signed __int128 __a, __vector signed __int128 __b) { + int __cc; + __builtin_s390_vchqs((signed __int128)__b, (signed __int128)__a, &__cc); + return __cc != 0; +} + +static inline __ATTRS_o_ai int +vec_any_ge(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { + int __cc; + __builtin_s390_vchlqs((unsigned __int128)__b, (unsigned __int128)__a, &__cc); + return __cc != 0; +} +#endif + #if __ARCH__ >= 12 static inline __ATTRS_o_ai int vec_any_ge(__vector float __a, __vector float __b) { @@ -5189,6 +5662,22 @@ vec_any_gt(__vector __bool long long __a, __vector __bool long long __b) { return __cc <= 1; } +#if __ARCH__ >= 15 +static inline __ATTRS_o_ai int +vec_any_gt(__vector signed __int128 __a, __vector signed __int128 __b) { + int __cc; + __builtin_s390_vchqs((signed __int128)__a, (signed __int128)__b, &__cc); + return __cc <= 1; +} + +static inline __ATTRS_o_ai int +vec_any_gt(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { + int __cc; + __builtin_s390_vchlqs((unsigned __int128)__a, (unsigned __int128)__b, &__cc); + return __cc <= 1; +} +#endif + #if __ARCH__ >= 12 static inline __ATTRS_o_ai int vec_any_gt(__vector float __a, __vector float __b) { @@ -5427,6 +5916,22 @@ vec_any_le(__vector __bool long long __a, __vector __bool long long __b) { return __cc != 0; } +#if __ARCH__ >= 15 +static inline __ATTRS_o_ai int +vec_any_le(__vector signed __int128 __a, __vector signed __int128 __b) { + int __cc; + __builtin_s390_vchqs((signed __int128)__a, (signed __int128)__b, &__cc); + return __cc != 0; +} + +static inline __ATTRS_o_ai int 
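For context, the new arch15 predicates above behave like the existing element-wise ones, returning an int condition rather than a mask. A minimal usage sketch (not part of the patch; the function name cmp128_demo is illustrative, and an arch15 target with vecintrin.h included is assumed):

#include <vecintrin.h>

int cmp128_demo(__vector signed __int128 x, __vector signed __int128 y) {
  // vec_all_* / vec_any_* yield a scalar condition, not a vector mask.
  if (vec_all_eq(x, y))
    return 0;                        // the (single) element compares equal
  return vec_any_gt(x, y) ? 1 : -1;  // some element compares greater
}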
+static inline __ATTRS_o_ai int
+vec_any_le(__vector unsigned __int128 __a, __vector unsigned __int128 __b) {
+  int __cc;
+  __builtin_s390_vchlqs((unsigned __int128)__a, (unsigned __int128)__b, &__cc);
+  return __cc != 0;
+}
+#endif
+
 #if __ARCH__ >= 12
 static inline __ATTRS_o_ai int
 vec_any_le(__vector float __a, __vector float __b) {
@@ -5665,6 +6170,22 @@ vec_any_lt(__vector __bool long long __a, __vector __bool long long __b) {
   return __cc <= 1;
 }
 
+#if __ARCH__ >= 15
+static inline __ATTRS_o_ai int
+vec_any_lt(__vector signed __int128 __a, __vector signed __int128 __b) {
+  int __cc;
+  __builtin_s390_vchqs((signed __int128)__b, (signed __int128)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(__vector unsigned __int128 __a, __vector unsigned __int128 __b) {
+  int __cc;
+  __builtin_s390_vchlqs((unsigned __int128)__b, (unsigned __int128)__a, &__cc);
+  return __cc <= 1;
+}
+#endif
+
 #if __ARCH__ >= 12
 static inline __ATTRS_o_ai int
 vec_any_lt(__vector float __a, __vector float __b) {
@@ -5753,40 +6274,419 @@ vec_any_nlt(__vector double __a, __vector double __b) {
   return __cc != 0;
 }
 
-/*-- vec_any_nan ------------------------------------------------------------*/
+/*-- vec_any_nan ------------------------------------------------------------*/
+
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai int
+vec_any_nan(__vector float __a) {
+  int __cc;
+  __builtin_s390_vftcisb(__a, 15, &__cc);
+  return __cc != 3;
+}
+#endif
+
+static inline __ATTRS_o_ai int
+vec_any_nan(__vector double __a) {
+  int __cc;
+  __builtin_s390_vftcidb(__a, 15, &__cc);
+  return __cc != 3;
+}
+
+/*-- vec_any_numeric --------------------------------------------------------*/
+
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai int
+vec_any_numeric(__vector float __a) {
+  int __cc;
+  __builtin_s390_vftcisb(__a, 15, &__cc);
+  return __cc != 0;
+}
+#endif
+
+static inline __ATTRS_o_ai int
+vec_any_numeric(__vector double __a) {
+  int __cc;
+  __builtin_s390_vftcidb(__a, 15, &__cc);
+  return __cc != 0;
+}
+
+/*-- vec_blend --------------------------------------------------------------*/
+
+#if __ARCH__ >= 15
+static inline __ATTRS_o_ai __vector signed char
+vec_blend(__vector signed char __a, __vector signed char __b,
+          __vector signed char __c) {
+  return vec_sel(__a, __b, vec_cmplt(__c, (__vector signed char)0));
+}
+
+static inline __ATTRS_o_ai __vector __bool char
+vec_blend(__vector __bool char __a, __vector __bool char __b,
+          __vector signed char __c) {
+  return vec_sel(__a, __b, vec_cmplt(__c, (__vector signed char)0));
+}
+
+static inline __ATTRS_o_ai __vector unsigned char
+vec_blend(__vector unsigned char __a, __vector unsigned char __b,
+          __vector signed char __c) {
+  return vec_sel(__a, __b, vec_cmplt(__c, (__vector signed char)0));
+}
+
+static inline __ATTRS_o_ai __vector signed short
+vec_blend(__vector signed short __a, __vector signed short __b,
+          __vector signed short __c) {
+  return vec_sel(__a, __b, vec_cmplt(__c, (__vector signed short)0));
+}
+
+static inline __ATTRS_o_ai __vector __bool short
+vec_blend(__vector __bool short __a, __vector __bool short __b,
+          __vector signed short __c) {
+  return vec_sel(__a, __b, vec_cmplt(__c, (__vector signed short)0));
+}
+
+static inline __ATTRS_o_ai __vector unsigned short
+vec_blend(__vector unsigned short __a, __vector unsigned short __b,
+          __vector signed short __c) {
+  return vec_sel(__a, __b, vec_cmplt(__c, (__vector signed short)0));
+}
+
+static inline __ATTRS_o_ai __vector signed int
+vec_blend(__vector signed int __a, __vector signed int __b,
+          __vector signed int __c) {
+  return vec_sel(__a, __b, vec_cmplt(__c, (__vector signed int)0));
+}
+
+static inline __ATTRS_o_ai __vector __bool int
+vec_blend(__vector __bool int __a, __vector __bool int __b,
+          __vector signed int __c) {
+  return vec_sel(__a, __b, vec_cmplt(__c, (__vector signed int)0));
+}
+
+static inline __ATTRS_o_ai __vector unsigned int
+vec_blend(__vector unsigned int __a, __vector unsigned int __b,
+          __vector signed int __c) {
+  return vec_sel(__a, __b, vec_cmplt(__c, (__vector signed int)0));
+}
+
+static inline __ATTRS_o_ai __vector signed long long
+vec_blend(__vector signed long long __a, __vector signed long long __b,
+          __vector signed long long __c) {
+  return vec_sel(__a, __b, vec_cmplt(__c, (__vector signed long long)0));
+}
+
+static inline __ATTRS_o_ai __vector __bool long long
+vec_blend(__vector __bool long long __a, __vector __bool long long __b,
+          __vector signed long long __c) {
+  return vec_sel(__a, __b, vec_cmplt(__c, (__vector signed long long)0));
+}
+
+static inline __ATTRS_o_ai __vector unsigned long long
+vec_blend(__vector unsigned long long __a, __vector unsigned long long __b,
+          __vector signed long long __c) {
+  return vec_sel(__a, __b, vec_cmplt(__c, (__vector signed long long)0));
+}
+
+static inline __ATTRS_o_ai __vector signed __int128
+vec_blend(__vector signed __int128 __a, __vector signed __int128 __b,
+          __vector signed __int128 __c) {
+  return vec_sel(__a, __b, vec_cmplt(__c, (__vector signed __int128)0));
+}
+
+static inline __ATTRS_o_ai __vector __bool __int128
+vec_blend(__vector __bool __int128 __a, __vector __bool __int128 __b,
+          __vector signed __int128 __c) {
+  return vec_sel(__a, __b, vec_cmplt(__c, (__vector signed __int128)0));
+}
+
+static inline __ATTRS_o_ai __vector unsigned __int128
+vec_blend(__vector unsigned __int128 __a, __vector unsigned __int128 __b,
+          __vector signed __int128 __c) {
+  return vec_sel(__a, __b, vec_cmplt(__c, (__vector signed __int128)0));
+}
+
+static inline __ATTRS_o_ai __vector float
+vec_blend(__vector float __a, __vector float __b,
+          __vector signed int __c) {
+  return vec_sel(__a, __b, vec_cmplt(__c, (__vector signed int)0));
+}
+
+static inline __ATTRS_o_ai __vector double
+vec_blend(__vector double __a, __vector double __b,
+          __vector signed long long __c) {
+  return vec_sel(__a, __b, vec_cmplt(__c, (__vector signed long long)0));
+}
+#endif
+
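A short sketch of what the new vec_blend computes: it selects from __b wherever the sign bit of the corresponding control element is set, as the vec_sel/vec_cmplt bodies above show (note, not part of the patch; the helper name blend_demo is illustrative):

// Picks b where ctrl is negative (sign bit set), a otherwise.
__vector signed int blend_demo(__vector signed int a, __vector signed int b,
                               __vector signed int ctrl) {
  return vec_blend(a, b, ctrl);
}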
+/*-- vec_and ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai __vector __bool char
+vec_and(__vector __bool char __a, __vector __bool char __b) {
+  return __a & __b;
+}
+
+static inline __ATTRS_o_ai __vector signed char
+vec_and(__vector signed char __a, __vector signed char __b) {
+  return __a & __b;
+}
+
+static inline __ATTRS_o_ai __vector unsigned char
+vec_and(__vector unsigned char __a, __vector unsigned char __b) {
+  return __a & __b;
+}
+
+static inline __ATTRS_o_ai __vector __bool short
+vec_and(__vector __bool short __a, __vector __bool short __b) {
+  return __a & __b;
+}
+
+static inline __ATTRS_o_ai __vector signed short
+vec_and(__vector signed short __a, __vector signed short __b) {
+  return __a & __b;
+}
+
+static inline __ATTRS_o_ai __vector unsigned short
+vec_and(__vector unsigned short __a, __vector unsigned short __b) {
+  return __a & __b;
+}
+
+static inline __ATTRS_o_ai __vector __bool int
+vec_and(__vector __bool int __a, __vector __bool int __b) {
+  return __a & __b;
+}
+
+static inline __ATTRS_o_ai __vector signed int
+vec_and(__vector signed int __a, __vector signed int __b) {
+  return __a & __b;
+}
+
+static inline __ATTRS_o_ai __vector unsigned int
+vec_and(__vector unsigned int __a, __vector unsigned int __b) {
+  return __a & __b;
+}
+
+static inline __ATTRS_o_ai __vector __bool long long
+vec_and(__vector __bool long long __a, __vector __bool long long __b) {
+  return __a & __b;
+}
+
+static inline __ATTRS_o_ai __vector signed long long
+vec_and(__vector signed long long __a, __vector signed long long __b) {
+  return __a & __b;
+}
+
+static inline __ATTRS_o_ai __vector unsigned long long
+vec_and(__vector unsigned long long __a, __vector unsigned long long __b) {
+  return __a & __b;
+}
+
+static inline __ATTRS_o_ai __vector __bool __int128
+vec_and(__vector __bool __int128 __a, __vector __bool __int128 __b) {
+  return __a & __b;
+}
+
+static inline __ATTRS_o_ai __vector signed __int128
+vec_and(__vector signed __int128 __a, __vector signed __int128 __b) {
+  return __a & __b;
+}
+
+static inline __ATTRS_o_ai __vector unsigned __int128
+vec_and(__vector unsigned __int128 __a, __vector unsigned __int128 __b) {
+  return __a & __b;
+}
+
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai __vector float
+vec_and(__vector float __a, __vector float __b) {
+  return (__vector float)((__vector unsigned int)__a &
+                          (__vector unsigned int)__b);
+}
+#endif
+
+static inline __ATTRS_o_ai __vector double
+vec_and(__vector double __a, __vector double __b) {
+  return (__vector double)((__vector unsigned long long)__a &
+                           (__vector unsigned long long)__b);
+}
+
+/*-- vec_or ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai __vector __bool char
+vec_or(__vector __bool char __a, __vector __bool char __b) {
+  return __a | __b;
+}
+
+static inline __ATTRS_o_ai __vector signed char
+vec_or(__vector signed char __a, __vector signed char __b) {
+  return __a | __b;
+}
+
+static inline __ATTRS_o_ai __vector unsigned char
+vec_or(__vector unsigned char __a, __vector unsigned char __b) {
+  return __a | __b;
+}
+
+static inline __ATTRS_o_ai __vector __bool short
+vec_or(__vector __bool short __a, __vector __bool short __b) {
+  return __a | __b;
+}
+
+static inline __ATTRS_o_ai __vector signed short
+vec_or(__vector signed short __a, __vector signed short __b) {
+  return __a | __b;
+}
+
+static inline __ATTRS_o_ai __vector unsigned short
+vec_or(__vector unsigned short __a, __vector unsigned short __b) {
+  return __a | __b;
+}
+
+static inline __ATTRS_o_ai __vector __bool int
+vec_or(__vector __bool int __a, __vector __bool int __b) {
+  return __a | __b;
+}
+
+static inline __ATTRS_o_ai __vector signed int
+vec_or(__vector signed int __a, __vector signed int __b) {
+  return __a | __b;
+}
+
+static inline __ATTRS_o_ai __vector unsigned int
+vec_or(__vector unsigned int __a, __vector unsigned int __b) {
+  return __a | __b;
+}
+
+static inline __ATTRS_o_ai __vector __bool long long
+vec_or(__vector __bool long long __a, __vector __bool long long __b) {
+  return __a | __b;
+}
+
+static inline __ATTRS_o_ai __vector signed long long
+vec_or(__vector signed long long __a, __vector signed long long __b) {
+  return __a | __b;
+}
+
+static inline __ATTRS_o_ai __vector unsigned long long
+vec_or(__vector unsigned long long __a, __vector unsigned long long __b) {
+  return __a | __b;
+}
+
+static inline __ATTRS_o_ai __vector __bool __int128
+vec_or(__vector __bool __int128 __a, __vector __bool __int128 __b) {
+  return __a | __b;
+}
+
+static inline __ATTRS_o_ai __vector signed __int128
+vec_or(__vector signed __int128 __a, __vector signed __int128 __b) {
+  return __a | __b;
+}
+
+static inline __ATTRS_o_ai __vector unsigned __int128
+vec_or(__vector unsigned __int128 __a, __vector unsigned __int128 __b) {
+  return __a | __b;
+}
+
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai __vector float
+vec_or(__vector float __a, __vector float __b) {
+  return (__vector float)((__vector unsigned int)__a |
+                          (__vector unsigned int)__b);
+}
+#endif
+
+static inline __ATTRS_o_ai __vector double
+vec_or(__vector double __a, __vector double __b) {
+  return (__vector double)((__vector unsigned long long)__a |
+                           (__vector unsigned long long)__b);
+}
+
+/*-- vec_xor ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai __vector __bool char
+vec_xor(__vector __bool char __a, __vector __bool char __b) {
+  return __a ^ __b;
+}
+
+static inline __ATTRS_o_ai __vector signed char
+vec_xor(__vector signed char __a, __vector signed char __b) {
+  return __a ^ __b;
+}
+
+static inline __ATTRS_o_ai __vector unsigned char
+vec_xor(__vector unsigned char __a, __vector unsigned char __b) {
+  return __a ^ __b;
+}
+
+static inline __ATTRS_o_ai __vector __bool short
+vec_xor(__vector __bool short __a, __vector __bool short __b) {
+  return __a ^ __b;
+}
+
+static inline __ATTRS_o_ai __vector signed short
+vec_xor(__vector signed short __a, __vector signed short __b) {
+  return __a ^ __b;
+}
+
+static inline __ATTRS_o_ai __vector unsigned short
+vec_xor(__vector unsigned short __a, __vector unsigned short __b) {
+  return __a ^ __b;
+}
+
+static inline __ATTRS_o_ai __vector __bool int
+vec_xor(__vector __bool int __a, __vector __bool int __b) {
+  return __a ^ __b;
+}
+
+static inline __ATTRS_o_ai __vector signed int
+vec_xor(__vector signed int __a, __vector signed int __b) {
+  return __a ^ __b;
+}
+
+static inline __ATTRS_o_ai __vector unsigned int
+vec_xor(__vector unsigned int __a, __vector unsigned int __b) {
+  return __a ^ __b;
+}
+
+static inline __ATTRS_o_ai __vector __bool long long
+vec_xor(__vector __bool long long __a, __vector __bool long long __b) {
+  return __a ^ __b;
+}
+
+static inline __ATTRS_o_ai __vector signed long long
+vec_xor(__vector signed long long __a, __vector signed long long __b) {
+  return __a ^ __b;
+}
+
+static inline __ATTRS_o_ai __vector unsigned long long
+vec_xor(__vector unsigned long long __a, __vector unsigned long long __b) {
+  return __a ^ __b;
+}
 
-#if __ARCH__ >= 12
-static inline __ATTRS_o_ai int
-vec_any_nan(__vector float __a) {
-  int __cc;
-  __builtin_s390_vftcisb(__a, 15, &__cc);
-  return __cc != 3;
+static inline __ATTRS_o_ai __vector __bool __int128
+vec_xor(__vector __bool __int128 __a, __vector __bool __int128 __b) {
+  return __a ^ __b;
 }
-#endif
 
-static inline __ATTRS_o_ai int
-vec_any_nan(__vector double __a) {
-  int __cc;
-  __builtin_s390_vftcidb(__a, 15, &__cc);
-  return __cc != 3;
+static inline __ATTRS_o_ai __vector signed __int128
+vec_xor(__vector signed __int128 __a, __vector signed __int128 __b) {
+  return __a ^ __b;
 }
 
-/*-- vec_any_numeric --------------------------------------------------------*/
+static inline __ATTRS_o_ai __vector unsigned __int128
+vec_xor(__vector unsigned __int128 __a, __vector unsigned __int128 __b) {
+  return __a ^ __b;
+}
 
 #if __ARCH__ >= 12
-static inline __ATTRS_o_ai int
-vec_any_numeric(__vector float __a) {
-  int __cc;
-  __builtin_s390_vftcisb(__a, 15, &__cc);
-  return __cc != 0;
+static inline __ATTRS_o_ai __vector float
+vec_xor(__vector float __a, __vector float __b) {
+  return (__vector float)((__vector unsigned int)__a ^
+                          (__vector unsigned int)__b);
}
 #endif
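With these new overloads, the named bitwise operations and the plain infix operators are interchangeable on __int128 element types. A small equivalence sketch (not part of the patch; the helper mask_demo is illustrative):

__vector unsigned __int128 mask_demo(__vector unsigned __int128 a,
                                     __vector unsigned __int128 b) {
  __vector unsigned __int128 x = vec_xor(a, b);  // same as a ^ b
  return vec_and(x, vec_or(a, b));               // same as x & (a | b)
}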
-static inline __ATTRS_o_ai int
-vec_any_numeric(__vector double __a) {
-  int __cc;
-  __builtin_s390_vftcidb(__a, 15, &__cc);
-  return __cc != 0;
+static inline __ATTRS_o_ai __vector double
+vec_xor(__vector double __a, __vector double __b) {
+  return (__vector double)((__vector unsigned long long)__a ^
+                           (__vector unsigned long long)__b);
 }
 
 /*-- vec_andc ---------------------------------------------------------------*/
@@ -5947,6 +6847,21 @@ vec_andc(__vector unsigned long long __a, __vector __bool long long __b) {
   return __a & ~__b;
 }
 
+static inline __ATTRS_o_ai __vector __bool __int128
+vec_andc(__vector __bool __int128 __a, __vector __bool __int128 __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai __vector signed __int128
+vec_andc(__vector signed __int128 __a, __vector signed __int128 __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai __vector unsigned __int128
+vec_andc(__vector unsigned __int128 __a, __vector unsigned __int128 __b) {
+  return __a & ~__b;
+}
+
 #if __ARCH__ >= 12
 static inline __ATTRS_o_ai __vector float
 vec_andc(__vector float __a, __vector float __b) {
@@ -6133,6 +7048,21 @@ vec_nor(__vector unsigned long long __a, __vector __bool long long __b) {
   return ~(__a | __b);
 }
 
+static inline __ATTRS_o_ai __vector __bool __int128
+vec_nor(__vector __bool __int128 __a, __vector __bool __int128 __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai __vector signed __int128
+vec_nor(__vector signed __int128 __a, __vector signed __int128 __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai __vector unsigned __int128
+vec_nor(__vector unsigned __int128 __a, __vector unsigned __int128 __b) {
+  return ~(__a | __b);
+}
+
 #if __ARCH__ >= 12
 static inline __ATTRS_o_ai __vector float
 vec_nor(__vector float __a, __vector float __b) {
@@ -6224,6 +7154,21 @@ vec_orc(__vector unsigned long long __a, __vector unsigned long long __b) {
   return __a | ~__b;
 }
 
+static inline __ATTRS_o_ai __vector __bool __int128
+vec_orc(__vector __bool __int128 __a, __vector __bool __int128 __b) {
+  return __a | ~__b;
+}
+
+static inline __ATTRS_o_ai __vector signed __int128
+vec_orc(__vector signed __int128 __a, __vector signed __int128 __b) {
+  return __a | ~__b;
+}
+
+static inline __ATTRS_o_ai __vector unsigned __int128
+vec_orc(__vector unsigned __int128 __a, __vector unsigned __int128 __b) {
+  return __a | ~__b;
+}
+
 static inline __ATTRS_o_ai __vector float
 vec_orc(__vector float __a, __vector float __b) {
   return (__vector float)((__vector unsigned int)__a |
@@ -6300,6 +7245,21 @@ vec_nand(__vector unsigned long long __a, __vector unsigned long long __b) {
   return ~(__a & __b);
 }
 
+static inline __ATTRS_o_ai __vector __bool __int128
+vec_nand(__vector __bool __int128 __a, __vector __bool __int128 __b) {
+  return ~(__a & __b);
+}
+
+static inline __ATTRS_o_ai __vector signed __int128
+vec_nand(__vector signed __int128 __a, __vector signed __int128 __b) {
+  return ~(__a & __b);
+}
+
+static inline __ATTRS_o_ai __vector unsigned __int128
+vec_nand(__vector unsigned __int128 __a, __vector unsigned __int128 __b) {
+  return ~(__a & __b);
+}
+
 static inline __ATTRS_o_ai __vector float
 vec_nand(__vector float __a, __vector float __b) {
   return (__vector float)~((__vector unsigned int)__a &
@@ -6376,6 +7336,21 @@ vec_eqv(__vector unsigned long long __a, __vector unsigned long long __b) {
   return ~(__a ^ __b);
 }
 
+static inline __ATTRS_o_ai __vector __bool __int128
+vec_eqv(__vector __bool __int128 __a, __vector __bool __int128 __b) {
+  return ~(__a ^ __b);
+}
+
+static inline __ATTRS_o_ai __vector signed __int128
+vec_eqv(__vector signed __int128 __a, __vector signed __int128 __b) {
+  return ~(__a ^ __b);
+}
+
+static inline __ATTRS_o_ai __vector unsigned __int128
+vec_eqv(__vector unsigned __int128 __a, __vector unsigned __int128 __b) {
+  return ~(__a ^ __b);
+}
+
 static inline __ATTRS_o_ai __vector float
 vec_eqv(__vector float __a, __vector float __b) {
   return (__vector float)~((__vector unsigned int)__a ^
@@ -6389,6 +7364,91 @@ vec_eqv(__vector double __a, __vector double __b) {
 }
 #endif
 
+/*-- vec_evaluate -----------------------------------------------------------*/
+
+#if __ARCH__ >= 15
+extern __ATTRS_o __vector signed char
+vec_evaluate(__vector signed char __a, __vector signed char __b,
+             __vector signed char __c, unsigned char __d)
+  __constant(__d);
+
+extern __ATTRS_o __vector unsigned char
+vec_evaluate(__vector unsigned char __a, __vector unsigned char __b,
+             __vector unsigned char __c, unsigned char __d)
+  __constant(__d);
+
+extern __ATTRS_o __vector __bool char
+vec_evaluate(__vector __bool char __a, __vector __bool char __b,
+             __vector __bool char __c, unsigned char __d)
+  __constant(__d);
+
+extern __ATTRS_o __vector signed short
+vec_evaluate(__vector signed short __a, __vector signed short __b,
+             __vector signed short __c, unsigned char __d)
+  __constant(__d);
+
+extern __ATTRS_o __vector unsigned short
+vec_evaluate(__vector unsigned short __a, __vector unsigned short __b,
+             __vector unsigned short __c, unsigned char __d)
+  __constant(__d);
+
+extern __ATTRS_o __vector __bool short
+vec_evaluate(__vector __bool short __a, __vector __bool short __b,
+             __vector __bool short __c, unsigned char __d)
+  __constant(__d);
+
+extern __ATTRS_o __vector signed int
+vec_evaluate(__vector signed int __a, __vector signed int __b,
+             __vector signed int __c, unsigned char __d)
+  __constant(__d);
+
+extern __ATTRS_o __vector unsigned int
+vec_evaluate(__vector unsigned int __a, __vector unsigned int __b,
+             __vector unsigned int __c, unsigned char __d)
+  __constant(__d);
+
+extern __ATTRS_o __vector __bool int
+vec_evaluate(__vector __bool int __a, __vector __bool int __b,
+             __vector __bool int __c, unsigned char __d)
+  __constant(__d);
+
+extern __ATTRS_o __vector signed long long
+vec_evaluate(__vector signed long long __a, __vector signed long long __b,
+             __vector signed long long __c, unsigned char __d)
+  __constant(__d);
+
+extern __ATTRS_o __vector unsigned long long
+vec_evaluate(__vector unsigned long long __a, __vector unsigned long long __b,
+             __vector unsigned long long __c, unsigned char __d)
+  __constant(__d);
+
+extern __ATTRS_o __vector __bool long long
+vec_evaluate(__vector __bool long long __a, __vector __bool long long __b,
+             __vector __bool long long __c, unsigned char __d)
+  __constant(__d);
+
+extern __ATTRS_o __vector signed __int128
+vec_evaluate(__vector signed __int128 __a, __vector signed __int128 __b,
+             __vector signed __int128 __c, unsigned char __d)
+  __constant(__d);
+
+extern __ATTRS_o __vector unsigned __int128
+vec_evaluate(__vector unsigned __int128 __a, __vector unsigned __int128 __b,
+             __vector unsigned __int128 __c, unsigned char __d)
+  __constant(__d);
+
+extern __ATTRS_o __vector __bool __int128
+vec_evaluate(__vector __bool __int128 __a, __vector __bool __int128 __b,
+             __vector __bool __int128 __c, unsigned char __d)
+  __constant(__d);
+
+#define vec_evaluate(A, B, C, D) \
+  ((__typeof__((vec_evaluate)((A), (B), (C), (D)))) \
+   __builtin_s390_veval((__vector unsigned char)(A), \
+                        (__vector unsigned char)(B), \
+                        (__vector unsigned char)(C), (D)))
+#endif
+
 /*-- vec_cntlz --------------------------------------------------------------*/
 
 static inline __ATTRS_o_ai __vector unsigned char
@@ -6431,6 +7491,20 @@ vec_cntlz(__vector unsigned long long __a) {
   return __builtin_s390_vclzg(__a);
 }
 
+#if __ARCH__ >= 15
+static inline __ATTRS_o_ai __vector unsigned __int128
+vec_cntlz(__vector signed __int128 __a) {
+  return (__vector unsigned __int128)
+    __builtin_s390_vclzq((unsigned __int128)__a);
+}
+
+static inline __ATTRS_o_ai __vector unsigned __int128
+vec_cntlz(__vector unsigned __int128 __a) {
+  return (__vector unsigned __int128)
+    __builtin_s390_vclzq((unsigned __int128)__a);
+}
+#endif
+
 /*-- vec_cnttz --------------------------------------------------------------*/
 
 static inline __ATTRS_o_ai __vector unsigned char
@@ -6473,6 +7547,20 @@ vec_cnttz(__vector unsigned long long __a) {
   return __builtin_s390_vctzg(__a);
 }
 
+#if __ARCH__ >= 15
+static inline __ATTRS_o_ai __vector unsigned __int128
+vec_cnttz(__vector signed __int128 __a) {
+  return (__vector unsigned __int128)
+    __builtin_s390_vctzq((unsigned __int128)__a);
+}
+
+static inline __ATTRS_o_ai __vector unsigned __int128
+vec_cnttz(__vector unsigned __int128 __a) {
+  return (__vector unsigned __int128)
+    __builtin_s390_vctzq((unsigned __int128)__a);
+}
+#endif
+
 /*-- vec_popcnt -------------------------------------------------------------*/
 
 static inline __ATTRS_o_ai __vector unsigned char
@@ -6904,8 +7992,21 @@ vec_sll(__vector unsigned long long __a, __vector unsigned int __b) {
       (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
 
+static inline __ATTRS_o_ai __vector signed __int128
+vec_sll(__vector signed __int128 __a, __vector unsigned char __b) {
+  return (__vector signed __int128)__builtin_s390_vsl(
+      (__vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai __vector unsigned __int128
+vec_sll(__vector unsigned __int128 __a, __vector unsigned char __b) {
+  return (__vector unsigned __int128)__builtin_s390_vsl(
+      (__vector unsigned char)__a, __b);
+}
+
 /*-- vec_slb ----------------------------------------------------------------*/
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector signed char
 vec_slb(__vector signed char __a, __vector signed char __b) {
   return (__vector signed char)__builtin_s390_vslb(
@@ -6918,6 +8019,7 @@ vec_slb(__vector signed char __a, __vector unsigned char __b) {
       (__vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector unsigned char
 vec_slb(__vector unsigned char __a, __vector signed char __b) {
   return __builtin_s390_vslb(__a, (__vector unsigned char)__b);
@@ -6928,110 +8030,187 @@ vec_slb(__vector unsigned char __a, __vector unsigned char __b) {
   return __builtin_s390_vslb(__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector signed short
 vec_slb(__vector signed short __a, __vector signed short __b) {
   return (__vector signed short)__builtin_s390_vslb(
       (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
 
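The non-deprecated shift prototypes take the amount as a __vector unsigned char. A common idiom, sketched here under the assumption that the byte part of a splatted count feeds vec_slb and the remaining 0-7 bits feed vec_sll (not part of the patch; shl128 is an illustrative helper):

// Shift a 128-bit element left by n bits, 0 <= n <= 127.
__vector unsigned __int128 shl128(__vector unsigned __int128 a, unsigned n) {
  __vector unsigned char cnt = vec_splats((unsigned char)n);
  return vec_sll(vec_slb(a, cnt), cnt);  // byte shift, then residual bit shift
}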
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector signed short
 vec_slb(__vector signed short __a, __vector unsigned short __b) {
   return (__vector signed short)__builtin_s390_vslb(
       (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
 
+static inline __ATTRS_o_ai __vector signed short
+vec_slb(__vector signed short __a, __vector unsigned char __b) {
+  return (__vector signed short)__builtin_s390_vslb(
+      (__vector unsigned char)__a, __b);
+}
+
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector unsigned short
 vec_slb(__vector unsigned short __a, __vector signed short __b) {
   return (__vector unsigned short)__builtin_s390_vslb(
       (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector unsigned short
 vec_slb(__vector unsigned short __a, __vector unsigned short __b) {
   return (__vector unsigned short)__builtin_s390_vslb(
       (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
 
+static inline __ATTRS_o_ai __vector unsigned short
+vec_slb(__vector unsigned short __a, __vector unsigned char __b) {
+  return (__vector unsigned short)__builtin_s390_vslb(
+      (__vector unsigned char)__a, __b);
+}
+
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector signed int
 vec_slb(__vector signed int __a, __vector signed int __b) {
   return (__vector signed int)__builtin_s390_vslb(
       (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector signed int
 vec_slb(__vector signed int __a, __vector unsigned int __b) {
   return (__vector signed int)__builtin_s390_vslb(
       (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
 
+static inline __ATTRS_o_ai __vector signed int
+vec_slb(__vector signed int __a, __vector unsigned char __b) {
+  return (__vector signed int)__builtin_s390_vslb(
+      (__vector unsigned char)__a, __b);
+}
+
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector unsigned int
 vec_slb(__vector unsigned int __a, __vector signed int __b) {
   return (__vector unsigned int)__builtin_s390_vslb(
       (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector unsigned int
 vec_slb(__vector unsigned int __a, __vector unsigned int __b) {
   return (__vector unsigned int)__builtin_s390_vslb(
       (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
 
+static inline __ATTRS_o_ai __vector unsigned int
+vec_slb(__vector unsigned int __a, __vector unsigned char __b) {
+  return (__vector unsigned int)__builtin_s390_vslb(
+      (__vector unsigned char)__a, __b);
+}
+
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector signed long long
 vec_slb(__vector signed long long __a, __vector signed long long __b) {
   return (__vector signed long long)__builtin_s390_vslb(
      (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector signed long long
 vec_slb(__vector signed long long __a, __vector unsigned long long __b) {
   return (__vector signed long long)__builtin_s390_vslb(
       (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
 
+static inline __ATTRS_o_ai __vector signed long long
+vec_slb(__vector signed long long __a, __vector unsigned char __b) {
+  return (__vector signed long long)__builtin_s390_vslb(
+      (__vector unsigned char)__a, __b);
+}
+
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector unsigned long long
 vec_slb(__vector unsigned long long __a, __vector signed long long __b) {
   return (__vector unsigned long long)__builtin_s390_vslb(
       (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector unsigned long long
 vec_slb(__vector unsigned long long __a, __vector unsigned long long __b) {
   return (__vector unsigned long long)__builtin_s390_vslb(
       (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
 
+static inline __ATTRS_o_ai __vector unsigned long long
+vec_slb(__vector unsigned long long __a, __vector unsigned char __b) {
+  return (__vector unsigned long long)__builtin_s390_vslb(
+      (__vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai __vector signed __int128
+vec_slb(__vector signed __int128 __a, __vector unsigned char __b) {
+  return (__vector signed __int128)__builtin_s390_vslb(
+      (__vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai __vector unsigned __int128
+vec_slb(__vector unsigned __int128 __a, __vector unsigned char __b) {
+  return (__vector unsigned __int128)__builtin_s390_vslb(
+      (__vector unsigned char)__a, __b);
+}
+
 #if __ARCH__ >= 12
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector float
 vec_slb(__vector float __a, __vector signed int __b) {
   return (__vector float)__builtin_s390_vslb(
       (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector float
 vec_slb(__vector float __a, __vector unsigned int __b) {
   return (__vector float)__builtin_s390_vslb(
       (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
+
+static inline __ATTRS_o_ai __vector float
+vec_slb(__vector float __a, __vector unsigned char __b) {
+  return (__vector float)__builtin_s390_vslb(
+      (__vector unsigned char)__a, __b);
+}
 #endif
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector double
 vec_slb(__vector double __a, __vector signed long long __b) {
   return (__vector double)__builtin_s390_vslb(
       (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector double
 vec_slb(__vector double __a, __vector unsigned long long __b) {
   return (__vector double)__builtin_s390_vslb(
       (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
 
+static inline __ATTRS_o_ai __vector double
+vec_slb(__vector double __a, __vector unsigned char __b) {
+  return (__vector double)__builtin_s390_vslb(
+      (__vector unsigned char)__a, __b);
+}
+
 /*-- vec_sld ----------------------------------------------------------------*/
 
 extern __ATTRS_o __vector signed char
 vec_sld(__vector signed char __a, __vector signed char __b, int __c)
   __constant_range(__c, 0, 15);
 
+// This prototype is deprecated.
 extern __ATTRS_o __vector __bool char
 vec_sld(__vector __bool char __a, __vector __bool char __b, int __c)
   __constant_range(__c, 0, 15);
@@ -7044,6 +8223,7 @@ extern __ATTRS_o __vector signed short
 vec_sld(__vector signed short __a, __vector signed short __b, int __c)
   __constant_range(__c, 0, 15);
 
+// This prototype is deprecated.
 extern __ATTRS_o __vector __bool short
 vec_sld(__vector __bool short __a, __vector __bool short __b, int __c)
   __constant_range(__c, 0, 15);
@@ -7056,6 +8236,7 @@ extern __ATTRS_o __vector signed int
 vec_sld(__vector signed int __a, __vector signed int __b, int __c)
   __constant_range(__c, 0, 15);
 
+// This prototype is deprecated.
 extern __ATTRS_o __vector __bool int
 vec_sld(__vector __bool int __a, __vector __bool int __b, int __c)
   __constant_range(__c, 0, 15);
@@ -7068,6 +8249,7 @@ extern __ATTRS_o __vector signed long long
 vec_sld(__vector signed long long __a, __vector signed long long __b, int __c)
   __constant_range(__c, 0, 15);
 
+// This prototype is deprecated.
 extern __ATTRS_o __vector __bool long long
 vec_sld(__vector __bool long long __a, __vector __bool long long __b, int __c)
   __constant_range(__c, 0, 15);
@@ -7077,6 +8259,15 @@ vec_sld(__vector unsigned long long __a, __vector unsigned long long __b,
         int __c)
   __constant_range(__c, 0, 15);
 
+extern __ATTRS_o __vector signed __int128
+vec_sld(__vector signed __int128 __a, __vector signed __int128 __b, int __c)
+  __constant_range(__c, 0, 15);
+
+extern __ATTRS_o __vector unsigned __int128
+vec_sld(__vector unsigned __int128 __a, __vector unsigned __int128 __b,
+        int __c)
+  __constant_range(__c, 0, 15);
+
 #if __ARCH__ >= 12
 extern __ATTRS_o __vector float
 vec_sld(__vector float __a, __vector float __b, int __c)
@@ -7126,6 +8317,15 @@ vec_sldw(__vector unsigned long long __a, __vector unsigned long long __b,
          int __c)
   __constant_range(__c, 0, 3);
 
+extern __ATTRS_o __vector signed __int128
+vec_sldw(__vector signed __int128 __a, __vector signed __int128 __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o __vector unsigned __int128
+vec_sldw(__vector unsigned __int128 __a, __vector unsigned __int128 __b,
+         int __c)
+  __constant_range(__c, 0, 3);
+
 // This prototype is deprecated.
 extern __ATTRS_o __vector double
 vec_sldw(__vector double __a, __vector double __b, int __c)
@@ -7172,6 +8372,15 @@ vec_sldb(__vector unsigned long long __a, __vector unsigned long long __b,
          int __c)
   __constant_range(__c, 0, 7);
 
+extern __ATTRS_o __vector signed __int128
+vec_sldb(__vector signed __int128 __a, __vector signed __int128 __b, int __c)
+  __constant_range(__c, 0, 7);
+
+extern __ATTRS_o __vector unsigned __int128
+vec_sldb(__vector unsigned __int128 __a, __vector unsigned __int128 __b,
+         int __c)
+  __constant_range(__c, 0, 7);
+
 extern __ATTRS_o __vector float
 vec_sldb(__vector float __a, __vector float __b, int __c)
   __constant_range(__c, 0, 7);
@@ -7429,8 +8638,21 @@ vec_sral(__vector unsigned long long __a, __vector unsigned int __b) {
       (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
 
+static inline __ATTRS_o_ai __vector signed __int128
+vec_sral(__vector signed __int128 __a, __vector unsigned char __b) {
+  return (__vector signed __int128)__builtin_s390_vsra(
+      (__vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai __vector unsigned __int128
+vec_sral(__vector unsigned __int128 __a, __vector unsigned char __b) {
+  return (__vector unsigned __int128)__builtin_s390_vsra(
+      (__vector unsigned char)__a, __b);
+}
+
 /*-- vec_srab ---------------------------------------------------------------*/
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector signed char
 vec_srab(__vector signed char __a, __vector signed char __b) {
   return (__vector signed char)__builtin_s390_vsrab(
@@ -7443,6 +8665,7 @@ vec_srab(__vector signed char __a, __vector unsigned char __b) {
      (__vector unsigned char)__a, __b);
 }
 
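For the new __int128 element type, the vec_sld immediate is still a byte offset into the concatenation of the two operands. A brief sketch (not part of the patch; sld_demo is an illustrative helper):

// Result is bytes 4..15 of a followed by bytes 0..3 of b,
// i.e. the 16-byte window starting 4 bytes into (a ++ b).
__vector unsigned __int128 sld_demo(__vector unsigned __int128 a,
                                    __vector unsigned __int128 b) {
  return vec_sld(a, b, 4);
}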
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector unsigned char
 vec_srab(__vector unsigned char __a, __vector signed char __b) {
   return __builtin_s390_vsrab(__a, (__vector unsigned char)__b);
@@ -7453,104 +8676,180 @@ vec_srab(__vector unsigned char __a, __vector unsigned char __b) {
   return __builtin_s390_vsrab(__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector signed short
 vec_srab(__vector signed short __a, __vector signed short __b) {
   return (__vector signed short)__builtin_s390_vsrab(
       (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector signed short
 vec_srab(__vector signed short __a, __vector unsigned short __b) {
   return (__vector signed short)__builtin_s390_vsrab(
       (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
 
+static inline __ATTRS_o_ai __vector signed short
+vec_srab(__vector signed short __a, __vector unsigned char __b) {
+  return (__vector signed short)__builtin_s390_vsrab(
+      (__vector unsigned char)__a, __b);
+}
+
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector unsigned short
 vec_srab(__vector unsigned short __a, __vector signed short __b) {
   return (__vector unsigned short)__builtin_s390_vsrab(
       (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector unsigned short
 vec_srab(__vector unsigned short __a, __vector unsigned short __b) {
   return (__vector unsigned short)__builtin_s390_vsrab(
       (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
 
+static inline __ATTRS_o_ai __vector unsigned short
+vec_srab(__vector unsigned short __a, __vector unsigned char __b) {
+  return (__vector unsigned short)__builtin_s390_vsrab(
+      (__vector unsigned char)__a, __b);
+}
+
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector signed int
 vec_srab(__vector signed int __a, __vector signed int __b) {
   return (__vector signed int)__builtin_s390_vsrab(
       (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector signed int
 vec_srab(__vector signed int __a, __vector unsigned int __b) {
   return (__vector signed int)__builtin_s390_vsrab(
      (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
 
+static inline __ATTRS_o_ai __vector signed int
+vec_srab(__vector signed int __a, __vector unsigned char __b) {
+  return (__vector signed int)__builtin_s390_vsrab(
+      (__vector unsigned char)__a, __b);
+}
+
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector unsigned int
 vec_srab(__vector unsigned int __a, __vector signed int __b) {
   return (__vector unsigned int)__builtin_s390_vsrab(
       (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector unsigned int
 vec_srab(__vector unsigned int __a, __vector unsigned int __b) {
   return (__vector unsigned int)__builtin_s390_vsrab(
       (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
 
+static inline __ATTRS_o_ai __vector unsigned int
+vec_srab(__vector unsigned int __a, __vector unsigned char __b) {
+  return (__vector unsigned int)__builtin_s390_vsrab(
+      (__vector unsigned char)__a, __b);
+}
+
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector signed long long
 vec_srab(__vector signed long long __a, __vector signed long long __b) {
   return (__vector signed long long)__builtin_s390_vsrab(
       (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector signed long long
 vec_srab(__vector signed long long __a, __vector unsigned long long __b) {
   return (__vector signed long long)__builtin_s390_vsrab(
       (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
 
+static inline __ATTRS_o_ai __vector signed long long
+vec_srab(__vector signed long long __a, __vector unsigned char __b) {
+  return (__vector signed long long)__builtin_s390_vsrab(
+      (__vector unsigned char)__a, __b);
+}
+
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector unsigned long long
 vec_srab(__vector unsigned long long __a, __vector signed long long __b) {
   return (__vector unsigned long long)__builtin_s390_vsrab(
       (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
 
-static inline __ATTRS_o_ai __vector unsigned long long
-vec_srab(__vector unsigned long long __a, __vector unsigned long long __b) {
-  return (__vector unsigned long long)__builtin_s390_vsrab(
-      (__vector unsigned char)__a, (__vector unsigned char)__b);
+// This prototype is deprecated.
+static inline __ATTRS_o_ai __vector unsigned long long
+vec_srab(__vector unsigned long long __a, __vector unsigned long long __b) {
+  return (__vector unsigned long long)__builtin_s390_vsrab(
+      (__vector unsigned char)__a, (__vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai __vector unsigned long long
+vec_srab(__vector unsigned long long __a, __vector unsigned char __b) {
+  return (__vector unsigned long long)__builtin_s390_vsrab(
+      (__vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai __vector signed __int128
+vec_srab(__vector signed __int128 __a, __vector unsigned char __b) {
+  return (__vector signed __int128)__builtin_s390_vsrab(
+      (__vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai __vector unsigned __int128
+vec_srab(__vector unsigned __int128 __a, __vector unsigned char __b) {
+  return (__vector unsigned __int128)__builtin_s390_vsrab(
+      (__vector unsigned char)__a, __b);
 }
 
 #if __ARCH__ >= 12
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector float
 vec_srab(__vector float __a, __vector signed int __b) {
   return (__vector float)__builtin_s390_vsrab(
       (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector float
 vec_srab(__vector float __a, __vector unsigned int __b) {
   return (__vector float)__builtin_s390_vsrab(
       (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
+
+static inline __ATTRS_o_ai __vector float
+vec_srab(__vector float __a, __vector unsigned char __b) {
+  return (__vector float)__builtin_s390_vsrab(
+      (__vector unsigned char)__a, __b);
+}
 #endif
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector double
 vec_srab(__vector double __a, __vector signed long long __b) {
   return (__vector double)__builtin_s390_vsrab(
      (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector double
 vec_srab(__vector double __a, __vector unsigned long long __b) {
   return (__vector double)__builtin_s390_vsrab(
       (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
 
+static inline __ATTRS_o_ai __vector double
+vec_srab(__vector double __a, __vector unsigned char __b) {
+  return (__vector double)__builtin_s390_vsrab(
+      (__vector unsigned char)__a, __b);
+}
+
 /*-- vec_srl ----------------------------------------------------------------*/
 
 static inline __ATTRS_o_ai __vector signed char
@@ -7794,8 +9093,21 @@ vec_srl(__vector unsigned long long __a, __vector unsigned int __b) {
       (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
 
+static inline __ATTRS_o_ai __vector signed __int128
+vec_srl(__vector signed __int128 __a, __vector unsigned char __b) {
+  return (__vector signed __int128)__builtin_s390_vsrl(
+      (__vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai __vector unsigned __int128
+vec_srl(__vector unsigned __int128 __a, __vector unsigned char __b) {
+  return (__vector unsigned __int128)__builtin_s390_vsrl(
+      (__vector unsigned char)__a, __b);
+}
+
 /*-- vec_srb ----------------------------------------------------------------*/
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector signed char
 vec_srb(__vector signed char __a, __vector signed char __b) {
   return (__vector signed char)__builtin_s390_vsrlb(
@@ -7808,6 +9120,7 @@ vec_srb(__vector signed char __a, __vector unsigned char __b) {
       (__vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector unsigned char
 vec_srb(__vector unsigned char __a, __vector signed char __b) {
   return __builtin_s390_vsrlb(__a, (__vector unsigned char)__b);
@@ -7818,104 +9131,180 @@ vec_srb(__vector unsigned char __a, __vector unsigned char __b) {
   return __builtin_s390_vsrlb(__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector signed short
 vec_srb(__vector signed short __a, __vector signed short __b) {
   return (__vector signed short)__builtin_s390_vsrlb(
       (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector signed short
 vec_srb(__vector signed short __a, __vector unsigned short __b) {
   return (__vector signed short)__builtin_s390_vsrlb(
       (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
 
+static inline __ATTRS_o_ai __vector signed short
+vec_srb(__vector signed short __a, __vector unsigned char __b) {
+  return (__vector signed short)__builtin_s390_vsrlb(
+      (__vector unsigned char)__a, __b);
+}
+
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector unsigned short
 vec_srb(__vector unsigned short __a, __vector signed short __b) {
   return (__vector unsigned short)__builtin_s390_vsrlb(
       (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector unsigned short
 vec_srb(__vector unsigned short __a, __vector unsigned short __b) {
   return (__vector unsigned short)__builtin_s390_vsrlb(
       (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
 
+static inline __ATTRS_o_ai __vector unsigned short
+vec_srb(__vector unsigned short __a, __vector unsigned char __b) {
+  return (__vector unsigned short)__builtin_s390_vsrlb(
+      (__vector unsigned char)__a, __b);
+}
+
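The splat-and-combine idiom shown earlier for left shifts applies equally to the arithmetic right shifts added here, under the same byte/bit-split assumption (not part of the patch; sar128 is an illustrative helper):

// Arithmetic right shift of a 128-bit element by n bits, 0 <= n <= 127.
__vector signed __int128 sar128(__vector signed __int128 a, unsigned n) {
  __vector unsigned char cnt = vec_splats((unsigned char)n);
  return vec_sral(vec_srab(a, cnt), cnt);  // byte shift, then bit shift
}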
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector signed int
 vec_srb(__vector signed int __a, __vector signed int __b) {
   return (__vector signed int)__builtin_s390_vsrlb(
       (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector signed int
 vec_srb(__vector signed int __a, __vector unsigned int __b) {
   return (__vector signed int)__builtin_s390_vsrlb(
       (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
 
+static inline __ATTRS_o_ai __vector signed int
+vec_srb(__vector signed int __a, __vector unsigned char __b) {
+  return (__vector signed int)__builtin_s390_vsrlb(
+      (__vector unsigned char)__a, __b);
+}
+
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector unsigned int
 vec_srb(__vector unsigned int __a, __vector signed int __b) {
   return (__vector unsigned int)__builtin_s390_vsrlb(
       (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector unsigned int
 vec_srb(__vector unsigned int __a, __vector unsigned int __b) {
   return (__vector unsigned int)__builtin_s390_vsrlb(
       (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
 
+static inline __ATTRS_o_ai __vector unsigned int
+vec_srb(__vector unsigned int __a, __vector unsigned char __b) {
+  return (__vector unsigned int)__builtin_s390_vsrlb(
+      (__vector unsigned char)__a, __b);
+}
+
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector signed long long
 vec_srb(__vector signed long long __a, __vector signed long long __b) {
   return (__vector signed long long)__builtin_s390_vsrlb(
       (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector signed long long
 vec_srb(__vector signed long long __a, __vector unsigned long long __b) {
   return (__vector signed long long)__builtin_s390_vsrlb(
       (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
 
+static inline __ATTRS_o_ai __vector signed long long
+vec_srb(__vector signed long long __a, __vector unsigned char __b) {
+  return (__vector signed long long)__builtin_s390_vsrlb(
+      (__vector unsigned char)__a, __b);
+}
+
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector unsigned long long
 vec_srb(__vector unsigned long long __a, __vector signed long long __b) {
   return (__vector unsigned long long)__builtin_s390_vsrlb(
       (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector unsigned long long
 vec_srb(__vector unsigned long long __a, __vector unsigned long long __b) {
   return (__vector unsigned long long)__builtin_s390_vsrlb(
       (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
 
+static inline __ATTRS_o_ai __vector unsigned long long
+vec_srb(__vector unsigned long long __a, __vector unsigned char __b) {
+  return (__vector unsigned long long)__builtin_s390_vsrlb(
+      (__vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai __vector signed __int128
+vec_srb(__vector signed __int128 __a, __vector unsigned char __b) {
+  return (__vector signed __int128)__builtin_s390_vsrlb(
+      (__vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai __vector unsigned __int128
+vec_srb(__vector unsigned __int128 __a, __vector unsigned char __b) {
+  return (__vector unsigned __int128)__builtin_s390_vsrlb(
+      (__vector unsigned char)__a, __b);
+}
+
 #if __ARCH__ >= 12
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector float
 vec_srb(__vector float __a, __vector signed int __b) {
   return (__vector float)__builtin_s390_vsrlb(
       (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector float
 vec_srb(__vector float __a, __vector unsigned int __b) {
   return (__vector float)__builtin_s390_vsrlb(
       (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
+
+static inline __ATTRS_o_ai __vector float
+vec_srb(__vector float __a, __vector unsigned char __b) {
+  return (__vector float)__builtin_s390_vsrlb(
+      (__vector unsigned char)__a, __b);
+}
 #endif
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector double
 vec_srb(__vector double __a, __vector signed long long __b) {
   return (__vector double)__builtin_s390_vsrlb(
       (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai __vector double
 vec_srb(__vector double __a, __vector unsigned long long __b) {
   return (__vector double)__builtin_s390_vsrlb(
       (__vector unsigned char)__a, (__vector unsigned char)__b);
 }
 
+static inline __ATTRS_o_ai __vector double
+vec_srb(__vector double __a, __vector unsigned char __b) {
+  return (__vector double)__builtin_s390_vsrlb(
+      (__vector unsigned char)__a, __b);
+}
+
 /*-- vec_srdb ---------------------------------------------------------------*/
 
 #if __ARCH__ >= 13
@@ -7953,6 +9342,15 @@ vec_srdb(__vector unsigned long long __a, __vector unsigned long long __b,
          int __c)
   __constant_range(__c, 0, 7);
 
+extern __ATTRS_o __vector signed __int128
+vec_srdb(__vector signed __int128 __a, __vector signed __int128 __b, int __c)
+  __constant_range(__c, 0, 7);
+
+extern __ATTRS_o __vector unsigned __int128
+vec_srdb(__vector unsigned __int128 __a, __vector unsigned __int128 __b,
+         int __c)
+  __constant_range(__c, 0, 7);
+
 extern __ATTRS_o __vector float
 vec_srdb(__vector float __a, __vector float __b, int __c)
   __constant_range(__c, 0, 7);
@@ -7989,6 +9387,11 @@ vec_abs(__vector signed long long __a) {
   return vec_sel(__a, -__a, vec_cmplt(__a, (__vector signed long long)0));
 }
 
+static inline __ATTRS_o_ai __vector signed __int128
+vec_abs(__vector signed __int128 __a) {
+  return vec_sel(__a, -__a, vec_cmplt(__a, (__vector signed __int128)0));
+}
+
 #if __ARCH__ >= 12
 static inline __ATTRS_o_ai __vector float
 vec_abs(__vector float __a) {
@@ -8169,6 +9572,16 @@ vec_max(__vector __bool long long __a, __vector unsigned long long __b) {
   return vec_sel(__b, __ac, vec_cmpgt(__ac, __b));
 }
 
+static inline __ATTRS_o_ai __vector signed __int128
+vec_max(__vector signed __int128 __a, __vector signed __int128 __b) {
+  return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai __vector unsigned __int128
+vec_max(__vector unsigned __int128 __a, __vector unsigned __int128 __b) {
+  return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+}
+
 #if __ARCH__ >= 12
 static inline __ATTRS_o_ai __vector float
 vec_max(__vector float __a, __vector float __b) {
@@ -8339,6 +9752,16 @@ vec_min(__vector __bool long long __a, __vector unsigned long long __b) {
   return vec_sel(__ac, __b, vec_cmpgt(__ac, __b));
 }
 
+static inline __ATTRS_o_ai __vector signed __int128
+vec_min(__vector signed __int128 __a, __vector signed __int128 __b) {
+  return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai __vector unsigned __int128
+vec_min(__vector unsigned __int128 __a, __vector unsigned __int128 __b) {
+  return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+}
+
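The new vec_abs/vec_max/vec_min overloads are built purely from the compare-and-select intrinsics above, so composing them works as for the narrower element types. A tiny sketch (not part of the patch; clamp128 is an illustrative helper):

// Clamp the (single) __int128 element of x into [lo, hi].
__vector signed __int128 clamp128(__vector signed __int128 x,
                                  __vector signed __int128 lo,
                                  __vector signed __int128 hi) {
  return vec_min(vec_max(x, lo), hi);
}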
 #if __ARCH__ >= 12
 static inline __ATTRS_o_ai __vector float
 vec_min(__vector float __a, __vector float __b) {
@@ -8357,10 +9780,10 @@ vec_min(__vector double __a, __vector double __b) {
 
 /*-- vec_add_u128 -----------------------------------------------------------*/
 
+// This prototype is deprecated.
 static inline __ATTRS_ai __vector unsigned char
 vec_add_u128(__vector unsigned char __a, __vector unsigned char __b) {
-  return (__vector unsigned char)
-    (unsigned __int128 __attribute__((__vector_size__(16))))
+  return (__vector unsigned char)(__vector unsigned __int128)
     ((__int128)__a + (__int128)__b);
 }
 
@@ -8386,33 +9809,59 @@ vec_addc(__vector unsigned long long __a, __vector unsigned long long __b) {
   return __builtin_s390_vaccg(__a, __b);
 }
 
+static inline __ATTRS_o_ai __vector unsigned __int128
+vec_addc(__vector unsigned __int128 __a, __vector unsigned __int128 __b) {
+  return (__vector unsigned __int128)
+    __builtin_s390_vaccq((unsigned __int128)__a, (unsigned __int128)__b);
+}
+
 /*-- vec_addc_u128 ----------------------------------------------------------*/
 
+// This prototype is deprecated.
 static inline __ATTRS_ai __vector unsigned char
 vec_addc_u128(__vector unsigned char __a, __vector unsigned char __b) {
-  return (__vector unsigned char)
-    (unsigned __int128 __attribute__((__vector_size__(16))))
+  return (__vector unsigned char)(__vector unsigned __int128)
     __builtin_s390_vaccq((unsigned __int128)__a, (unsigned __int128)__b);
 }
 
+/*-- vec_adde ---------------------------------------------------------------*/
+
+static inline __ATTRS_ai __vector unsigned __int128
+vec_adde(__vector unsigned __int128 __a, __vector unsigned __int128 __b,
+         __vector unsigned __int128 __c) {
+  return (__vector unsigned __int128)
+    __builtin_s390_vacq((unsigned __int128)__a, (unsigned __int128)__b,
+                        (unsigned __int128)__c);
+}
+
 /*-- vec_adde_u128 ----------------------------------------------------------*/
 
+// This prototype is deprecated.
 static inline __ATTRS_ai __vector unsigned char
 vec_adde_u128(__vector unsigned char __a, __vector unsigned char __b,
               __vector unsigned char __c) {
-  return (__vector unsigned char)
-    (unsigned __int128 __attribute__((__vector_size__(16))))
+  return (__vector unsigned char)(__vector unsigned __int128)
     __builtin_s390_vacq((unsigned __int128)__a, (unsigned __int128)__b,
                         (unsigned __int128)__c);
 }
 
+/*-- vec_addec --------------------------------------------------------------*/
+
+static inline __ATTRS_ai __vector unsigned __int128
+vec_addec(__vector unsigned __int128 __a, __vector unsigned __int128 __b,
+          __vector unsigned __int128 __c) {
+  return (__vector unsigned __int128)
+    __builtin_s390_vacccq((unsigned __int128)__a, (unsigned __int128)__b,
+                          (unsigned __int128)__c);
+}
+
 /*-- vec_addec_u128 ---------------------------------------------------------*/
 
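Together, vec_addc/vec_adde/vec_addec supersede the deprecated *_u128 forms and make a 256-bit add out of two 128-bit lanes; vec_addc produces the 0/1 carry that vec_adde consumes. A minimal sketch (not part of the patch; the u256 layout and add256 name are illustrative):

typedef struct {
  __vector unsigned __int128 lo, hi;
} u256;

static u256 add256(u256 a, u256 b) {
  u256 r;
  r.lo = a.lo + b.lo;                                   // low 128 bits
  __vector unsigned __int128 c = vec_addc(a.lo, b.lo);  // carry out of low half
  r.hi = vec_adde(a.hi, b.hi, c);                       // high half plus carry-in
  return r;
}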
static inline __ATTRS_ai __vector unsigned char vec_addec_u128(__vector unsigned char __a, __vector unsigned char __b, __vector unsigned char __c) { - return (__vector unsigned char) - (unsigned __int128 __attribute__((__vector_size__(16)))) + return (__vector unsigned char)(__vector unsigned __int128) __builtin_s390_vacccq((unsigned __int128)__a, (unsigned __int128)__b, (unsigned __int128)__c); } @@ -8439,6 +9888,14 @@ vec_avg(__vector signed long long __a, __vector signed long long __b) { return __builtin_s390_vavgg(__a, __b); } +#if __ARCH__ >= 15 +static inline __ATTRS_o_ai __vector signed __int128 +vec_avg(__vector signed __int128 __a, __vector signed __int128 __b) { + return (__vector signed __int128) + __builtin_s390_vavgq((signed __int128)__a, (signed __int128)__b); +} +#endif + static inline __ATTRS_o_ai __vector unsigned char vec_avg(__vector unsigned char __a, __vector unsigned char __b) { return __builtin_s390_vavglb(__a, __b); @@ -8459,6 +9916,14 @@ vec_avg(__vector unsigned long long __a, __vector unsigned long long __b) { return __builtin_s390_vavglg(__a, __b); } +#if __ARCH__ >= 15 +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_avg(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { + return (__vector unsigned __int128) + __builtin_s390_vavglq((unsigned __int128)__a, (unsigned __int128)__b); +} +#endif + /*-- vec_checksum -----------------------------------------------------------*/ static inline __ATTRS_ai __vector unsigned int @@ -8483,13 +9948,18 @@ vec_gfmsum(__vector unsigned int __a, __vector unsigned int __b) { return __builtin_s390_vgfmf(__a, __b); } +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_gfmsum(__vector unsigned long long __a, __vector unsigned long long __b) { + return (__vector unsigned __int128)__builtin_s390_vgfmg(__a, __b); +} + /*-- vec_gfmsum_128 ---------------------------------------------------------*/ +// This prototype is deprecated. static inline __ATTRS_o_ai __vector unsigned char vec_gfmsum_128(__vector unsigned long long __a, __vector unsigned long long __b) { - return (__vector unsigned char) - (unsigned __int128 __attribute__((__vector_size__(16)))) + return (__vector unsigned char)(__vector unsigned __int128) __builtin_s390_vgfmg(__a, __b); } @@ -8513,14 +9983,21 @@ vec_gfmsum_accum(__vector unsigned int __a, __vector unsigned int __b, return __builtin_s390_vgfmaf(__a, __b, __c); } +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_gfmsum_accum(__vector unsigned long long __a, __vector unsigned long long __b, + __vector unsigned __int128 __c) { + return (__vector unsigned __int128) + __builtin_s390_vgfmag(__a, __b, (unsigned __int128)__c); +} + /*-- vec_gfmsum_accum_128 ---------------------------------------------------*/ +// This prototype is deprecated. 
static inline __ATTRS_o_ai __vector unsigned char vec_gfmsum_accum_128(__vector unsigned long long __a, __vector unsigned long long __b, __vector unsigned char __c) { - return (__vector unsigned char) - (unsigned __int128 __attribute__((__vector_size__(16)))) + return (__vector unsigned char)(__vector unsigned __int128) __builtin_s390_vgfmag(__a, __b, (unsigned __int128)__c); } @@ -8598,6 +10075,56 @@ vec_mladd(__vector unsigned int __a, __vector unsigned int __b, return __a * __b + __c; } +#if __ARCH__ >= 15 +static inline __ATTRS_o_ai __vector signed long long +vec_mladd(__vector signed long long __a, __vector signed long long __b, + __vector signed long long __c) { + return __a * __b + __c; +} + +static inline __ATTRS_o_ai __vector signed long long +vec_mladd(__vector unsigned long long __a, __vector signed long long __b, + __vector signed long long __c) { + return (__vector signed long long)__a * __b + __c; +} + +static inline __ATTRS_o_ai __vector signed long long +vec_mladd(__vector signed long long __a, __vector unsigned long long __b, + __vector unsigned long long __c) { + return __a * (__vector signed long long)__b + (__vector signed long long)__c; +} + +static inline __ATTRS_o_ai __vector unsigned long long +vec_mladd(__vector unsigned long long __a, __vector unsigned long long __b, + __vector unsigned long long __c) { + return __a * __b + __c; +} + +static inline __ATTRS_o_ai __vector signed __int128 +vec_mladd(__vector signed __int128 __a, __vector signed __int128 __b, + __vector signed __int128 __c) { + return __a * __b + __c; +} + +static inline __ATTRS_o_ai __vector signed __int128 +vec_mladd(__vector unsigned __int128 __a, __vector signed __int128 __b, + __vector signed __int128 __c) { + return (__vector signed __int128)__a * __b + __c; +} + +static inline __ATTRS_o_ai __vector signed __int128 +vec_mladd(__vector signed __int128 __a, __vector unsigned __int128 __b, + __vector unsigned __int128 __c) { + return __a * (__vector signed __int128)__b + (__vector signed __int128)__c; +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_mladd(__vector unsigned __int128 __a, __vector unsigned __int128 __b, + __vector unsigned __int128 __c) { + return __a * __b + __c; +} +#endif + /*-- vec_mhadd --------------------------------------------------------------*/ static inline __ATTRS_o_ai __vector signed char @@ -8636,6 +10163,34 @@ vec_mhadd(__vector unsigned int __a, __vector unsigned int __b, return __builtin_s390_vmalhf(__a, __b, __c); } +#if __ARCH__ >= 15 +static inline __ATTRS_o_ai __vector signed long long +vec_mhadd(__vector signed long long __a, __vector signed long long __b, + __vector signed long long __c) { + return __builtin_s390_vmahg(__a, __b, __c); +} + +static inline __ATTRS_o_ai __vector unsigned long long +vec_mhadd(__vector unsigned long long __a, __vector unsigned long long __b, + __vector unsigned long long __c) { + return __builtin_s390_vmalhg(__a, __b, __c); +} + +static inline __ATTRS_o_ai __vector signed __int128 +vec_mhadd(__vector signed __int128 __a, __vector signed __int128 __b, + __vector signed __int128 __c) { + return (__vector signed __int128) + __builtin_s390_vmahq((signed __int128)__a, (signed __int128)__b, (signed __int128)__c); +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_mhadd(__vector unsigned __int128 __a, __vector unsigned __int128 __b, + __vector unsigned __int128 __c) { + return (__vector unsigned __int128) + __builtin_s390_vmalhq((unsigned __int128)__a, (unsigned __int128)__b, (unsigned __int128)__c); +} 
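The arch15 block that just closed extends vec_mladd and vec_mhadd to 64-bit and 128-bit elements; per the implementations above, the vec_mladd forms lower to a plain lane-wise multiply-add. A usage sketch (muladd64 is an illustrative name; assumes compiling with -mzvector for an arch15 target):

    #include <vecintrin.h>

    // Low 64 bits of x*y plus z, per lane (vec_mladd); the high half of
    // the widened product-and-add is available separately via vec_mhadd.
    static inline __vector signed long long
    muladd64(__vector signed long long x,
             __vector signed long long y,
             __vector signed long long z) {
      return vec_mladd(x, y, z);
    }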
+#endif + /*-- vec_meadd --------------------------------------------------------------*/ static inline __ATTRS_o_ai __vector signed short @@ -8674,6 +10229,22 @@ vec_meadd(__vector unsigned int __a, __vector unsigned int __b, return __builtin_s390_vmalef(__a, __b, __c); } +#if __ARCH__ >= 15 +static inline __ATTRS_o_ai __vector signed __int128 +vec_meadd(__vector signed long long __a, __vector signed long long __b, + __vector signed __int128 __c) { + return (__vector signed __int128) + __builtin_s390_vmaeg(__a, __b, (signed __int128)__c); +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_meadd(__vector unsigned long long __a, __vector unsigned long long __b, + __vector unsigned __int128 __c) { + return (__vector unsigned __int128) + __builtin_s390_vmaleg(__a, __b, (unsigned __int128)__c); +} +#endif + /*-- vec_moadd --------------------------------------------------------------*/ static inline __ATTRS_o_ai __vector signed short @@ -8712,6 +10283,22 @@ vec_moadd(__vector unsigned int __a, __vector unsigned int __b, return __builtin_s390_vmalof(__a, __b, __c); } +#if __ARCH__ >= 15 +static inline __ATTRS_o_ai __vector signed __int128 +vec_moadd(__vector signed long long __a, __vector signed long long __b, + __vector signed __int128 __c) { + return (__vector signed __int128) + __builtin_s390_vmaog(__a, __b, (signed __int128)__c); +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_moadd(__vector unsigned long long __a, __vector unsigned long long __b, + __vector unsigned __int128 __c) { + return (__vector unsigned __int128) + __builtin_s390_vmalog(__a, __b, (unsigned __int128)__c); +} +#endif + /*-- vec_mulh ---------------------------------------------------------------*/ static inline __ATTRS_o_ai __vector signed char @@ -8744,6 +10331,30 @@ vec_mulh(__vector unsigned int __a, __vector unsigned int __b) { return __builtin_s390_vmlhf(__a, __b); } +#if __ARCH__ >= 15 +static inline __ATTRS_o_ai __vector signed long long +vec_mulh(__vector signed long long __a, __vector signed long long __b) { + return __builtin_s390_vmhg(__a, __b); +} + +static inline __ATTRS_o_ai __vector unsigned long long +vec_mulh(__vector unsigned long long __a, __vector unsigned long long __b) { + return __builtin_s390_vmlhg(__a, __b); +} + +static inline __ATTRS_o_ai __vector signed __int128 +vec_mulh(__vector signed __int128 __a, __vector signed __int128 __b) { + return (__vector signed __int128) + __builtin_s390_vmhq((signed __int128)__a, (signed __int128)__b); +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_mulh(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { + return (__vector unsigned __int128) + __builtin_s390_vmlhq((unsigned __int128)__a, (unsigned __int128)__b); +} +#endif + /*-- vec_mule ---------------------------------------------------------------*/ static inline __ATTRS_o_ai __vector signed short @@ -8776,6 +10387,18 @@ vec_mule(__vector unsigned int __a, __vector unsigned int __b) { return __builtin_s390_vmlef(__a, __b); } +#if __ARCH__ >= 15 +static inline __ATTRS_o_ai __vector signed __int128 +vec_mule(__vector signed long long __a, __vector signed long long __b) { + return (__vector signed __int128)__builtin_s390_vmeg(__a, __b); +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_mule(__vector unsigned long long __a, __vector unsigned long long __b) { + return (__vector unsigned __int128)__builtin_s390_vmleg(__a, __b); +} +#endif + /*-- vec_mulo ---------------------------------------------------------------*/ static inline 
__ATTRS_o_ai __vector signed short @@ -8808,9 +10431,35 @@ vec_mulo(__vector unsigned int __a, __vector unsigned int __b) { return __builtin_s390_vmlof(__a, __b); } +#if __ARCH__ >= 15 +static inline __ATTRS_o_ai __vector signed __int128 +vec_mulo(__vector signed long long __a, __vector signed long long __b) { + return (__vector signed __int128)__builtin_s390_vmog(__a, __b); +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_mulo(__vector unsigned long long __a, __vector unsigned long long __b) { + return (__vector unsigned __int128)__builtin_s390_vmlog(__a, __b); +} +#endif + +/*-- vec_msum ---------------------------------------------------------------*/ + +#if __ARCH__ >= 12 +extern __ATTRS_o __vector unsigned __int128 +vec_msum(__vector unsigned long long __a, __vector unsigned long long __b, + __vector unsigned __int128 __c, int __d) + __constant_range(__d, 0, 15); + +#define vec_msum(X, Y, Z, W) \ + ((__typeof__((vec_msum)((X), (Y), (Z), (W)))) \ + __builtin_s390_vmslg((X), (Y), (unsigned __int128)(Z), (W))) +#endif + /*-- vec_msum_u128 ----------------------------------------------------------*/ #if __ARCH__ >= 12 +// This prototype is deprecated. extern __ATTRS_o __vector unsigned char vec_msum_u128(__vector unsigned long long __a, __vector unsigned long long __b, __vector unsigned char __c, int __d) @@ -8818,16 +10467,16 @@ vec_msum_u128(__vector unsigned long long __a, __vector unsigned long long __b, #define vec_msum_u128(X, Y, Z, W) \ ((__typeof__((vec_msum_u128)((X), (Y), (Z), (W)))) \ - (unsigned __int128 __attribute__((__vector_size__(16)))) \ + (__vector unsigned __int128) \ __builtin_s390_vmslg((X), (Y), (unsigned __int128)(Z), (W))) #endif /*-- vec_sub_u128 -----------------------------------------------------------*/ +// This prototype is deprecated. static inline __ATTRS_ai __vector unsigned char vec_sub_u128(__vector unsigned char __a, __vector unsigned char __b) { - return (__vector unsigned char) - (unsigned __int128 __attribute__((__vector_size__(16)))) + return (__vector unsigned char)(__vector unsigned __int128) ((__int128)__a - (__int128)__b); } @@ -8853,33 +10502,59 @@ vec_subc(__vector unsigned long long __a, __vector unsigned long long __b) { return __builtin_s390_vscbig(__a, __b); } +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_subc(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { + return (__vector unsigned __int128) + __builtin_s390_vscbiq((unsigned __int128)__a, (unsigned __int128)__b); +} + /*-- vec_subc_u128 ----------------------------------------------------------*/ +// This prototype is deprecated. static inline __ATTRS_ai __vector unsigned char vec_subc_u128(__vector unsigned char __a, __vector unsigned char __b) { - return (__vector unsigned char) - (unsigned __int128 __attribute__((__vector_size__(16)))) + return (__vector unsigned char)(__vector unsigned __int128) __builtin_s390_vscbiq((unsigned __int128)__a, (unsigned __int128)__b); } +/*-- vec_sube ---------------------------------------------------------------*/ + +static inline __ATTRS_ai __vector unsigned __int128 +vec_sube(__vector unsigned __int128 __a, __vector unsigned __int128 __b, + __vector unsigned __int128 __c) { + return (__vector unsigned __int128) + __builtin_s390_vsbiq((unsigned __int128)__a, (unsigned __int128)__b, + (unsigned __int128)__c); +} + /*-- vec_sube_u128 ----------------------------------------------------------*/ +// This prototype is deprecated. 
static inline __ATTRS_ai __vector unsigned char vec_sube_u128(__vector unsigned char __a, __vector unsigned char __b, __vector unsigned char __c) { - return (__vector unsigned char) - (unsigned __int128 __attribute__((__vector_size__(16)))) + return (__vector unsigned char)(__vector unsigned __int128) __builtin_s390_vsbiq((unsigned __int128)__a, (unsigned __int128)__b, (unsigned __int128)__c); } +/*-- vec_subec --------------------------------------------------------------*/ + +static inline __ATTRS_ai __vector unsigned __int128 +vec_subec(__vector unsigned __int128 __a, __vector unsigned __int128 __b, + __vector unsigned __int128 __c) { + return (__vector unsigned __int128) + __builtin_s390_vsbcbiq((unsigned __int128)__a, (unsigned __int128)__b, + (unsigned __int128)__c); +} + /*-- vec_subec_u128 ---------------------------------------------------------*/ +// This prototype is deprecated. static inline __ATTRS_ai __vector unsigned char vec_subec_u128(__vector unsigned char __a, __vector unsigned char __b, __vector unsigned char __c) { - return (__vector unsigned char) - (unsigned __int128 __attribute__((__vector_size__(16)))) + return (__vector unsigned char)(__vector unsigned __int128) __builtin_s390_vsbcbiq((unsigned __int128)__a, (unsigned __int128)__b, (unsigned __int128)__c); } @@ -8896,19 +10571,31 @@ vec_sum2(__vector unsigned int __a, __vector unsigned int __b) { return __builtin_s390_vsumgf(__a, __b); } +/*-- vec_sum ----------------------------------------------------------------*/ + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_sum(__vector unsigned int __a, __vector unsigned int __b) { + return (__vector unsigned __int128)__builtin_s390_vsumqf(__a, __b); +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_sum(__vector unsigned long long __a, __vector unsigned long long __b) { + return (__vector unsigned __int128)__builtin_s390_vsumqg(__a, __b); +} + /*-- vec_sum_u128 -----------------------------------------------------------*/ +// This prototype is deprecated. static inline __ATTRS_o_ai __vector unsigned char vec_sum_u128(__vector unsigned int __a, __vector unsigned int __b) { - return (__vector unsigned char) - (unsigned __int128 __attribute__((__vector_size__(16)))) + return (__vector unsigned char)(__vector unsigned __int128) __builtin_s390_vsumqf(__a, __b); } +// This prototype is deprecated. 
static inline __ATTRS_o_ai __vector unsigned char vec_sum_u128(__vector unsigned long long __a, __vector unsigned long long __b) { - return (__vector unsigned char) - (unsigned __int128 __attribute__((__vector_size__(16)))) + return (__vector unsigned char)(__vector unsigned __int128) __builtin_s390_vsumqg(__a, __b); } @@ -8974,6 +10661,19 @@ vec_test_mask(__vector unsigned long long __a, (__vector unsigned char)__b); } +static inline __ATTRS_o_ai int +vec_test_mask(__vector signed __int128 __a, __vector unsigned __int128 __b) { + return __builtin_s390_vtm((__vector unsigned char)__a, + (__vector unsigned char)__b); +} + +static inline __ATTRS_o_ai int +vec_test_mask(__vector unsigned __int128 __a, + __vector unsigned __int128 __b) { + return __builtin_s390_vtm((__vector unsigned char)__a, + (__vector unsigned char)__b); +} + #if __ARCH__ >= 12 static inline __ATTRS_o_ai int vec_test_mask(__vector float __a, __vector unsigned int __b) { diff --git clang/lib/Sema/CMakeLists.txt clang/lib/Sema/CMakeLists.txt index a656b5062391..19cf3a2db00f 100644 --- clang/lib/Sema/CMakeLists.txt +++ clang/lib/Sema/CMakeLists.txt @@ -19,6 +19,7 @@ add_clang_library(clangSema CodeCompleteConsumer.cpp DeclSpec.cpp DelayedDiagnostic.cpp + HeuristicResolver.cpp HLSLExternalSemaSource.cpp IdentifierResolver.cpp JumpDiagnostics.cpp diff --git clang/lib/Sema/DeclSpec.cpp clang/lib/Sema/DeclSpec.cpp index 47644680b720..95e14ca0fa3b 100644 --- clang/lib/Sema/DeclSpec.cpp +++ clang/lib/Sema/DeclSpec.cpp @@ -1201,9 +1201,10 @@ void DeclSpec::Finish(Sema &S, const PrintingPolicy &Policy) { !S.getLangOpts().ZVector) S.Diag(TSWRange.getBegin(), diag::err_invalid_vector_long_long_decl_spec); - // No vector __int128 prior to Power8. + // No vector __int128 prior to Power8 (or ZVector). if ((TypeSpecType == TST_int128) && - !S.Context.getTargetInfo().hasFeature("power8-vector")) + !S.Context.getTargetInfo().hasFeature("power8-vector") && + !S.getLangOpts().ZVector) S.Diag(TSTLoc, diag::err_invalid_vector_int128_decl_spec); // Complex vector types are not supported. @@ -1225,9 +1226,10 @@ void DeclSpec::Finish(Sema &S, const PrintingPolicy &Policy) { << (TypeAltiVecPixel ? "__pixel" : getSpecifierName((TST)TypeSpecType, Policy)); } - // vector bool __int128 requires Power10. + // vector bool __int128 requires Power10 (or ZVector). if ((TypeSpecType == TST_int128) && - (!S.Context.getTargetInfo().hasFeature("power10-vector"))) + (!S.Context.getTargetInfo().hasFeature("power10-vector") && + !S.getLangOpts().ZVector)) S.Diag(TSTLoc, diag::err_invalid_vector_bool_int128_decl_spec); // Only 'short' and 'long long' are valid with vector bool. 
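The DeclSpec change above is what allows `vector unsigned __int128` to be written directly under the ZVector language extension rather than only with the Power8/Power10 vector features. Combined with the vec_addc/vec_adde overloads added earlier in this patch, a 256-bit addition becomes a short carry chain; a sketch (add256 and its parameter names are illustrative, not from the patch):

    #include <vecintrin.h>

    typedef __vector unsigned __int128 u128v;

    // 256-bit add from two 128-bit halves per operand.
    static void add256(u128v alo, u128v ahi, u128v blo, u128v bhi,
                       u128v *rlo, u128v *rhi) {
      u128v carry = vec_addc(alo, blo); // carry out of the low halves
      *rlo = alo + blo;                 // low 128 bits of the sum
      *rhi = vec_adde(ahi, bhi, carry); // high halves plus carry-in
    }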
diff --git clang-tools-extra/clangd/HeuristicResolver.cpp clang/lib/Sema/HeuristicResolver.cpp similarity index 88% rename from clang-tools-extra/clangd/HeuristicResolver.cpp rename to clang/lib/Sema/HeuristicResolver.cpp index 9eb892e8e4a8..2a726fe51d35 100644 --- clang-tools-extra/clangd/HeuristicResolver.cpp +++ clang/lib/Sema/HeuristicResolver.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "HeuristicResolver.h" +#include "clang/Sema/HeuristicResolver.h" #include "clang/AST/ASTContext.h" #include "clang/AST/CXXInheritance.h" #include "clang/AST/DeclTemplate.h" @@ -14,7 +14,6 @@ #include "clang/AST/Type.h" namespace clang { -namespace clangd { namespace { @@ -43,8 +42,8 @@ public: resolveDependentNameType(const DependentNameType *DNT); std::vector<const NamedDecl *> resolveTemplateSpecializationType( const DependentTemplateSpecializationType *DTST); - const Type *resolveNestedNameSpecifierToType(const NestedNameSpecifier *NNS); - const Type *getPointeeType(const Type *T); + QualType resolveNestedNameSpecifierToType(const NestedNameSpecifier *NNS); + QualType getPointeeType(QualType T); private: ASTContext &Ctx; @@ -62,12 +61,12 @@ private: // This heuristic will give the desired answer in many cases, e.g. // for a call to vector<T>::size(). std::vector<const NamedDecl *> - resolveDependentMember(const Type *T, DeclarationName Name, + resolveDependentMember(QualType T, DeclarationName Name, llvm::function_ref<bool(const NamedDecl *ND)> Filter); // Try to heuristically resolve the type of a possibly-dependent expression // `E`. - const Type *resolveExprToType(const Expr *E); + QualType resolveExprToType(const Expr *E); std::vector<const NamedDecl *> resolveExprToDecls(const Expr *E); // Helper function for HeuristicResolver::resolveDependentMember() @@ -105,17 +104,17 @@ const auto TemplateFilter = [](const NamedDecl *D) { return isa<TemplateDecl>(D); }; -const Type *resolveDeclsToType(const std::vector<const NamedDecl *> &Decls, - ASTContext &Ctx) { +QualType resolveDeclsToType(const std::vector<const NamedDecl *> &Decls, + ASTContext &Ctx) { if (Decls.size() != 1) // Names an overload set -- just bail. - return nullptr; + return QualType(); if (const auto *TD = dyn_cast<TypeDecl>(Decls[0])) { - return Ctx.getTypeDeclType(TD).getTypePtr(); + return Ctx.getTypeDeclType(TD); } if (const auto *VD = dyn_cast<ValueDecl>(Decls[0])) { - return VD->getType().getTypePtrOrNull(); + return VD->getType(); } - return nullptr; + return QualType(); } TemplateName getReferencedTemplateName(const Type *T) { @@ -138,7 +137,8 @@ CXXRecordDecl *HeuristicResolverImpl::resolveTypeToRecordDecl(const Type *T) { T = T->getCanonicalTypeInternal().getTypePtr(); if (const auto *DNT = T->getAs<DependentNameType>()) { - T = resolveDeclsToType(resolveDependentNameType(DNT), Ctx); + T = resolveDeclsToType(resolveDependentNameType(DNT), Ctx) + .getTypePtrOrNull(); if (!T) return nullptr; T = T->getCanonicalTypeInternal().getTypePtr(); @@ -164,12 +164,12 @@ CXXRecordDecl *HeuristicResolverImpl::resolveTypeToRecordDecl(const Type *T) { return TD->getTemplatedDecl(); } -const Type *HeuristicResolverImpl::getPointeeType(const Type *T) { - if (!T) - return nullptr; +QualType HeuristicResolverImpl::getPointeeType(QualType T) { + if (T.isNull()) + return QualType(); if (T->isPointerType()) - return T->castAs<PointerType>()->getPointeeType().getTypePtrOrNull(); + return T->castAs<PointerType>()->getPointeeType(); // Try to handle smart pointer types.
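The code that follows implements the comment above: when a dependent SmartPtr<X, ...> specialization declares an operator->, the first template argument is assumed to be the pointee. Illustrative user code that this heuristic lets the resolver handle (not part of the patch):

    // A dependent smart pointer: operator-> is declared on the primary
    // template, so the resolver guesses T as the pointee type.
    template <typename T> struct SmartPtr {
      T *operator->();
    };

    template <typename T> void use(SmartPtr<T> p) {
      p->size(); // 'size' is heuristically looked up inside T
    }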
@@ -178,7 +178,7 @@ const Type *HeuristicResolverImpl::getPointeeType(const Type *T) { auto ArrowOps = resolveDependentMember( T, Ctx.DeclarationNames.getCXXOperatorName(OO_Arrow), NonStaticFilter); if (ArrowOps.empty()) - return nullptr; + return QualType(); // Getting the return type of the found operator-> method decl isn't useful, // because we discarded template arguments to perform lookup in the primary @@ -188,13 +188,13 @@ const Type *HeuristicResolverImpl::getPointeeType(const Type *T) { // form of SmartPtr<X, ...>, and assume X is the pointee type. auto *TST = T->getAs<TemplateSpecializationType>(); if (!TST) - return nullptr; + return QualType(); if (TST->template_arguments().size() == 0) - return nullptr; + return QualType(); const TemplateArgument &FirstArg = TST->template_arguments()[0]; if (FirstArg.getKind() != TemplateArgument::Type) - return nullptr; - return FirstArg.getAsType().getTypePtrOrNull(); + return QualType(); + return FirstArg.getAsType(); } std::vector<const NamedDecl *> HeuristicResolverImpl::resolveMemberExpr( @@ -211,7 +211,8 @@ std::vector<const NamedDecl *> HeuristicResolverImpl::resolveMemberExpr( // with `this` as the base expression as `X` as the qualifier // (which could be valid if `X` names a base class after instantiation). if (NestedNameSpecifier *NNS = ME->getQualifier()) { - if (const Type *QualifierType = resolveNestedNameSpecifierToType(NNS)) { + if (QualType QualifierType = resolveNestedNameSpecifierToType(NNS); + !QualifierType.isNull()) { auto Decls = resolveDependentMember(QualifierType, ME->getMember(), NoFilter); if (!Decls.empty()) @@ -226,11 +227,11 @@ std::vector<const NamedDecl *> HeuristicResolverImpl::resolveMemberExpr( } // Try resolving the member inside the expression's base type. - const Type *BaseType = ME->getBaseType().getTypePtrOrNull(); + QualType BaseType = ME->getBaseType(); if (ME->isArrow()) { BaseType = getPointeeType(BaseType); } - if (!BaseType) + if (BaseType.isNull()) return {}; if (const auto *BT = BaseType->getAs<BuiltinType>()) { // If BaseType is the type of a dependent expression, it's just @@ -246,17 +247,17 @@ std::vector<const NamedDecl *> HeuristicResolverImpl::resolveMemberExpr( std::vector<const NamedDecl *> HeuristicResolverImpl::resolveDeclRefExpr(const DependentScopeDeclRefExpr *RE) { - return resolveDependentMember(RE->getQualifier()->getAsType(), + return resolveDependentMember(QualType(RE->getQualifier()->getAsType(), 0), RE->getDeclName(), StaticFilter); } std::vector<const NamedDecl *> HeuristicResolverImpl::resolveTypeOfCallExpr(const CallExpr *CE) { - const auto *CalleeType = resolveExprToType(CE->getCallee()); - if (!CalleeType) + QualType CalleeType = resolveExprToType(CE->getCallee()); + if (CalleeType.isNull()) return {}; if (const auto *FnTypePtr = CalleeType->getAs<PointerType>()) - CalleeType = FnTypePtr->getPointeeType().getTypePtr(); + CalleeType = FnTypePtr->getPointeeType(); if (const FunctionType *FnType = CalleeType->getAs<FunctionType>()) { if (const auto *D = resolveTypeToRecordDecl(FnType->getReturnType().getTypePtr())) { @@ -277,7 +278,7 @@ HeuristicResolverImpl::resolveCalleeOfCallExpr(const CallExpr *CE) { std::vector<const NamedDecl *> HeuristicResolverImpl::resolveUsingValueDecl( const UnresolvedUsingValueDecl *UUVD) { - return resolveDependentMember(UUVD->getQualifier()->getAsType(), + return resolveDependentMember(QualType(UUVD->getQualifier()->getAsType(), 0), UUVD->getNameInfo().getName(), ValueFilter); } @@ -318,18 +319,18 @@ 
HeuristicResolverImpl::resolveExprToDecls(const Expr *E) { return {}; } -const Type *HeuristicResolverImpl::resolveExprToType(const Expr *E) { +QualType HeuristicResolverImpl::resolveExprToType(const Expr *E) { std::vector<const NamedDecl *> Decls = resolveExprToDecls(E); if (!Decls.empty()) return resolveDeclsToType(Decls, Ctx); - return E->getType().getTypePtr(); + return E->getType(); } -const Type *HeuristicResolverImpl::resolveNestedNameSpecifierToType( +QualType HeuristicResolverImpl::resolveNestedNameSpecifierToType( const NestedNameSpecifier *NNS) { if (!NNS) - return nullptr; + return QualType(); // The purpose of this function is to handle the dependent (Kind == // Identifier) case, but we need to recurse on the prefix because @@ -338,7 +339,7 @@ const Type *HeuristicResolverImpl::resolveNestedNameSpecifierToType( switch (NNS->getKind()) { case NestedNameSpecifier::TypeSpec: case NestedNameSpecifier::TypeSpecWithTemplate: - return NNS->getAsType(); + return QualType(NNS->getAsType(), 0); case NestedNameSpecifier::Identifier: { return resolveDeclsToType( resolveDependentMember( @@ -349,7 +350,7 @@ const Type *HeuristicResolverImpl::resolveNestedNameSpecifierToType( default: break; } - return nullptr; + return QualType(); } bool isOrdinaryMember(const NamedDecl *ND) { @@ -411,8 +412,9 @@ std::vector<const NamedDecl *> HeuristicResolverImpl::lookupDependentName( } std::vector<const NamedDecl *> HeuristicResolverImpl::resolveDependentMember( - const Type *T, DeclarationName Name, + QualType QT, DeclarationName Name, llvm::function_ref<bool(const NamedDecl *ND)> Filter) { + const Type *T = QT.getTypePtrOrNull(); if (!T) return {}; if (auto *ET = T->getAs<EnumType>()) { @@ -423,7 +425,15 @@ std::vector<const NamedDecl *> HeuristicResolverImpl::resolveDependentMember( if (!RD->hasDefinition()) return {}; RD = RD->getDefinition(); - return lookupDependentName(RD, Name, Filter); + return lookupDependentName(RD, Name, [&](const NamedDecl *ND) { + if (!Filter(ND)) + return false; + if (const auto *MD = dyn_cast<CXXMethodDecl>(ND)) { + return MD->getMethodQualifiers().compatiblyIncludes(QT.getQualifiers(), + Ctx); + } + return true; + }); } return {}; } @@ -458,13 +468,12 @@ HeuristicResolver::resolveTemplateSpecializationType( const DependentTemplateSpecializationType *DTST) const { return HeuristicResolverImpl(Ctx).resolveTemplateSpecializationType(DTST); } -const Type *HeuristicResolver::resolveNestedNameSpecifierToType( +QualType HeuristicResolver::resolveNestedNameSpecifierToType( const NestedNameSpecifier *NNS) const { return HeuristicResolverImpl(Ctx).resolveNestedNameSpecifierToType(NNS); } -const Type *HeuristicResolver::getPointeeType(const Type *T) const { +const QualType HeuristicResolver::getPointeeType(QualType T) const { return HeuristicResolverImpl(Ctx).getPointeeType(T); } -} // namespace clangd } // namespace clang diff --git clang/lib/Sema/SemaCast.cpp clang/lib/Sema/SemaCast.cpp index f98857f852b5..54bc52fa2ac4 100644 --- clang/lib/Sema/SemaCast.cpp +++ clang/lib/Sema/SemaCast.cpp @@ -2094,6 +2094,10 @@ void Sema::CheckCompatibleReinterpretCast(QualType SrcType, QualType DestType, } } + if (SrcTy->isDependentType() || DestTy->isDependentType()) { + return; + } + Diag(Range.getBegin(), DiagID) << SrcType << DestType << Range; } diff --git clang/lib/Sema/SemaChecking.cpp clang/lib/Sema/SemaChecking.cpp index 881907ac311a..c41164a2f1af 100644 --- clang/lib/Sema/SemaChecking.cpp +++ clang/lib/Sema/SemaChecking.cpp @@ -8454,26 +8454,43 @@ static bool 
IsInfOrNanFunction(StringRef calleeName, MathCheck Check) { llvm_unreachable("unknown MathCheck"); } +static bool IsInfinityFunction(const FunctionDecl *FDecl) { + if (FDecl->getName() != "infinity") + return false; + + if (const CXXMethodDecl *MDecl = dyn_cast<CXXMethodDecl>(FDecl)) { + const CXXRecordDecl *RDecl = MDecl->getParent(); + if (RDecl->getName() != "numeric_limits") + return false; + + if (const NamespaceDecl *NSDecl = + dyn_cast<NamespaceDecl>(RDecl->getDeclContext())) + return NSDecl->isStdNamespace(); + } + + return false; +} + void Sema::CheckInfNaNFunction(const CallExpr *Call, const FunctionDecl *FDecl) { + if (!FDecl->getIdentifier()) + return; + FPOptions FPO = Call->getFPFeaturesInEffect(getLangOpts()); - bool HasIdentifier = FDecl->getIdentifier() != nullptr; - bool IsNaNOrIsUnordered = - IsStdFunction(FDecl, "isnan") || IsStdFunction(FDecl, "isunordered"); - bool IsSpecialNaN = - HasIdentifier && IsInfOrNanFunction(FDecl->getName(), MathCheck::NaN); - if ((IsNaNOrIsUnordered || IsSpecialNaN) && FPO.getNoHonorNaNs()) { + if (FPO.getNoHonorNaNs() && + (IsStdFunction(FDecl, "isnan") || IsStdFunction(FDecl, "isunordered") || + IsInfOrNanFunction(FDecl->getName(), MathCheck::NaN))) { Diag(Call->getBeginLoc(), diag::warn_fp_nan_inf_when_disabled) << 1 << 0 << Call->getSourceRange(); - } else { - bool IsInfOrIsFinite = - IsStdFunction(FDecl, "isinf") || IsStdFunction(FDecl, "isfinite"); - bool IsInfinityOrIsSpecialInf = - HasIdentifier && ((FDecl->getName() == "infinity") || - IsInfOrNanFunction(FDecl->getName(), MathCheck::Inf)); - if ((IsInfOrIsFinite || IsInfinityOrIsSpecialInf) && FPO.getNoHonorInfs()) - Diag(Call->getBeginLoc(), diag::warn_fp_nan_inf_when_disabled) - << 0 << 0 << Call->getSourceRange(); + return; + } + + if (FPO.getNoHonorInfs() && + (IsStdFunction(FDecl, "isinf") || IsStdFunction(FDecl, "isfinite") || + IsInfinityFunction(FDecl) || + IsInfOrNanFunction(FDecl->getName(), MathCheck::Inf))) { + Diag(Call->getBeginLoc(), diag::warn_fp_nan_inf_when_disabled) + << 0 << 0 << Call->getSourceRange(); } } diff --git clang/lib/Sema/SemaConcept.cpp clang/lib/Sema/SemaConcept.cpp index 539de00bd104..6a40a59c977d 100644 --- clang/lib/Sema/SemaConcept.cpp +++ clang/lib/Sema/SemaConcept.cpp @@ -752,6 +752,9 @@ bool Sema::SetupConstraintScope( FunctionDecl *FD, std::optional<ArrayRef<TemplateArgument>> TemplateArgs, const MultiLevelTemplateArgumentList &MLTAL, LocalInstantiationScope &Scope) { + assert(!isLambdaCallOperator(FD) && + "Use LambdaScopeForCallOperatorInstantiationRAII to handle lambda " + "instantiations"); if (FD->isTemplateInstantiation() && FD->getPrimaryTemplate()) { FunctionTemplateDecl *PrimaryTemplate = FD->getPrimaryTemplate(); InstantiatingTemplate Inst( @@ -777,14 +780,8 @@ bool Sema::SetupConstraintScope( // If this is a member function, make sure we get the parameters that // reference the original primary template. - // We walk up the instantiated template chain so that nested lambdas get - // handled properly. - // We should only collect instantiated parameters from the primary template. - // Otherwise, we may have mismatched template parameter depth! 
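Stepping back to the SemaChecking hunk above: the new IsInfinityFunction helper lets CheckInfNaNFunction flag calls to std::numeric_limits<T>::infinity() itself, not only functions whose names resemble isinf. A sketch of user code the check now diagnoses (assuming a mode such as -ffast-math, which disables honoring infinities):

    #include <limits>

    // With infinities disabled, this call can never yield the intended
    // value, so Clang now emits warn_fp_nan_inf_when_disabled here.
    float lowest_start() {
      return -std::numeric_limits<float>::infinity();
    }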
if (FunctionTemplateDecl *FromMemTempl = PrimaryTemplate->getInstantiatedFromMemberTemplate()) { - while (FromMemTempl->getInstantiatedFromMemberTemplate()) - FromMemTempl = FromMemTempl->getInstantiatedFromMemberTemplate(); if (addInstantiatedParametersToScope(FD, FromMemTempl->getTemplatedDecl(), Scope, MLTAL)) return true; @@ -834,6 +831,9 @@ Sema::SetupConstraintCheckingTemplateArgumentsAndScope( /*RelativeToPrimary=*/true, /*Pattern=*/nullptr, /*ForConstraintInstantiation=*/true); + // Lambdas are handled by LambdaScopeForCallOperatorInstantiationRAII. + if (isLambdaCallOperator(FD)) + return MLTAL; if (SetupConstraintScope(FD, TemplateArgs, MLTAL, Scope)) return std::nullopt; diff --git clang/lib/Sema/SemaHLSL.cpp clang/lib/Sema/SemaHLSL.cpp index 238e19651dc6..5001883003ee 100644 --- clang/lib/Sema/SemaHLSL.cpp +++ clang/lib/Sema/SemaHLSL.cpp @@ -1688,13 +1688,21 @@ static bool CheckVectorElementCallArgs(Sema *S, CallExpr *TheCall) { auto *VecTyA = ArgTyA->getAs<VectorType>(); SourceLocation BuiltinLoc = TheCall->getBeginLoc(); + bool AllBArgAreVectors = true; for (unsigned i = 1; i < TheCall->getNumArgs(); ++i) { ExprResult B = TheCall->getArg(i); QualType ArgTyB = B.get()->getType(); auto *VecTyB = ArgTyB->getAs<VectorType>(); - if (VecTyA == nullptr && VecTyB == nullptr) - return false; - + if (VecTyB == nullptr) + AllBArgAreVectors &= false; + if (VecTyA && VecTyB == nullptr) { + // Note: if we get here 'B' is scalar which + // requires a VectorSplat on ArgN + S->Diag(BuiltinLoc, diag::err_vec_builtin_non_vector) + << TheCall->getDirectCallee() << /*useAllTerminology*/ true + << SourceRange(A.get()->getBeginLoc(), B.get()->getEndLoc()); + return true; + } if (VecTyA && VecTyB) { bool retValue = false; if (VecTyA->getElementType() != VecTyB->getElementType()) { @@ -1712,21 +1720,23 @@ static bool CheckVectorElementCallArgs(Sema *S, CallExpr *TheCall) { // HLSLVectorTruncation. 
S->Diag(BuiltinLoc, diag::err_vec_builtin_incompatible_vector) << TheCall->getDirectCallee() << /*useAllTerminology*/ true - << SourceRange(TheCall->getArg(0)->getBeginLoc(), - TheCall->getArg(1)->getEndLoc()); + << SourceRange(A.get()->getBeginLoc(), B.get()->getEndLoc()); retValue = true; } - return retValue; + if (retValue) + return retValue; } } - // Note: if we get here one of the args is a scalar which - // requires a VectorSplat on Arg0 or Arg1 - S->Diag(BuiltinLoc, diag::err_vec_builtin_non_vector) - << TheCall->getDirectCallee() << /*useAllTerminology*/ true - << SourceRange(TheCall->getArg(0)->getBeginLoc(), - TheCall->getArg(1)->getEndLoc()); - return true; + if (VecTyA == nullptr && AllBArgAreVectors) { + // Note: if we get here 'A' is a scalar which + // requires a VectorSplat on Arg0 + S->Diag(BuiltinLoc, diag::err_vec_builtin_non_vector) + << TheCall->getDirectCallee() << /*useAllTerminology*/ true + << SourceRange(A.get()->getBeginLoc(), A.get()->getEndLoc()); + return true; + } + return false; } static bool CheckArgTypeMatches(Sema *S, Expr *Arg, QualType ExpectedType) { diff --git clang/lib/Sema/SemaLambda.cpp clang/lib/Sema/SemaLambda.cpp index 0c5467cfd54a..87b3ca53cefa 100644 --- clang/lib/Sema/SemaLambda.cpp +++ clang/lib/Sema/SemaLambda.cpp @@ -2408,35 +2408,31 @@ Sema::LambdaScopeForCallOperatorInstantiationRAII:: if (!ShouldAddDeclsFromParentScope) return; - FunctionDecl *InnermostFD = FD, *InnermostFDPattern = FDPattern; llvm::SmallVector<std::pair<FunctionDecl *, FunctionDecl *>, 4> - ParentInstantiations; - while (true) { + InstantiationAndPatterns; + while (FDPattern && FD) { + InstantiationAndPatterns.emplace_back(FDPattern, FD); + FDPattern = dyn_cast<FunctionDecl>(getLambdaAwareParentOfDeclContext(FDPattern)); FD = dyn_cast<FunctionDecl>(getLambdaAwareParentOfDeclContext(FD)); - - if (!FDPattern || !FD) - break; - - ParentInstantiations.emplace_back(FDPattern, FD); } // Add instantiated parameters and local vars to scopes, starting from the // outermost lambda to the innermost lambda. This ordering ensures that - // parameters in inner lambdas can correctly depend on those defined - // in outer lambdas, e.g. auto L = [](auto... x) { - // return [](decltype(x)... y) { }; // `y` depends on `x` - // }; + // the outer instantiations can be found when referenced from within inner + // lambdas. + // + // auto L = [](auto... x) { + // return [](decltype(x)... 
y) { }; // Instantiating y needs x + // }; + // - for (const auto &[FDPattern, FD] : llvm::reverse(ParentInstantiations)) { + for (auto [FDPattern, FD] : llvm::reverse(InstantiationAndPatterns)) { SemaRef.addInstantiatedParametersToScope(FD, FDPattern, Scope, MLTAL); SemaRef.addInstantiatedLocalVarsToScope(FD, FDPattern, Scope); if (isLambdaCallOperator(FD)) SemaRef.addInstantiatedCapturesToScope(FD, FDPattern, Scope, MLTAL); } - - SemaRef.addInstantiatedCapturesToScope(InnermostFD, InnermostFDPattern, Scope, - MLTAL); } diff --git clang/lib/Sema/SemaOpenACCClause.cpp clang/lib/Sema/SemaOpenACCClause.cpp index 27da14de4c04..000934225402 100644 --- clang/lib/Sema/SemaOpenACCClause.cpp +++ clang/lib/Sema/SemaOpenACCClause.cpp @@ -1360,7 +1360,6 @@ ExprResult CheckGangKernelsExpr(SemaOpenACC &S, } case OpenACCGangKind::Static: return CheckGangStaticExpr(S, E); - return ExprError(); } llvm_unreachable("Unknown gang kind in gang kernels check"); } diff --git clang/lib/Sema/SemaOverload.cpp clang/lib/Sema/SemaOverload.cpp index 34c287926b1d..7e8811b5274e 100644 --- clang/lib/Sema/SemaOverload.cpp +++ clang/lib/Sema/SemaOverload.cpp @@ -6977,7 +6977,7 @@ void Sema::AddOverloadCandidate( Candidate.Viable = true; Candidate.RewriteKind = CandidateSet.getRewriteInfo().getRewriteKind(Function, PO); - Candidate.IsADLCandidate = IsADLCandidate; + Candidate.IsADLCandidate = llvm::to_underlying(IsADLCandidate); Candidate.ExplicitCallArguments = Args.size(); // Explicit functions are not actually candidates at all if we're not @@ -7832,7 +7832,7 @@ void Sema::AddTemplateOverloadCandidate( Candidate.RewriteKind = CandidateSet.getRewriteInfo().getRewriteKind(Candidate.Function, PO); Candidate.IsSurrogate = false; - Candidate.IsADLCandidate = IsADLCandidate; + Candidate.IsADLCandidate = llvm::to_underlying(IsADLCandidate); // Ignore the object argument if there is one, since we don't have an object // type. Candidate.IgnoreObjectArgument = @@ -14082,7 +14082,8 @@ static ExprResult FinishOverloadedCallExpr(Sema &SemaRef, Scope *S, Expr *Fn, return ExprError(); return SemaRef.BuildResolvedCallExpr( Res.get(), FDecl, LParenLoc, Args, RParenLoc, ExecConfig, - /*IsExecConfig=*/false, (*Best)->IsADLCandidate); + /*IsExecConfig=*/false, + static_cast<CallExpr::ADLCallKind>((*Best)->IsADLCandidate)); } case OR_No_Viable_Function: { @@ -14156,7 +14157,8 @@ static ExprResult FinishOverloadedCallExpr(Sema &SemaRef, Scope *S, Expr *Fn, return ExprError(); return SemaRef.BuildResolvedCallExpr( Res.get(), FDecl, LParenLoc, Args, RParenLoc, ExecConfig, - /*IsExecConfig=*/false, (*Best)->IsADLCandidate); + /*IsExecConfig=*/false, + static_cast<CallExpr::ADLCallKind>((*Best)->IsADLCandidate)); } } @@ -14438,7 +14440,8 @@ Sema::CreateOverloadedUnaryOp(SourceLocation OpLoc, UnaryOperatorKind Opc, Args[0] = Input; CallExpr *TheCall = CXXOperatorCallExpr::Create( Context, Op, FnExpr.get(), ArgsArray, ResultTy, VK, OpLoc, - CurFPFeatureOverrides(), Best->IsADLCandidate); + CurFPFeatureOverrides(), + static_cast<CallExpr::ADLCallKind>(Best->IsADLCandidate)); if (CheckCallReturnType(FnDecl->getReturnType(), OpLoc, TheCall, FnDecl)) return ExprError(); @@ -14833,7 +14836,8 @@ ExprResult Sema::CreateOverloadedBinOp(SourceLocation OpLoc, // members; CodeGen should take care not to emit the this pointer. 
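The SemaOverload changes above route the scoped ADL enum through llvm::to_underlying when storing into the candidate and through static_cast<CallExpr::ADLCallKind> when reading it back, because the candidate field is a plain integer bit-field. A self-contained sketch of that pattern (the types here are illustrative, not Clang's):

    enum class ADLCallKind : unsigned { NotADL, UsesADL };

    struct Candidate {
      unsigned IsADLCandidate : 1; // stored as an integer bit-field
    };

    // Store: enum -> underlying integer (llvm::to_underlying in the patch,
    // std::to_underlying in C++23, a static_cast in this sketch).
    void setADL(Candidate &C, ADLCallKind K) {
      C.IsADLCandidate = static_cast<unsigned>(K);
    }

    // Load: integer -> enum, as at the BuildResolvedCallExpr call sites.
    ADLCallKind getADL(const Candidate &C) {
      return static_cast<ADLCallKind>(C.IsADLCandidate);
    }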
TheCall = CXXOperatorCallExpr::Create( Context, ChosenOp, FnExpr.get(), Args, ResultTy, VK, OpLoc, - CurFPFeatureOverrides(), Best->IsADLCandidate); + CurFPFeatureOverrides(), + static_cast<CallExpr::ADLCallKind>(Best->IsADLCandidate)); if (const auto *Method = dyn_cast<CXXMethodDecl>(FnDecl); Method && Method->isImplicitObjectMemberFunction()) { diff --git clang/lib/Sema/SemaSystemZ.cpp clang/lib/Sema/SemaSystemZ.cpp index 7e836adbee65..535cb8243d72 100644 --- clang/lib/Sema/SemaSystemZ.cpp +++ clang/lib/Sema/SemaSystemZ.cpp @@ -38,6 +38,7 @@ bool SemaSystemZ::CheckSystemZBuiltinFunctionCall(unsigned BuiltinID, switch (BuiltinID) { default: return false; case SystemZ::BI__builtin_s390_lcbb: i = 1; l = 0; u = 15; break; + case SystemZ::BI__builtin_s390_veval: case SystemZ::BI__builtin_s390_verimb: case SystemZ::BI__builtin_s390_verimh: case SystemZ::BI__builtin_s390_verimf: diff --git clang/lib/Sema/SemaX86.cpp clang/lib/Sema/SemaX86.cpp index fd1a6017712d..7feca138e3e2 100644 --- clang/lib/Sema/SemaX86.cpp +++ clang/lib/Sema/SemaX86.cpp @@ -1045,9 +1045,9 @@ bool SemaX86::CheckBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID, case X86::BI__builtin_ia32_vpshrdw128: case X86::BI__builtin_ia32_vpshrdw256: case X86::BI__builtin_ia32_vpshrdw512: - case X86::BI__builtin_ia32_vminmaxnepbf16128: - case X86::BI__builtin_ia32_vminmaxnepbf16256: - case X86::BI__builtin_ia32_vminmaxnepbf16512: + case X86::BI__builtin_ia32_vminmaxbf16128: + case X86::BI__builtin_ia32_vminmaxbf16256: + case X86::BI__builtin_ia32_vminmaxbf16512: case X86::BI__builtin_ia32_vminmaxpd128_mask: case X86::BI__builtin_ia32_vminmaxpd256_round_mask: case X86::BI__builtin_ia32_vminmaxph128_mask: diff --git clang/lib/Sema/TreeTransform.h clang/lib/Sema/TreeTransform.h index 4fae2ccb5f6d..7dc88a1ae23b 100644 --- clang/lib/Sema/TreeTransform.h +++ clang/lib/Sema/TreeTransform.h @@ -8499,7 +8499,7 @@ TreeTransform<Derived>::TransformDeclStmt(DeclStmt *S) { getSema() .getASTContext() .getTypeDeclType(TD) - .getCanonicalType() + .getSingleStepDesugaredType(getSema().getASTContext()) ->containsUnexpandedParameterPack(); if (auto *VD = dyn_cast<VarDecl>(Transformed)) diff --git clang/test/ARCMT/autoreleases.m clang/test/ARCMT/autoreleases.m index 4c268c09a715..7c046dc227a0 100644 --- clang/test/ARCMT/autoreleases.m +++ clang/test/ARCMT/autoreleases.m @@ -69,7 +69,7 @@ id test2(A* val) { return val; } -id test3(void) { +void test3(void) { id a = [[A alloc] init]; [a autorelease]; } diff --git clang/test/ARCMT/autoreleases.m.result clang/test/ARCMT/autoreleases.m.result index b3aad804a45b..29d00ea60dee 100644 --- clang/test/ARCMT/autoreleases.m.result +++ clang/test/ARCMT/autoreleases.m.result @@ -64,6 +64,6 @@ id test2(A* val) { return val; } -id test3(void) { +void test3(void) { id a = [[A alloc] init]; } diff --git clang/test/ARCMT/retains.m clang/test/ARCMT/retains.m index 43a94fc16cec..a38efe159640 100644 --- clang/test/ARCMT/retains.m +++ clang/test/ARCMT/retains.m @@ -21,7 +21,7 @@ id IhaveSideEffect(void); @synthesize bar; --(id)something {} +-(id)something { return (id)0; } -(id)test:(id)obj { id x = self.bar; diff --git clang/test/ARCMT/retains.m.result clang/test/ARCMT/retains.m.result index 4e720d6bb4c1..cd3bb3848fce 100644 --- clang/test/ARCMT/retains.m.result +++ clang/test/ARCMT/retains.m.result @@ -21,7 +21,7 @@ id IhaveSideEffect(void); @synthesize bar; --(id)something {} +-(id)something { return (id)0; } -(id)test:(id)obj { id x = self.bar; diff --git clang/test/AST/ByteCode/c23.c 
clang/test/AST/ByteCode/c23.c index 5154d57f6cb9..0e9851aa2ad3 100644 --- clang/test/AST/ByteCode/c23.c +++ clang/test/AST/ByteCode/c23.c @@ -49,3 +49,11 @@ static_assert(arg1[1] == 254); static_assert(arg1[2] == 186); static_assert(arg1[3] == 190); #endif + +void ghissue109095() { + constexpr char c[] = { 'a' }; + constexpr int i = c[1]; // both-error {{constexpr variable 'i' must be initialized by a constant expression}}\ + // both-note {{declared here}} + _Static_assert(i == c[0]); // both-error {{static assertion expression is not an integral constant expression}}\ + // both-note {{initializer of 'i' is not a constant expression}} +} diff --git clang/test/AST/ByteCode/constexpr.c clang/test/AST/ByteCode/constexpr.c index fed24fa72b25..af96bf3a06f3 100644 --- clang/test/AST/ByteCode/constexpr.c +++ clang/test/AST/ByteCode/constexpr.c @@ -309,8 +309,8 @@ constexpr const int *V81 = &V80; constexpr int *V82 = 0; constexpr int *V83 = V82; constexpr int *V84 = 42; -// ref-error@-1 {{constexpr variable 'V84' must be initialized by a constant expression}} -// ref-note@-2 {{this conversion is not allowed in a constant expression}} +// both-error@-1 {{constexpr variable 'V84' must be initialized by a constant expression}} +// both-note@-2 {{this conversion is not allowed in a constant expression}} // both-error@-3 {{constexpr pointer initializer is not null}} constexpr int *V85 = nullptr; diff --git clang/test/AST/ByteCode/cxx17.cpp clang/test/AST/ByteCode/cxx17.cpp index e8559d8b9812..ecb8a395520a 100644 --- clang/test/AST/ByteCode/cxx17.cpp +++ clang/test/AST/ByteCode/cxx17.cpp @@ -105,3 +105,23 @@ constexpr S s = getS(); // both-error {{must be initialized by a constant expres // both-note {{declared here}} static_assert(s.a == 12, ""); // both-error {{not an integral constant expression}} \ // both-note {{initializer of 's' is not a constant expression}} + +using size_t = decltype(sizeof(0)); +namespace std { template<typename T> struct tuple_size; } +namespace std { template<size_t, typename> struct tuple_element; } + +namespace constant { + struct Q {}; + template<int N> constexpr int get(Q &&) { return N * N; } +} +template<> struct std::tuple_size<constant::Q> { static const int value = 3; }; +template<int N> struct std::tuple_element<N, constant::Q> { typedef int type; }; + +namespace constant { + Q q; + constexpr bool f() { + auto [a, b, c] = q; + return a == 0 && b == 1 && c == 4; + } + static_assert(f()); +} diff --git clang/test/AST/ByteCode/lifetimes.cpp clang/test/AST/ByteCode/lifetimes.cpp index 9a99485c4a40..43039d0c766e 100644 --- clang/test/AST/ByteCode/lifetimes.cpp +++ clang/test/AST/ByteCode/lifetimes.cpp @@ -7,15 +7,15 @@ struct Foo { int a; }; -constexpr int dead1() { // expected-error {{never produces a constant expression}} +constexpr int dead1() { Foo *F2 = nullptr; { - Foo F{12}; // expected-note 2{{declared here}} + Foo F{12}; // expected-note {{declared here}} F2 = &F; } // Ends lifetime of F. 
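    // Editorial note on the expectation change in this hunk: F's storage
    // duration ends at the brace above, so the read of F2->a below goes
    // through a dangling pointer. The function is no longer rejected up
    // front as one that "never produces a constant expression"; instead,
    // each constant evaluation of dead1() fails at the point of the read,
    // which is why the note counts drop from 2 to 1.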
- return F2->a; // expected-note 2{{read of variable whose lifetime has ended}} \ + return F2->a; // expected-note {{read of variable whose lifetime has ended}} \ // ref-note {{read of object outside its lifetime is not allowed in a constant expression}} } static_assert(dead1() == 1, ""); // both-error {{not an integral constant expression}} \ diff --git clang/test/AST/ByteCode/literals.cpp clang/test/AST/ByteCode/literals.cpp index fdf1a6820e44..b75ca2b19a96 100644 --- clang/test/AST/ByteCode/literals.cpp +++ clang/test/AST/ByteCode/literals.cpp @@ -1315,3 +1315,12 @@ namespace { } } #endif + +void localConstexpr() { + constexpr int a = 1/0; // both-error {{must be initialized by a constant expression}} \ + // both-note {{division by zero}} \ + // both-warning {{division by zero is undefined}} \ + // both-note {{declared here}} + static_assert(a == 0, ""); // both-error {{not an integral constant expression}} \ + // both-note {{initializer of 'a' is not a constant expression}} +} diff --git clang/test/AST/ByteCode/records.cpp clang/test/AST/ByteCode/records.cpp index d329219264d8..9470e7d8e3dc 100644 --- clang/test/AST/ByteCode/records.cpp +++ clang/test/AST/ByteCode/records.cpp @@ -1684,3 +1684,18 @@ namespace ExplicitThisInTemporary { constexpr bool g(B b) { return &b == b.p; } static_assert(g({}), ""); } + +namespace IgnoredMemberExpr { + class A { + public: + int a; + }; + class B : public A { + public: + constexpr int foo() { + a; // both-warning {{expression result unused}} + return 0; + } + }; + static_assert(B{}.foo() == 0, ""); +} diff --git clang/test/AST/ast-dump-cxx2b-deducing-this.cpp clang/test/AST/ast-dump-cxx2b-deducing-this.cpp index 1b385e0fc333..854d12b4cdba 100644 --- clang/test/AST/ast-dump-cxx2b-deducing-this.cpp +++ clang/test/AST/ast-dump-cxx2b-deducing-this.cpp @@ -5,7 +5,7 @@ struct S { int f(this S&); }; -int main() { +void main() { S s; int x = s.f(); // CHECK: CallExpr 0x{{[^ ]*}} <col:11, col:15> 'int diff --git clang/test/AST/ast-dump-special-member-functions.cpp clang/test/AST/ast-dump-special-member-functions.cpp index b98c90f67604..0fe2cee615c8 100644 --- clang/test/AST/ast-dump-special-member-functions.cpp +++ clang/test/AST/ast-dump-special-member-functions.cpp @@ -253,25 +253,25 @@ struct TrivialCopyAssignment { struct NontrivialCopyAssignment { // CHECK: CXXRecordDecl 0x{{[^ ]*}} <line:[[@LINE-1]]:1, line:[[@LINE+3]]:1> line:[[@LINE-1]]:8 struct NontrivialCopyAssignment definition // CHECK: CopyAssignment {{.*}}non_trivial{{.*}} - NontrivialCopyAssignment& operator=(const NontrivialCopyAssignment&) {} + NontrivialCopyAssignment& operator=(const NontrivialCopyAssignment&) { return *this; } }; struct CopyAssignmentHasConstParam { // CHECK: CXXRecordDecl 0x{{[^ ]*}} <line:[[@LINE-1]]:1, line:[[@LINE+3]]:1> line:[[@LINE-1]]:8 struct CopyAssignmentHasConstParam definition // CHECK: CopyAssignment {{.*}}has_const_param{{.*}} - CopyAssignmentHasConstParam& operator=(const CopyAssignmentHasConstParam&) {} + CopyAssignmentHasConstParam& operator=(const CopyAssignmentHasConstParam&) { return *this; } }; struct CopyAssignmentDoesNotHaveConstParam { // CHECK: CXXRecordDecl 0x{{[^ ]*}} <line:[[@LINE-1]]:1, line:[[@LINE+3]]:1> line:[[@LINE-1]]:8 struct CopyAssignmentDoesNotHaveConstParam definition // CHECK-NOT: CopyAssignment {{.*}} has_const_param{{.*}} - CopyAssignmentDoesNotHaveConstParam& operator=(CopyAssignmentDoesNotHaveConstParam&) {} + CopyAssignmentDoesNotHaveConstParam& operator=(CopyAssignmentDoesNotHaveConstParam&) { return *this; } }; struct 
UserDeclaredCopyAssignment { // CHECK: CXXRecordDecl 0x{{[^ ]*}} <line:[[@LINE-1]]:1, line:[[@LINE+3]]:1> line:[[@LINE-1]]:8 struct UserDeclaredCopyAssignment definition // CHECK: CopyAssignment {{.*}}user_declared{{.*}} - UserDeclaredCopyAssignment& operator=(const UserDeclaredCopyAssignment&) {} + UserDeclaredCopyAssignment& operator=(const UserDeclaredCopyAssignment&) { return *this; } }; struct NonUserDeclaredCopyAssignment { @@ -288,7 +288,7 @@ struct NeedsImplicitCopyAssignment { struct DoesNotNeedImplicitCopyAssignment { // CHECK: CXXRecordDecl 0x{{[^ ]*}} <line:[[@LINE-1]]:1, line:[[@LINE+3]]:1> line:[[@LINE-1]]:8 struct DoesNotNeedImplicitCopyAssignment definition // CHECK-NOT: CopyAssignment {{.*}}needs_implicit{{.*}} - DoesNotNeedImplicitCopyAssignment& operator=(const DoesNotNeedImplicitCopyAssignment&) {} + DoesNotNeedImplicitCopyAssignment& operator=(const DoesNotNeedImplicitCopyAssignment&) { return *this; } }; struct DeclaresCopyAssignment { @@ -352,13 +352,13 @@ struct TrivialMoveAssignment { struct NontrivialMoveAssignment { // CHECK: CXXRecordDecl 0x{{[^ ]*}} <line:[[@LINE-1]]:1, line:[[@LINE+3]]:1> line:[[@LINE-1]]:8 struct NontrivialMoveAssignment definition // CHECK: MoveAssignment {{.*}}non_trivial{{.*}} - NontrivialMoveAssignment& operator=(NontrivialMoveAssignment&&) {} + NontrivialMoveAssignment& operator=(NontrivialMoveAssignment&&) { return *this; } }; struct UserDeclaredMoveAssignment { // CHECK: CXXRecordDecl 0x{{[^ ]*}} <line:[[@LINE-1]]:1, line:[[@LINE+3]]:1> line:[[@LINE-1]]:8 struct UserDeclaredMoveAssignment definition // CHECK: MoveAssignment {{.*}}user_declared{{.*}} - UserDeclaredMoveAssignment& operator=(UserDeclaredMoveAssignment&&) {} + UserDeclaredMoveAssignment& operator=(UserDeclaredMoveAssignment&&) { return *this; } }; struct NonUserDeclaredMoveAssignment { @@ -375,7 +375,7 @@ struct NeedsImplicitMoveAssignment { struct DoesNotNeedImplicitMoveAssignment { // CHECK: CXXRecordDecl 0x{{[^ ]*}} <line:[[@LINE-1]]:1, line:[[@LINE+3]]:1> line:[[@LINE-1]]:8 struct DoesNotNeedImplicitMoveAssignment definition // CHECK-NOT: MoveAssignment {{.*}}needs_implicit{{.*}} - DoesNotNeedImplicitMoveAssignment& operator=(DoesNotNeedImplicitMoveAssignment&&) {} + DoesNotNeedImplicitMoveAssignment& operator=(DoesNotNeedImplicitMoveAssignment&&) { return *this; } }; struct MoveAssignmentNeedsOverloadResolution : virtual DeletedDestructor { diff --git clang/test/Analysis/Inputs/expected-plists/plist-output.m.plist clang/test/Analysis/Inputs/expected-plists/plist-output.m.plist index 32244329c434..8b8cc3239bd4 100644 --- clang/test/Analysis/Inputs/expected-plists/plist-output.m.plist +++ clang/test/Analysis/Inputs/expected-plists/plist-output.m.plist @@ -6151,7 +6151,7 @@ <key>type</key><string>Argument with 'nonnull' attribute passed null</string> <key>check_name</key><string>core.NonNullParamChecker</string> <!-- This hash is experimental and going to change! 
--> - <key>issue_hash_content_of_line_in_context</key><string>c0b359a043c633f1b8d1581f68743361</string> + <key>issue_hash_content_of_line_in_context</key><string>4c580a2a9cf15947fa485a0a9e625306</string> <key>issue_context_kind</key><string>function</string> <key>issue_context</key><string>RDar13295437</string> <key>issue_hash_function_offset</key><string>3</string> diff --git clang/test/Analysis/const-method-call.cpp clang/test/Analysis/const-method-call.cpp index 7da7ca5554a2..b37ce17447bd 100644 --- clang/test/Analysis/const-method-call.cpp +++ clang/test/Analysis/const-method-call.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_analyze_cc1 -analyzer-checker=core,debug.ExprInspection -verify -analyzer-config eagerly-assume=false %s +// RUN: %clang_analyze_cc1 -Wno-error=return-type -analyzer-checker=core,debug.ExprInspection -verify -analyzer-config eagerly-assume=false %s void clang_analyzer_eval(bool); diff --git clang/test/Analysis/inline-unique-reports.c clang/test/Analysis/inline-unique-reports.c index e58870ea74ab..306e314a94e4 100644 --- clang/test/Analysis/inline-unique-reports.c +++ clang/test/Analysis/inline-unique-reports.c @@ -1,4 +1,4 @@ -// RUN: %clang_analyze_cc1 %s -analyzer-checker=core.NullDereference -analyzer-output=plist -Wno-error=implicit-int -o %t > /dev/null 2>&1 +// RUN: %clang_analyze_cc1 %s -analyzer-checker=core.NullDereference -analyzer-output=plist -Wno-error=implicit-int -Wno-error=return-type -o %t > /dev/null 2>&1 // RUN: %normalize_plist <%t | diff -ub %S/Inputs/expected-plists/inline-unique-reports.c.plist - static inline bug(int *p) { diff --git clang/test/Analysis/malloc.c clang/test/Analysis/malloc.c index f2f8975b5f0e..0dc667bc1ed5 100644 --- clang/test/Analysis/malloc.c +++ clang/test/Analysis/malloc.c @@ -1914,8 +1914,8 @@ variable 'buf', which is not memory allocated by 'malloc()' [unix.Malloc]}} (*crash_a)(); // expected-warning{{type specifier missing}} // A CallEvent without a corresponding FunctionDecl. 

-crash_b() { crash_a(); } // no-crash -// expected-warning@-1{{type specifier missing}} expected-warning@-1{{non-void}} +crash_b() { crash_a(); return 0; } // no-crash +// expected-warning@-1{{type specifier missing}} long *global_a; void realloc_crash(void) { diff --git clang/test/Analysis/nil-receiver-undefined-larger-than-voidptr-ret.m clang/test/Analysis/nil-receiver-undefined-larger-than-voidptr-ret.m index bfc3cb92b639..300337e3b977 100644 --- clang/test/Analysis/nil-receiver-undefined-larger-than-voidptr-ret.m +++ clang/test/Analysis/nil-receiver-undefined-larger-than-voidptr-ret.m @@ -1,8 +1,8 @@ -// RUN: %clang_analyze_cc1 -triple i386-apple-darwin8 -analyzer-checker=core,alpha.core -Wno-objc-root-class %s > %t.1 2>&1 +// RUN: %clang_analyze_cc1 -Wno-error=return-type -triple i386-apple-darwin8 -analyzer-checker=core,alpha.core -Wno-objc-root-class %s > %t.1 2>&1 // RUN: FileCheck -input-file=%t.1 -check-prefix=CHECK-darwin8 %s -// RUN: %clang_analyze_cc1 -triple i386-apple-darwin9 -analyzer-checker=core,alpha.core -Wno-objc-root-class %s > %t.2 2>&1 +// RUN: %clang_analyze_cc1 -Wno-error=return-type -triple i386-apple-darwin9 -analyzer-checker=core,alpha.core -Wno-objc-root-class %s > %t.2 2>&1 // RUN: FileCheck -input-file=%t.2 -check-prefix=CHECK-darwin9 %s -// RUN: %clang_analyze_cc1 -triple thumbv6-apple-ios4.0 -analyzer-checker=core,alpha.core -Wno-objc-root-class %s > %t.3 2>&1 +// RUN: %clang_analyze_cc1 -Wno-error=return-type -triple thumbv6-apple-ios4.0 -analyzer-checker=core,alpha.core -Wno-objc-root-class %s > %t.3 2>&1 // RUN: FileCheck -input-file=%t.3 -check-prefix=CHECK-darwin9 %s @interface MyClass {} diff --git clang/test/Analysis/novoidtypecrash.c clang/test/Analysis/novoidtypecrash.c index 197516a25961..5af30c201043 100644 --- clang/test/Analysis/novoidtypecrash.c +++ clang/test/Analysis/novoidtypecrash.c @@ -1,4 +1,4 @@ -// RUN: %clang_analyze_cc1 -std=c89 -Wno-int-conversion -analyzer-checker=core %s +// RUN: %clang_analyze_cc1 -Wno-error=return-type -std=c89 -Wno-int-conversion -analyzer-checker=core %s x; y(void **z) { // no-crash *z = x; diff --git clang/test/Analysis/plist-output.m clang/test/Analysis/plist-output.m index 96123243a833..b89aab0a7c4c 100644 --- clang/test/Analysis/plist-output.m +++ clang/test/Analysis/plist-output.m @@ -177,7 +177,7 @@ void RDar13295437_f(void *i) __attribute__((__nonnull__)); struct RDar13295437_S { int *i; }; -int RDar13295437(void) { +void RDar13295437(void) { struct RDar13295437_S s = {0}; struct RDar13295437_S *sp = &s; RDar13295437_f(sp->i); diff --git clang/test/Analysis/plist-stats-output.c clang/test/Analysis/plist-stats-output.c index 4bcae557d927..42e0a802d3e3 100644 --- clang/test/Analysis/plist-stats-output.c +++ clang/test/Analysis/plist-stats-output.c @@ -2,7 +2,7 @@ // REQUIRES: asserts // RUN: FileCheck --input-file=%t.plist %s -int foo(void) {} +void foo(void) {} // CHECK: <key>diagnostics</key> diff --git clang/test/Analysis/scopes-cfg-output.cpp clang/test/Analysis/scopes-cfg-output.cpp index 5e6706602d45..c082bb179545 100644 --- clang/test/Analysis/scopes-cfg-output.cpp +++ clang/test/Analysis/scopes-cfg-output.cpp @@ -1074,7 +1074,7 @@ void test_switch_with_compound_with_default() { // CHECK-NEXT: Succs (1): B4 // CHECK: [B0 (EXIT)] // CHECK-NEXT: Preds (1): B1 -int test_switch_with_compound_without_default() { +void test_switch_with_compound_without_default() { char c = '1'; switch (int i = getX()) { case 0: diff --git clang/test/Analysis/structured_bindings.cpp 
clang/test/Analysis/structured_bindings.cpp index 7004c2e7dcf4..989c584189ab 100644 --- clang/test/Analysis/structured_bindings.cpp +++ clang/test/Analysis/structured_bindings.cpp @@ -3,10 +3,10 @@ void clang_analyzer_eval(bool); struct s { int a; }; -int foo() { +void foo() { auto [a] = s{1}; clang_analyzer_eval(a == 1); // expected-warning{{TRUE}} -} // expected-warning{{non-void function does not return a value}} +} struct s2 { int &x; diff --git clang/test/CXX/drs/cwg605.cpp clang/test/CXX/drs/cwg605.cpp index 2fd9e8155bf7..b98c483f3118 100644 --- clang/test/CXX/drs/cwg605.cpp +++ clang/test/CXX/drs/cwg605.cpp @@ -12,7 +12,7 @@ template <class T> static T f(T t) {} template <> -int f(int t) {} +int f(int t) { return 0; } void g(int a) { f(a); diff --git clang/test/CXX/expr/expr.prim/expr.prim.lambda/p5.cpp clang/test/CXX/expr/expr.prim/expr.prim.lambda/p5.cpp index 2a99ff0ea44f..84d84a61a8d2 100644 --- clang/test/CXX/expr/expr.prim/expr.prim.lambda/p5.cpp +++ clang/test/CXX/expr/expr.prim/expr.prim.lambda/p5.cpp @@ -1,9 +1,9 @@ -// RUN: %clang_cc1 -std=c++11 %s -Winvalid-noreturn -verify +// RUN: %clang_cc1 -Werror=return-type -std=c++11 %s -Winvalid-noreturn -verify // An attribute-specifier-seq in a lambda-declarator appertains to the // type of the corresponding function call operator. void test_attributes() { - auto nrl = [](int x) -> int { if (x > 0) return x; }; // expected-warning{{on-void lambda does not return a value in all control paths}} + auto nrl = [](int x) -> int { if (x > 0) return x; }; // expected-error{{non-void lambda does not return a value in all control paths}} // FIXME: GCC accepts the [[gnu::noreturn]] attribute here. auto nrl2 = []() [[gnu::noreturn]] { return; }; // expected-warning{{attribute 'noreturn' ignored}} diff --git clang/test/CXX/expr/expr.prim/expr.prim.lambda/p7.cpp clang/test/CXX/expr/expr.prim/expr.prim.lambda/p7.cpp index 73714f1a947a..cb04d9567d2e 100644 --- clang/test/CXX/expr/expr.prim/expr.prim.lambda/p7.cpp +++ clang/test/CXX/expr/expr.prim/expr.prim.lambda/p7.cpp @@ -1,8 +1,8 @@ -// RUN: %clang_cc1 -fsyntax-only -std=c++11 %s -verify +// RUN: %clang_cc1 -Werror=return-type -fsyntax-only -std=c++11 %s -verify // Check that analysis-based warnings work in lambda bodies. 
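The two lambda tests above make the same policy explicit with -Werror=return-type and now expect errors. The shape being pinned down, sketched outside the test harness:

    // lambda-falloff.cpp -- illustration only.
    void demo() {
      // A non-void lambda whose body can fall off the end; under
      // -Werror=return-type this is diagnosed as an error.
      auto nrl = [](int x) -> int {
        if (x > 0)
          return x;
      }; // error: non-void lambda does not return a value in all control paths
      (void)nrl;
    }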
void analysis_based_warnings() { - (void)[]() -> int { }; // expected-warning{{non-void lambda does not return a value}} + (void)[]() -> int { }; // expected-error{{non-void lambda does not return a value}} } // Check that we get the right types of captured variables (the diff --git clang/test/CodeGen/2003-06-26-CFECrash.c clang/test/CodeGen/2003-06-26-CFECrash.c index aef3aa025344..b92d9d8b18bf 100644 --- clang/test/CodeGen/2003-06-26-CFECrash.c +++ clang/test/CodeGen/2003-06-26-CFECrash.c @@ -13,7 +13,7 @@ typedef struct Globals { extern Uz_Globs G; -int extract_or_test_files(void) { +void extract_or_test_files(void) { G.pInfo = G.info; } diff --git clang/test/CodeGen/2003-08-18-SigSetJmp.c clang/test/CodeGen/2003-08-18-SigSetJmp.c index 986bcb8bd74c..cf7abaef7eff 100644 --- clang/test/CodeGen/2003-08-18-SigSetJmp.c +++ clang/test/CodeGen/2003-08-18-SigSetJmp.c @@ -5,7 +5,7 @@ typedef int sigjmp_buf[_JBLEN + 1]; int sigsetjmp(sigjmp_buf env, int savemask); void bar(void); sigjmp_buf B; -int foo(void) { +void foo(void) { sigsetjmp(B, 1); bar(); } diff --git clang/test/CodeGen/2003-08-23-LocalUnionTest.c clang/test/CodeGen/2003-08-23-LocalUnionTest.c index 50b01e425878..01b830f284f3 100644 --- clang/test/CodeGen/2003-08-23-LocalUnionTest.c +++ clang/test/CodeGen/2003-08-23-LocalUnionTest.c @@ -4,7 +4,7 @@ union foo { int X; }; -int test(union foo* F) { +void test(union foo* F) { { union foo { float X; } A; } diff --git clang/test/CodeGen/2003-10-29-AsmRename.c clang/test/CodeGen/2003-10-29-AsmRename.c index 746ff15dd0e3..0db2e862ae00 100644 --- clang/test/CodeGen/2003-10-29-AsmRename.c +++ clang/test/CodeGen/2003-10-29-AsmRename.c @@ -16,7 +16,7 @@ int Func64(struct bar* B) { } -int test(void) { +void test(void) { Func(0); /* should be renamed to call Func64 */ Func64(0); } diff --git clang/test/CodeGen/2003-11-20-ComplexDivision.c clang/test/CodeGen/2003-11-20-ComplexDivision.c index 51198b81ee2e..29873f5673dd 100644 --- clang/test/CodeGen/2003-11-20-ComplexDivision.c +++ clang/test/CodeGen/2003-11-20-ComplexDivision.c @@ -1,6 +1,6 @@ // RUN: %clang_cc1 -emit-llvm %s -o /dev/null -int test(void) { +void test(void) { __complex__ double C; double D; C / D; diff --git clang/test/CodeGen/2004-06-18-VariableLengthArrayOfStructures.c clang/test/CodeGen/2004-06-18-VariableLengthArrayOfStructures.c index abf78fb09556..5ad837ab4bcb 100644 --- clang/test/CodeGen/2004-06-18-VariableLengthArrayOfStructures.c +++ clang/test/CodeGen/2004-06-18-VariableLengthArrayOfStructures.c @@ -3,7 +3,7 @@ struct S { }; -int xxxx(int a) { +void xxxx(int a) { struct S comps[a]; comps[0]; } diff --git clang/test/CodeGen/2004-11-27-StaticFunctionRedeclare.c clang/test/CodeGen/2004-11-27-StaticFunctionRedeclare.c index 0650c744e46c..b9deeb0e2bf6 100644 --- clang/test/CodeGen/2004-11-27-StaticFunctionRedeclare.c +++ clang/test/CodeGen/2004-11-27-StaticFunctionRedeclare.c @@ -14,4 +14,4 @@ void bar(void) { int func(void); foo(func); } -static int func(char** A, char ** B) {} +static int func(char** A, char ** B) { return 0; } diff --git clang/test/CodeGen/2005-01-02-VAArgError-ICE.c clang/test/CodeGen/2005-01-02-VAArgError-ICE.c index 39439d7c614c..4f9a536be527 100644 --- clang/test/CodeGen/2005-01-02-VAArgError-ICE.c +++ clang/test/CodeGen/2005-01-02-VAArgError-ICE.c @@ -2,7 +2,7 @@ // PR481 // RUN: %clang_cc1 %s -Wno-implicit-function-declaration -emit-llvm -o /dev/null -int flags(int a, int b, ...) { +void flags(int a, int b, ...) 
{ __builtin_va_list args; __builtin_va_start(args,a); // not the last named arg foo(args); diff --git clang/test/CodeGen/2005-06-15-ExpandGotoInternalProblem.c clang/test/CodeGen/2005-06-15-ExpandGotoInternalProblem.c index 521e69866151..597b0bd9afe9 100644 --- clang/test/CodeGen/2005-06-15-ExpandGotoInternalProblem.c +++ clang/test/CodeGen/2005-06-15-ExpandGotoInternalProblem.c @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -std=c99 %s -emit-llvm -o - | \ +// RUN: %clang_cc1 -Wno-error=return-type -std=c99 %s -emit-llvm -o - | \ // RUN: opt -O3 -disable-output // PR580 diff --git clang/test/CodeGen/2007-01-06-KNR-Proto.c clang/test/CodeGen/2007-01-06-KNR-Proto.c index d56a786fce53..f38979b67008 100644 --- clang/test/CodeGen/2007-01-06-KNR-Proto.c +++ clang/test/CodeGen/2007-01-06-KNR-Proto.c @@ -6,5 +6,6 @@ int svc_register (void (*dispatch) (int)); int svc_register (dispatch) void (*dispatch) (); { + return 0; } diff --git clang/test/CodeGen/2008-05-06-CFECrash.c clang/test/CodeGen/2008-05-06-CFECrash.c index 11775673a7cc..7ca157969ac3 100644 --- clang/test/CodeGen/2008-05-06-CFECrash.c +++ clang/test/CodeGen/2008-05-06-CFECrash.c @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -emit-llvm -O2 %s -o /dev/null +// RUN: %clang_cc1 -Wno-error=return-type -emit-llvm -O2 %s -o /dev/null // PR2292. __inline__ __attribute__ ((__pure__)) int g (void) {} void f (int k) { k = g (); } diff --git clang/test/CodeGen/2008-07-30-redef-of-bitcasted-decl.c clang/test/CodeGen/2008-07-30-redef-of-bitcasted-decl.c index 70f3aaf6abfc..910b5fdde7fc 100644 --- clang/test/CodeGen/2008-07-30-redef-of-bitcasted-decl.c +++ clang/test/CodeGen/2008-07-30-redef-of-bitcasted-decl.c @@ -22,6 +22,6 @@ static void bar(void *db) { char s[5] = "hi"; -int foo(void) { +void foo(void) { bar(0); } diff --git clang/test/CodeGen/2008-10-13-FrontendCrash.c clang/test/CodeGen/2008-10-13-FrontendCrash.c index da28bd9b1b9b..f303224a4494 100644 --- clang/test/CodeGen/2008-10-13-FrontendCrash.c +++ clang/test/CodeGen/2008-10-13-FrontendCrash.c @@ -1,7 +1,7 @@ // RUN: %clang_cc1 %s -std=c89 -emit-llvm -o - // PR2797 -unsigned int +void func_48 (signed char p_49) { signed char l_340; diff --git clang/test/CodeGen/2009-01-21-InvalidIterator.c clang/test/CodeGen/2009-01-21-InvalidIterator.c index 83353da68beb..b16e6d2d24ac 100644 --- clang/test/CodeGen/2009-01-21-InvalidIterator.c +++ clang/test/CodeGen/2009-01-21-InvalidIterator.c @@ -63,6 +63,7 @@ frame_hdr_cache[8]; _Unwind_Ptr base_from_cb_data (struct unw_eh_callback_data *data) { + return 0; } void diff --git clang/test/CodeGen/2009-05-04-EnumInreg.c clang/test/CodeGen/2009-05-04-EnumInreg.c index 0ea18b92691e..491678647b84 100644 --- clang/test/CodeGen/2009-05-04-EnumInreg.c +++ clang/test/CodeGen/2009-05-04-EnumInreg.c @@ -14,4 +14,4 @@ enum kobject_action { struct kobject; // CHECK: i32 inreg %action -int kobject_uevent(struct kobject *kobj, enum kobject_action action) {} +void kobject_uevent(struct kobject *kobj, enum kobject_action action) {} diff --git clang/test/CodeGen/2009-07-15-pad-wchar_t-array.c clang/test/CodeGen/2009-07-15-pad-wchar_t-array.c index 7be237d11a4d..78bc0e8eb6a9 100644 --- clang/test/CodeGen/2009-07-15-pad-wchar_t-array.c +++ clang/test/CodeGen/2009-07-15-pad-wchar_t-array.c @@ -14,4 +14,5 @@ typedef __WCHAR_TYPE__ wchar_t; signed short _iodbcdm_sqlerror(void) { wchar_t _sqlState[6] = { L"\0" }; + return 0; } diff --git clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_dupq.c clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_dupq.c index b1f404c0ec8c..cf1c00cdd56b 
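Aside on the 2005-01-02-VAArgError-ICE.c hunk above: that test deliberately passes a non-final named parameter to __builtin_va_start to exercise an old ICE, so only its return type changes. For contrast, the well-formed pattern (an illustration, not taken from the patch):

    #include <cstdarg>

    // va_start must name the last named parameter (here, count).
    int sum(int count, ...) {
      va_list args;
      va_start(args, count);
      int total = 0;
      for (int i = 0; i < count; ++i)
        total += va_arg(args, int);
      va_end(args);
      return total;
    }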
100644 --- clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_dupq.c +++ clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_dupq.c @@ -211,3 +211,19 @@ svfloat64_t test_svdup_laneq_f64(svfloat64_t zn) { svbfloat16_t test_svdup_laneq_bf16(svbfloat16_t zn) { return SVE_ACLE_FUNC(svdup_laneq, _bf16)(zn, 3); } + +// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svdup_laneq_mf8 +// CHECK-SAME: (<vscale x 16 x i8> [[ZN:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.laneq.nxv16i8(<vscale x 16 x i8> [[ZN]], i32 1) +// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z20test_svdup_laneq_mf8u13__SVMfloat8_t +// CPP-CHECK-SAME: (<vscale x 16 x i8> [[ZN:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.laneq.nxv16i8(<vscale x 16 x i8> [[ZN]], i32 1) +// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] +// +svmfloat8_t test_svdup_laneq_mf8(svmfloat8_t zn) { + return SVE_ACLE_FUNC(svdup_laneq, _mf8)(zn, 1); +} diff --git clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_extq.c clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_extq.c index 06eec1e00900..d46e67b9918a 100644 --- clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_extq.c +++ clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_extq.c @@ -211,3 +211,19 @@ svfloat64_t test_svextq_f64(svfloat64_t zn, svfloat64_t zm) { svbfloat16_t test_svextq_bf16(svbfloat16_t zn, svbfloat16_t zm) { return SVE_ACLE_FUNC(svextq, _bf16,,)(zn, zm, 6); } + +// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svextq_mf8 +// CHECK-SAME: (<vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.extq.nxv16i8(<vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]], i32 6) +// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z15test_svextq_mf8u13__SVMfloat8_tS_ +// CPP-CHECK-SAME: (<vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.extq.nxv16i8(<vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]], i32 6) +// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] +// +svmfloat8_t test_svextq_mf8(svmfloat8_t zn, svmfloat8_t zm) { + return SVE_ACLE_FUNC(svextq, _mf8,,)(zn, zm, 6); +} diff --git clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_tblq.c clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_tblq.c index 7a19cde9abd8..3d3bb0d17a50 100644 --- clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_tblq.c +++ clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_tblq.c @@ -212,3 +212,19 @@ svfloat64_t test_svtblq_f64(svfloat64_t zn, svuint64_t zm) { svbfloat16_t test_svtblq_bf16(svbfloat16_t zn, svuint16_t zm) { return SVE_ACLE_FUNC(svtblq, _bf16,,)(zn, zm); } + +// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svtblq_mf8 +// CHECK-SAME: (<vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.tblq.nxv16i8(<vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]]) +// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> 
@_Z15test_svtblq_mf8u13__SVMfloat8_tu11__SVUint8_t +// CPP-CHECK-SAME: (<vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.tblq.nxv16i8(<vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]]) +// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] +// +svmfloat8_t test_svtblq_mf8(svmfloat8_t zn, svuint8_t zm) { + return SVE_ACLE_FUNC(svtblq, _mf8,,)(zn, zm); +} diff --git clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_tbxq.c clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_tbxq.c index f4aaed586c73..674bd9cbb083 100644 --- clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_tbxq.c +++ clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_tbxq.c @@ -212,3 +212,19 @@ svfloat64_t test_svtbxq_f64(svfloat64_t passthru, svfloat64_t zn, svuint64_t zm) svbfloat16_t test_svtbxq_bf16(svbfloat16_t passthru, svbfloat16_t zn, svuint16_t zm) { return SVE_ACLE_FUNC(svtbxq, _bf16,,)(passthru, zn, zm); } + +// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svtbxq_mf8 +// CHECK-SAME: (<vscale x 16 x i8> [[PASSTHRU:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.tbxq.nxv16i8(<vscale x 16 x i8> [[PASSTHRU]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]]) +// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z15test_svtbxq_mf8u13__SVMfloat8_tS_u11__SVUint8_t +// CPP-CHECK-SAME: (<vscale x 16 x i8> [[PASSTHRU:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.tbxq.nxv16i8(<vscale x 16 x i8> [[PASSTHRU]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]]) +// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] +// +svmfloat8_t test_svtbxq_mf8(svmfloat8_t passthru, svmfloat8_t zn, svuint8_t zm) { + return SVE_ACLE_FUNC(svtbxq, _mf8,,)(passthru, zn, zm); +} diff --git clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_uzpq1.c clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_uzpq1.c index 8c639120409e..35878d61f954 100644 --- clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_uzpq1.c +++ clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_uzpq1.c @@ -214,4 +214,19 @@ svbfloat16_t test_svuzpq1_bf16(svbfloat16_t zn, svbfloat16_t zm) { return SVE_ACLE_FUNC(svuzpq1,_bf16)(zn, zm); } +// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svuzpq1_mf8 +// CHECK-SAME: (<vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.uzpq1.nxv16i8(<vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]]) +// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z16test_svuzpq1_mf8u13__SVMfloat8_tS_ +// CPP-CHECK-SAME: (<vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.uzpq1.nxv16i8(<vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]]) +// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] +// +svmfloat8_t test_svuzpq1_mf8(svmfloat8_t zn, svmfloat8_t zm) { + return SVE_ACLE_FUNC(svuzpq1,_mf8)(zn, zm); +} diff --git 
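The mf8 additions above and below extend the SVE2.1 quadword tests to the 8-bit modal floating-point type svmfloat8_t. Written without the SVE_ACLE_FUNC test macro, and assuming it expands to the _mf8-suffixed names as in the non-overloaded configuration, the new calls look like this sketch (requires an SVE2.1 target with FP8 vector support):

    #include <arm_sve.h>

    svmfloat8_t quadword_demo(svmfloat8_t zn, svmfloat8_t zm, svuint8_t idx) {
      svmfloat8_t lane = svdup_laneq_mf8(zn, 1);  // broadcast quadword lane 1
      svmfloat8_t ext  = svextq_mf8(lane, zm, 6); // extract at byte offset 6
      return svtblq_mf8(ext, idx);                // per-quadword table lookup
    }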
clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_uzpq2.c clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_uzpq2.c index 756d2538317e..a22f20f30039 100644 --- clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_uzpq2.c +++ clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_uzpq2.c @@ -214,4 +214,18 @@ svbfloat16_t test_svuzpq2_bf16(svbfloat16_t zn, svbfloat16_t zm) { return SVE_ACLE_FUNC(svuzpq2,_bf16)(zn, zm); } - +// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svuzpq2_mf8 +// CHECK-SAME: (<vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.uzpq2.nxv16i8(<vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]]) +// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z16test_svuzpq2_mf8u13__SVMfloat8_tS_ +// CPP-CHECK-SAME: (<vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.uzpq2.nxv16i8(<vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]]) +// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] +// +svmfloat8_t test_svuzpq2_mf8(svmfloat8_t zn, svmfloat8_t zm) { + return SVE_ACLE_FUNC(svuzpq2,_mf8)(zn, zm); +} diff --git clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_zipq1.c clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_zipq1.c index 6684bb2cf2d9..436697306269 100644 --- clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_zipq1.c +++ clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_zipq1.c @@ -214,4 +214,18 @@ svbfloat16_t test_svzipq1_bf16(svbfloat16_t zn, svbfloat16_t zm) { return SVE_ACLE_FUNC(svzipq1,_bf16)(zn, zm); } - +// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svzipq1_mf8 +// CHECK-SAME: (<vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.zipq1.nxv16i8(<vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]]) +// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z16test_svzipq1_mf8u13__SVMfloat8_tS_ +// CPP-CHECK-SAME: (<vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.zipq1.nxv16i8(<vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]]) +// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] +// +svmfloat8_t test_svzipq1_mf8(svmfloat8_t zn, svmfloat8_t zm) { + return SVE_ACLE_FUNC(svzipq1,_mf8)(zn, zm); +} diff --git clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_zipq2.c clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_zipq2.c index 2bfd72b32d1c..4e27ec463c08 100644 --- clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_zipq2.c +++ clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_zipq2.c @@ -214,4 +214,18 @@ svbfloat16_t test_svzipq2_bf16(svbfloat16_t zn, svbfloat16_t zm) { return SVE_ACLE_FUNC(svzipq2,_bf16)(zn, zm); } - +// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svzipq2_mf8 +// CHECK-SAME: (<vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.zipq2.nxv16i8(<vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]]) +// CHECK-NEXT: ret 
<vscale x 16 x i8> [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z16test_svzipq2_mf8u13__SVMfloat8_tS_ +// CPP-CHECK-SAME: (<vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.zipq2.nxv16i8(<vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]]) +// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] +// +svmfloat8_t test_svzipq2_mf8(svmfloat8_t zn, svmfloat8_t zm) { + return SVE_ACLE_FUNC(svzipq2,_mf8)(zn, zm); +} diff --git clang/test/CodeGen/AArch64/targetattr.c clang/test/CodeGen/AArch64/targetattr.c index f8d5f9912c0d..cfe115bf97ed 100644 --- clang/test/CodeGen/AArch64/targetattr.c +++ clang/test/CodeGen/AArch64/targetattr.c @@ -218,7 +218,7 @@ void applem4() {} // CHECK: attributes #[[ATTR15]] = { noinline nounwind optnone "branch-target-enforcement" "guarded-control-stack" "no-trapping-math"="true" "sign-return-address"="non-leaf" "sign-return-address-key"="a_key" "stack-protector-buffer-size"="8" "target-cpu"="neoverse-n1" "target-features"="+aes,+bf16,+bti,+ccidx,+complxnum,+crc,+dit,+dotprod,+flagm,+fp-armv8,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+perfmon,+predres,+ras,+rcpc,+rdm,+sb,+sha2,+spe,+ssbs,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8a" "tune-cpu"="cortex-a710" } // CHECK: attributes #[[ATTR16]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } // CHECK: attributes #[[ATTR17]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-v9.3a" } -// CHECK: attributes #[[ATTR18]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="apple-m4" "target-features"="+aes,+bf16,+bti,+ccidx,+complxnum,+crc,+dit,+dotprod,+flagm,+fp-armv8,+fp16fml,+fpac,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+perfmon,+predres,+ras,+rcpc,+rdm,+sb,+sha2,+sha3,+sme,+sme-f64f64,+sme-i16i64,+sme2,+ssbs,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8.7a,+v8a,+wfxt" } +// CHECK: attributes #[[ATTR18]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="apple-m4" "target-features"="+aes,+bf16,+bti,+ccidx,+complxnum,+crc,+dit,+dotprod,+flagm,+fp-armv8,+fp16fml,+fpac,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+perfmon,+predres,+ras,+rcpc,+rdm,+sb,+sha2,+sha3,+sme,+sme-f64f64,+sme-i16i64,+sme2,+spe-eef,+ssbs,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8.7a,+v8a,+wfxt" } //. 
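On the targetattr.c hunk just above: the only change is +spe-eef joining the apple-m4 feature string. The attribute sets under test come from per-function GNU target attributes, mirroring the test's own applem4 function:

    // Emits the "target-cpu"/"target-features" IR attributes checked above.
    __attribute__((target("cpu=apple-m4"))) void applem4() {}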
// CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} // CHECK: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} diff --git clang/test/CodeGen/SystemZ/builtins-systemz-bitop.c clang/test/CodeGen/SystemZ/builtins-systemz-bitop.c new file mode 100644 index 000000000000..5b4051c8d6f1 --- /dev/null +++ clang/test/CodeGen/SystemZ/builtins-systemz-bitop.c @@ -0,0 +1,16 @@ +// REQUIRES: systemz-registered-target +// RUN: %clang_cc1 -target-cpu arch15 -triple s390x-ibm-linux -Wall -Wno-unused -Werror -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -target-cpu arch15 -triple s390x-ibm-linux -Wall -Wno-unused -Werror -emit-llvm -x c++ %s -o - | FileCheck %s + +unsigned long test_bdepg(unsigned long a, unsigned long b) { +// CHECK-LABEL: test_bdepg +// CHECK: call i64 @llvm.s390.bdepg(i64 {{.*}}, i64 {{.*}}) + return __builtin_s390_bdepg(a, b); +} + +unsigned long test_bextg(unsigned long a, unsigned long b) { +// CHECK-LABEL: test_bextg +// CHECK: call i64 @llvm.s390.bextg(i64 {{.*}}, i64 {{.*}}) + return __builtin_s390_bextg(a, b); +} + diff --git clang/test/CodeGen/SystemZ/builtins-systemz-vector5-error.c clang/test/CodeGen/SystemZ/builtins-systemz-vector5-error.c new file mode 100644 index 000000000000..3943a15af9d2 --- /dev/null +++ clang/test/CodeGen/SystemZ/builtins-systemz-vector5-error.c @@ -0,0 +1,34 @@ +// REQUIRES: systemz-registered-target +// RUN: %clang_cc1 -target-cpu arch15 -triple s390x-unknown-unknown \ +// RUN: -Wall -Wno-unused -Werror -fsyntax-only -verify %s + +typedef __attribute__((vector_size(16))) signed char vec_schar; +typedef __attribute__((vector_size(16))) signed short vec_sshort; +typedef __attribute__((vector_size(16))) signed int vec_sint; +typedef __attribute__((vector_size(16))) signed long long vec_slong; +typedef __attribute__((vector_size(16))) unsigned char vec_uchar; +typedef __attribute__((vector_size(16))) unsigned short vec_ushort; +typedef __attribute__((vector_size(16))) unsigned int vec_uint; +typedef __attribute__((vector_size(16))) unsigned long long vec_ulong; +typedef __attribute__((vector_size(16))) double vec_double; + +volatile vec_schar vsc; +volatile vec_sshort vss; +volatile vec_sint vsi; +volatile vec_slong vsl; +volatile vec_uchar vuc; +volatile vec_ushort vus; +volatile vec_uint vui; +volatile vec_ulong vul; +volatile vec_double vd; + +volatile unsigned int len; +const void * volatile cptr; +int cc; + +void test_integer(void) { + __builtin_s390_veval(vuc, vuc, vuc, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}} + __builtin_s390_veval(vuc, vuc, vuc, 256); // expected-error-re {{argument value {{.*}} is outside the valid range}} + __builtin_s390_veval(vuc, vuc, vuc, len); // expected-error {{must be a constant integer}} +} + diff --git clang/test/CodeGen/SystemZ/builtins-systemz-vector5.c clang/test/CodeGen/SystemZ/builtins-systemz-vector5.c new file mode 100644 index 000000000000..c3621819e71f --- /dev/null +++ clang/test/CodeGen/SystemZ/builtins-systemz-vector5.c @@ -0,0 +1,103 @@ +// REQUIRES: systemz-registered-target +// RUN: %clang_cc1 -target-cpu arch15 -triple s390x-ibm-linux -flax-vector-conversions=none \ +// RUN: -Wall -Wno-unused -Werror -emit-llvm %s -o - | FileCheck %s + +typedef __attribute__((vector_size(16))) signed char vec_schar; +typedef __attribute__((vector_size(16))) signed short vec_sshort; +typedef __attribute__((vector_size(16))) signed int vec_sint; +typedef __attribute__((vector_size(16))) signed long long vec_slong; +typedef __attribute__((vector_size(16))) signed 
__int128 vec_sint128; +typedef __attribute__((vector_size(16))) unsigned char vec_uchar; +typedef __attribute__((vector_size(16))) unsigned short vec_ushort; +typedef __attribute__((vector_size(16))) unsigned int vec_uint; +typedef __attribute__((vector_size(16))) unsigned long long vec_ulong; +typedef __attribute__((vector_size(16))) unsigned __int128 vec_uint128; +typedef __attribute__((vector_size(16))) double vec_double; + +volatile vec_schar vsc; +volatile vec_sshort vss; +volatile vec_sint vsi; +volatile vec_slong vsl; +volatile vec_uchar vuc; +volatile vec_ushort vus; +volatile vec_uint vui; +volatile vec_ulong vul; +volatile signed __int128 si128; +volatile unsigned __int128 ui128; + +int cc; + +void test_core(void) { + vuc = __builtin_s390_vgemb(vus); + // CHECK: call <16 x i8> @llvm.s390.vgemb(<8 x i16> %{{.*}}) + vus = __builtin_s390_vgemh(vuc); + // CHECK: call <8 x i16> @llvm.s390.vgemh(<16 x i8> %{{.*}}) + vui = __builtin_s390_vgemf(vuc); + // CHECK: call <4 x i32> @llvm.s390.vgemf(<16 x i8> %{{.*}}) + vul = __builtin_s390_vgemg(vuc); + // CHECK: call <2 x i64> @llvm.s390.vgemg(<16 x i8> %{{.*}}) + ui128 = __builtin_s390_vgemq(vuc); + // CHECK: call i128 @llvm.s390.vgemq(<16 x i8> %{{.*}}) + + si128 = __builtin_s390_vuphg(vsl); + // CHECK: call i128 @llvm.s390.vuphg(<2 x i64> %{{.*}}) + si128 = __builtin_s390_vuplg(vsl); + // CHECK: call i128 @llvm.s390.vuplg(<2 x i64> %{{.*}}) + ui128 = __builtin_s390_vuplhg(vul); + // CHECK: call i128 @llvm.s390.vuplhg(<2 x i64> %{{.*}}) + ui128 = __builtin_s390_vupllg(vul); + // CHECK: call i128 @llvm.s390.vupllg(<2 x i64> %{{.*}}) +} + +void test_integer(void) { + si128 = __builtin_s390_vavgq(si128, si128); + // CHECK: call i128 @llvm.s390.vavgq(i128 %{{.*}}, i128 %{{.*}}) + ui128 = __builtin_s390_vavglq(ui128, ui128); + // CHECK: call i128 @llvm.s390.vavglq(i128 %{{.*}}, i128 %{{.*}}) + + vuc = __builtin_s390_veval(vuc, vuc, vuc, 0); + // CHECK: call <16 x i8> @llvm.s390.veval(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 0) + + vsl = __builtin_s390_vmahg(vsl, vsl, vsl); + // CHECK: call <2 x i64> @llvm.s390.vmahg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}) + si128 = __builtin_s390_vmahq(si128, si128, si128); + // CHECK: call i128 @llvm.s390.vmahq(i128 %{{.*}}, i128 %{{.*}}, i128 %{{.*}}) + vul = __builtin_s390_vmalhg(vul, vul, vul); + // CHECK: call <2 x i64> @llvm.s390.vmalhg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}) + ui128 = __builtin_s390_vmalhq(ui128, ui128, ui128); + // CHECK: call i128 @llvm.s390.vmalhq(i128 %{{.*}}, i128 %{{.*}}, i128 %{{.*}}) + + si128 = __builtin_s390_vmaeg(vsl, vsl, si128); + // CHECK: call i128 @llvm.s390.vmaeg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, i128 %{{.*}}) + ui128 = __builtin_s390_vmaleg(vul, vul, ui128); + // CHECK: call i128 @llvm.s390.vmaleg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, i128 %{{.*}}) + si128 = __builtin_s390_vmaog(vsl, vsl, si128); + // CHECK: call i128 @llvm.s390.vmaog(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, i128 %{{.*}}) + ui128 = __builtin_s390_vmalog(vul, vul, ui128); + // CHECK: call i128 @llvm.s390.vmalog(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, i128 %{{.*}}) + + vsl = __builtin_s390_vmhg(vsl, vsl); + // CHECK: call <2 x i64> @llvm.s390.vmhg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}) + si128 = __builtin_s390_vmhq(si128, si128); + // CHECK: call i128 @llvm.s390.vmhq(i128 %{{.*}}, i128 %{{.*}}) + vul = __builtin_s390_vmlhg(vul, vul); + // CHECK: call <2 x i64> @llvm.s390.vmlhg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}) + ui128 = __builtin_s390_vmlhq(ui128, 
ui128); + // CHECK: call i128 @llvm.s390.vmlhq(i128 %{{.*}}, i128 %{{.*}}) + + si128 = __builtin_s390_vmeg(vsl, vsl); + // CHECK: call i128 @llvm.s390.vmeg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}) + ui128 = __builtin_s390_vmleg(vul, vul); + // CHECK: call i128 @llvm.s390.vmleg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}) + si128 = __builtin_s390_vmog(vsl, vsl); + // CHECK: call i128 @llvm.s390.vmog(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}) + ui128 = __builtin_s390_vmlog(vul, vul); + // CHECK: call i128 @llvm.s390.vmlog(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}) + + si128 = __builtin_s390_vceqqs(ui128, ui128, &cc); + // CHECK: call { i128, i32 } @llvm.s390.vceqqs(i128 %{{.*}}, i128 %{{.*}}) + si128 = __builtin_s390_vchqs(si128, si128, &cc); + // CHECK: call { i128, i32 } @llvm.s390.vchqs(i128 %{{.*}}, i128 %{{.*}}) + si128 = __builtin_s390_vchlqs(ui128, ui128, &cc); + // CHECK: call { i128, i32 } @llvm.s390.vchlqs(i128 %{{.*}}, i128 %{{.*}}) +} diff --git clang/test/CodeGen/SystemZ/builtins-systemz-zvector-error.c clang/test/CodeGen/SystemZ/builtins-systemz-zvector-error.c index 77e90b5ad4b8..2ec1d960aa5b 100644 --- clang/test/CodeGen/SystemZ/builtins-systemz-zvector-error.c +++ clang/test/CodeGen/SystemZ/builtins-systemz-zvector-error.c @@ -9,10 +9,12 @@ volatile vector signed char vsc; volatile vector signed short vss; volatile vector signed int vsi; volatile vector signed long long vsl; +volatile vector signed __int128 vslll; volatile vector unsigned char vuc; volatile vector unsigned short vus; volatile vector unsigned int vui; volatile vector unsigned long long vul; +volatile vector unsigned __int128 vulll; volatile vector bool char vbc; volatile vector bool short vbs; volatile vector bool int vbi; @@ -34,10 +36,12 @@ const signed char * volatile cptrsc; const signed short * volatile cptrss; const signed int * volatile cptrsi; const signed long long * volatile cptrsl; +const signed __int128 * volatile cptrslll; const unsigned char * volatile cptruc; const unsigned short * volatile cptrus; const unsigned int * volatile cptrui; const unsigned long long * volatile cptrul; +const unsigned __int128 * volatile cptrulll; const float * volatile cptrf; const double * volatile cptrd; @@ -233,27 +237,31 @@ void test_core(void) { // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 1}} vsc = vec_load_bndry(cptrsc, idx); // expected-error {{no matching function}} expected-error {{argument to '__builtin_s390_vlbb' must be a constant integer}} - // expected-note@vecintrin.h:* 9 {{must be a constant power of 2 from 64 to 4096}} + // expected-note@vecintrin.h:* 11 {{must be a constant power of 2 from 64 to 4096}} vsc = vec_load_bndry(cptrsc, 200); // expected-error {{no matching function}} expected-error {{argument value -1 is outside the valid range [0, 15]}} - // expected-note@vecintrin.h:* 9 {{must be a constant power of 2 from 64 to 4096}} + // expected-note@vecintrin.h:* 11 {{must be a constant power of 2 from 64 to 4096}} vsc = vec_load_bndry(cptrsc, 32); // expected-error {{no matching function}} expected-error {{argument value -1 is outside the valid range [0, 15]}} - // expected-note@vecintrin.h:* 9 {{must be a constant power of 2 from 64 to 4096}} + // expected-note@vecintrin.h:* 11 {{must be a constant power of 2 from 64 to 4096}} vsc = vec_load_bndry(cptrsc, 8192); // expected-error {{no matching function}} expected-error {{argument value -1 is outside the valid range [0, 15]}} - // expected-note@vecintrin.h:* 9 {{must be a constant power of 2 from 64 to 4096}} + // 
expected-note@vecintrin.h:* 11 {{must be a constant power of 2 from 64 to 4096}} vuc = vec_load_bndry(cptruc, idx); // expected-error {{no matching function}} expected-error {{argument to '__builtin_s390_vlbb' must be a constant integer}} - // expected-note@vecintrin.h:* 9 {{must be a constant power of 2 from 64 to 4096}} + // expected-note@vecintrin.h:* 11 {{must be a constant power of 2 from 64 to 4096}} vss = vec_load_bndry(cptrss, idx); // expected-error {{no matching function}} expected-error {{argument to '__builtin_s390_vlbb' must be a constant integer}} - // expected-note@vecintrin.h:* 9 {{must be a constant power of 2 from 64 to 4096}} + // expected-note@vecintrin.h:* 11 {{must be a constant power of 2 from 64 to 4096}} vus = vec_load_bndry(cptrus, idx); // expected-error {{no matching function}} expected-error {{argument to '__builtin_s390_vlbb' must be a constant integer}} - // expected-note@vecintrin.h:* 9 {{must be a constant power of 2 from 64 to 4096}} + // expected-note@vecintrin.h:* 11 {{must be a constant power of 2 from 64 to 4096}} vsi = vec_load_bndry(cptrsi, idx); // expected-error {{no matching function}} expected-error {{argument to '__builtin_s390_vlbb' must be a constant integer}} - // expected-note@vecintrin.h:* 9 {{must be a constant power of 2 from 64 to 4096}} + // expected-note@vecintrin.h:* 11 {{must be a constant power of 2 from 64 to 4096}} vui = vec_load_bndry(cptrui, idx); // expected-error {{no matching function}} expected-error {{argument to '__builtin_s390_vlbb' must be a constant integer}} - // expected-note@vecintrin.h:* 9 {{must be a constant power of 2 from 64 to 4096}} + // expected-note@vecintrin.h:* 11 {{must be a constant power of 2 from 64 to 4096}} vsl = vec_load_bndry(cptrsl, idx); // expected-error {{no matching function}} expected-error {{argument to '__builtin_s390_vlbb' must be a constant integer}} - // expected-note@vecintrin.h:* 9 {{must be a constant power of 2 from 64 to 4096}} + // expected-note@vecintrin.h:* 11 {{must be a constant power of 2 from 64 to 4096}} vul = vec_load_bndry(cptrul, idx); // expected-error {{no matching function}} expected-error {{argument to '__builtin_s390_vlbb' must be a constant integer}} - // expected-note@vecintrin.h:* 9 {{must be a constant power of 2 from 64 to 4096}} + // expected-note@vecintrin.h:* 11 {{must be a constant power of 2 from 64 to 4096}} + vslll = vec_load_bndry(cptrslll, idx); // expected-error {{no matching function}} expected-error {{argument to '__builtin_s390_vlbb' must be a constant integer}} + // expected-note@vecintrin.h:* 11 {{must be a constant power of 2 from 64 to 4096}} + vulll = vec_load_bndry(cptrulll, idx); // expected-error {{no matching function}} expected-error {{argument to '__builtin_s390_vlbb' must be a constant integer}} + // expected-note@vecintrin.h:* 11 {{must be a constant power of 2 from 64 to 4096}} vuc = vec_genmask(idx); // expected-error {{no matching function}} // expected-note@vecintrin.h:* {{must be a constant integer}} @@ -478,83 +486,95 @@ void test_integer(void) { // expected-note@vecintrin.h:* 1 {{must be a constant integer}} vsc = vec_sld(vsc, vsc, idx); // expected-error {{no matching function}} expected-error {{argument to '__builtin_s390_vsldb' must be a constant integer}} - // expected-note@vecintrin.h:* 12 {{candidate function not viable}} + // expected-note@vecintrin.h:* 14 {{candidate function not viable}} // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 15}} vsc = vec_sld(vsc, vsc, -1); // expected-error {{no 
matching function}} expected-error {{argument value -1 is outside the valid range [0, 15]}} - // expected-note@vecintrin.h:* 12 {{candidate function not viable}} + // expected-note@vecintrin.h:* 14 {{candidate function not viable}} // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 15}} vsc = vec_sld(vsc, vsc, 16); // expected-error {{no matching function}} expected-error {{argument value 16 is outside the valid range [0, 15]}} - // expected-note@vecintrin.h:* 12 {{candidate function not viable}} + // expected-note@vecintrin.h:* 14 {{candidate function not viable}} // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 15}} vuc = vec_sld(vuc, vuc, idx); // expected-error {{no matching function}} expected-error {{argument to '__builtin_s390_vsldb' must be a constant integer}} - // expected-note@vecintrin.h:* 11 {{candidate function not viable}} + // expected-note@vecintrin.h:* 13 {{candidate function not viable}} // expected-note@vecintrin.h:* 2 {{must be a constant integer from 0 to 15}} vuc = vec_sld(vuc, vuc, -1); // expected-error {{no matching function}} expected-error {{argument value -1 is outside the valid range [0, 15]}} - // expected-note@vecintrin.h:* 11 {{candidate function not viable}} + // expected-note@vecintrin.h:* 13 {{candidate function not viable}} // expected-note@vecintrin.h:* 2 {{must be a constant integer from 0 to 15}} vuc = vec_sld(vuc, vuc, 16); // expected-error {{no matching function}} expected-error {{argument value 16 is outside the valid range [0, 15]}} - // expected-note@vecintrin.h:* 11 {{candidate function not viable}} + // expected-note@vecintrin.h:* 13 {{candidate function not viable}} // expected-note@vecintrin.h:* 2 {{must be a constant integer from 0 to 15}} vss = vec_sld(vss, vss, idx); // expected-error {{no matching function}} expected-error {{argument to '__builtin_s390_vsldb' must be a constant integer}} - // expected-note@vecintrin.h:* 12 {{candidate function not viable}} + // expected-note@vecintrin.h:* 14 {{candidate function not viable}} // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 15}} vus = vec_sld(vus, vus, idx); // expected-error {{no matching function}} expected-error {{argument to '__builtin_s390_vsldb' must be a constant integer}} - // expected-note@vecintrin.h:* 11 {{candidate function not viable}} + // expected-note@vecintrin.h:* 13 {{candidate function not viable}} // expected-note@vecintrin.h:* 2 {{must be a constant integer from 0 to 15}} vsi = vec_sld(vsi, vsi, idx); // expected-error {{no matching function}} expected-error {{argument to '__builtin_s390_vsldb' must be a constant integer}} - // expected-note@vecintrin.h:* 12 {{candidate function not viable}} + // expected-note@vecintrin.h:* 14 {{candidate function not viable}} // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 15}} vui = vec_sld(vui, vui, idx); // expected-error {{no matching function}} expected-error {{argument to '__builtin_s390_vsldb' must be a constant integer}} - // expected-note@vecintrin.h:* 11 {{candidate function not viable}} + // expected-note@vecintrin.h:* 13 {{candidate function not viable}} // expected-note@vecintrin.h:* 2 {{must be a constant integer from 0 to 15}} vsl = vec_sld(vsl, vsl, idx); // expected-error {{no matching function}} expected-error {{argument to '__builtin_s390_vsldb' must be a constant integer}} - // expected-note@vecintrin.h:* 12 {{candidate function not viable}} + // expected-note@vecintrin.h:* 14 {{candidate function not viable}} // 
expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 15}} vul = vec_sld(vul, vul, idx); // expected-error {{no matching function}} expected-error {{argument to '__builtin_s390_vsldb' must be a constant integer}} - // expected-note@vecintrin.h:* 11 {{candidate function not viable}} + // expected-note@vecintrin.h:* 13 {{candidate function not viable}} // expected-note@vecintrin.h:* 2 {{must be a constant integer from 0 to 15}} + vslll = vec_sld(vslll, vslll, idx); // expected-error {{no matching function}} expected-error {{argument to '__builtin_s390_vsldb' must be a constant integer}} + // expected-note@vecintrin.h:* 14 {{candidate function not viable}} + // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 15}} + vulll = vec_sld(vulll, vulll, idx); // expected-error {{no matching function}} expected-error {{argument to '__builtin_s390_vsldb' must be a constant integer}} + // expected-note@vecintrin.h:* 14 {{candidate function not viable}} + // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 15}} vd = vec_sld(vd, vd, idx); // expected-error {{no matching function}} expected-error {{argument to '__builtin_s390_vsldb' must be a constant integer}} - // expected-note@vecintrin.h:* 12 {{candidate function not viable}} + // expected-note@vecintrin.h:* 14 {{candidate function not viable}} // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 15}} vsc = vec_sldw(vsc, vsc, idx); // expected-error {{no matching function}} expected-error {{argument to '__builtin_s390_vsldb' must be a constant integer}} - // expected-note@vecintrin.h:* 8 {{candidate function not viable}} + // expected-note@vecintrin.h:* 10 {{candidate function not viable}} // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 3}} vsc = vec_sldw(vsc, vsc, -1); // expected-error {{no matching function}} expected-error {{argument value -4 is outside the valid range [0, 15]}} - // expected-note@vecintrin.h:* 8 {{candidate function not viable}} + // expected-note@vecintrin.h:* 10 {{candidate function not viable}} // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 3}} vsc = vec_sldw(vsc, vsc, 4); // expected-error {{no matching function}} expected-error {{argument value 16 is outside the valid range [0, 15]}} - // expected-note@vecintrin.h:* 8 {{candidate function not viable}} + // expected-note@vecintrin.h:* 10 {{candidate function not viable}} // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 3}} vuc = vec_sldw(vuc, vuc, idx); // expected-error {{no matching function}} expected-error {{argument to '__builtin_s390_vsldb' must be a constant integer}} - // expected-note@vecintrin.h:* 8 {{candidate function not viable}} + // expected-note@vecintrin.h:* 10 {{candidate function not viable}} // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 3}} vuc = vec_sldw(vuc, vuc, -1); // expected-error {{no matching function}} expected-error {{argument value -4 is outside the valid range [0, 15]}} - // expected-note@vecintrin.h:* 8 {{candidate function not viable}} + // expected-note@vecintrin.h:* 10 {{candidate function not viable}} // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 3}} vuc = vec_sldw(vuc, vuc, 4); // expected-error {{no matching function}} expected-error {{argument value 16 is outside the valid range [0, 15]}} - // expected-note@vecintrin.h:* 8 {{candidate function not viable}} + // expected-note@vecintrin.h:* 10 {{candidate function not viable}} // 
expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 3}} vss = vec_sldw(vss, vss, idx); // expected-error {{no matching function}} expected-error {{argument to '__builtin_s390_vsldb' must be a constant integer}} - // expected-note@vecintrin.h:* 8 {{candidate function not viable}} + // expected-note@vecintrin.h:* 10 {{candidate function not viable}} // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 3}} vus = vec_sldw(vus, vus, idx); // expected-error {{no matching function}} expected-error {{argument to '__builtin_s390_vsldb' must be a constant integer}} - // expected-note@vecintrin.h:* 8 {{candidate function not viable}} + // expected-note@vecintrin.h:* 10 {{candidate function not viable}} // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 3}} vsi = vec_sldw(vsi, vsi, idx); // expected-error {{no matching function}} expected-error {{argument to '__builtin_s390_vsldb' must be a constant integer}} - // expected-note@vecintrin.h:* 8 {{candidate function not viable}} + // expected-note@vecintrin.h:* 10 {{candidate function not viable}} // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 3}} vui = vec_sldw(vui, vui, idx); // expected-error {{no matching function}} expected-error {{argument to '__builtin_s390_vsldb' must be a constant integer}} - // expected-note@vecintrin.h:* 8 {{candidate function not viable}} + // expected-note@vecintrin.h:* 10 {{candidate function not viable}} // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 3}} vsl = vec_sldw(vsl, vsl, idx); // expected-error {{no matching function}} expected-error {{argument to '__builtin_s390_vsldb' must be a constant integer}} - // expected-note@vecintrin.h:* 8 {{candidate function not viable}} + // expected-note@vecintrin.h:* 10 {{candidate function not viable}} // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 3}} vul = vec_sldw(vul, vul, idx); // expected-error {{no matching function}} expected-error {{argument to '__builtin_s390_vsldb' must be a constant integer}} - // expected-note@vecintrin.h:* 8 {{candidate function not viable}} + // expected-note@vecintrin.h:* 10 {{candidate function not viable}} + // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 3}} + vslll = vec_sldw(vslll, vslll, idx); // expected-error {{no matching function}} expected-error {{argument to '__builtin_s390_vsldb' must be a constant integer}} + // expected-note@vecintrin.h:* 10 {{candidate function not viable}} + // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 3}} + vulll = vec_sldw(vulll, vulll, idx); // expected-error {{no matching function}} expected-error {{argument to '__builtin_s390_vsldb' must be a constant integer}} + // expected-note@vecintrin.h:* 10 {{candidate function not viable}} // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 3}} vd = vec_sldw(vd, vd, idx); // expected-error {{no matching function}} expected-error {{argument to '__builtin_s390_vsldb' must be a constant integer}} - // expected-note@vecintrin.h:* 8 {{candidate function not viable}} + // expected-note@vecintrin.h:* 10 {{candidate function not viable}} // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 3}} } diff --git clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c index 06fc1ee05d67..775733ad3b94 100644 --- clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c +++ 
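The expected-note count bumps throughout the zvector-error hunks above (9 to 11 candidates for vec_load_bndry, 12/11 to 14/13 for vec_sld, 8 to 10 for vec_sldw) all come from the new vector signed/unsigned __int128 overloads; the positive tests below exercise the same family. A hedged sketch of the user-visible surface, assuming an arch15 target compiled with -fzvector:

    #include <vecintrin.h>

    // One-element vectors of __int128, declared as in the tests above.
    typedef __attribute__((vector_size(16))) signed __int128 vec_sint128;
    typedef __attribute__((vector_size(16))) unsigned __int128 vec_uint128;

    vec_uint128 demo(vec_uint128 a, vec_uint128 b) {
      vec_uint128 m = vec_and(a, b); // bitwise ops now take __int128 vectors
      return vec_sld(m, b, 3);       // last operand: constant in [0, 15]
    }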
clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c @@ -12,24 +12,29 @@ volatile vector signed char vsc; volatile vector signed short vss; volatile vector signed int vsi; volatile vector signed long long vsl; +volatile vector signed __int128 vslll; volatile vector unsigned char vuc; volatile vector unsigned short vus; volatile vector unsigned int vui; volatile vector unsigned long long vul; +volatile vector unsigned __int128 vulll; volatile vector bool char vbc; volatile vector bool short vbs; volatile vector bool int vbi; volatile vector bool long long vbl; +volatile vector bool __int128 vblll; volatile vector double vd; volatile signed char sc; volatile signed short ss; volatile signed int si; volatile signed long long sl; +volatile signed __int128 slll; volatile unsigned char uc; volatile unsigned short us; volatile unsigned int ui; volatile unsigned long long ul; +volatile unsigned __int128 ulll; volatile double d; const void * volatile cptr; @@ -37,10 +42,12 @@ const signed char * volatile cptrsc; const signed short * volatile cptrss; const signed int * volatile cptrsi; const signed long long * volatile cptrsl; +const signed __int128 * volatile cptrslll; const unsigned char * volatile cptruc; const unsigned short * volatile cptrus; const unsigned int * volatile cptrui; const unsigned long long * volatile cptrul; +const unsigned __int128 * volatile cptrulll; const float * volatile cptrf; const double * volatile cptrd; @@ -49,10 +56,12 @@ signed char * volatile ptrsc; signed short * volatile ptrss; signed int * volatile ptrsi; signed long long * volatile ptrsl; +signed __int128 * volatile ptrslll; unsigned char * volatile ptruc; unsigned short * volatile ptrus; unsigned int * volatile ptrui; unsigned long long * volatile ptrul; +unsigned __int128 * volatile ptrulll; float * volatile ptrf; double * volatile ptrd; @@ -257,6 +266,15 @@ void test_core(void) { vbl = vec_perm(vbl, vbl, vuc); // CHECK: call <16 x i8> @llvm.s390.vperm(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vperm + vslll = vec_perm(vslll, vslll, vuc); + // CHECK: call <16 x i8> @llvm.s390.vperm(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK-ASM: vperm + vulll = vec_perm(vulll, vulll, vuc); + // CHECK: call <16 x i8> @llvm.s390.vperm(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK-ASM: vperm + vblll = vec_perm(vblll, vblll, vuc); + // CHECK: call <16 x i8> @llvm.s390.vperm(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK-ASM: vperm vd = vec_perm(vd, vd, vuc); // CHECK: call <16 x i8> @llvm.s390.vperm(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vperm @@ -322,6 +340,10 @@ void test_core(void) { // CHECK-ASM: vperm vul = vec_revb(vul); // CHECK-ASM: vperm + vslll = vec_revb(vslll); + // CHECK-ASM: vperm + vulll = vec_revb(vulll); + // CHECK-ASM: vperm vd = vec_revb(vd); // CHECK-ASM: vperm @@ -400,6 +422,18 @@ void test_core(void) { // CHECK-ASM: vsel vbl = vec_sel(vbl, vbl, vbl); // CHECK-ASM: vsel + vslll = vec_sel(vslll, vslll, vulll); + // CHECK-ASM: vsel + vslll = vec_sel(vslll, vslll, vblll); + // CHECK-ASM: vsel + vulll = vec_sel(vulll, vulll, vulll); + // CHECK-ASM: vsel + vulll = vec_sel(vulll, vulll, vblll); + // CHECK-ASM: vsel + vblll = vec_sel(vblll, vblll, vulll); + // CHECK-ASM: vsel + vblll = vec_sel(vblll, vblll, vblll); + // CHECK-ASM: vsel vd = vec_sel(vd, vd, vul); // CHECK-ASM: vsel vd = vec_sel(vd, vd, vbl); @@ -503,6 +537,10 @@ void test_core(void) { // CHECK-ASM: vl vul = vec_xl(idx, 
cptrul); // CHECK-ASM: vl + vslll = vec_xl(idx, cptrslll); + // CHECK-ASM: vl + vulll = vec_xl(idx, cptrulll); + // CHECK-ASM: vl vd = vec_xl(idx, cptrd); // CHECK-ASM: vl @@ -554,6 +592,10 @@ void test_core(void) { // CHECK-ASM: vst vec_xst(vul, idx, ptrul); // CHECK-ASM: vst + vec_xst(vslll, idx, ptrslll); + // CHECK-ASM: vst + vec_xst(vulll, idx, ptrulll); + // CHECK-ASM: vst vec_xst(vd, idx, ptrd); // CHECK-ASM: vst @@ -613,6 +655,12 @@ void test_core(void) { vul = vec_load_bndry(cptrul, 64); // CHECK: call <16 x i8> @llvm.s390.vlbb(ptr %{{.*}}, i32 0) // CHECK-ASM: vlbb + vslll = vec_load_bndry(cptrslll, 64); + // CHECK: call <16 x i8> @llvm.s390.vlbb(ptr %{{.*}}, i32 0) + // CHECK-ASM: vlbb + vulll = vec_load_bndry(cptrulll, 64); + // CHECK: call <16 x i8> @llvm.s390.vlbb(ptr %{{.*}}, i32 0) + // CHECK-ASM: vlbb vd = vec_load_bndry(cptrd, 64); // CHECK: call <16 x i8> @llvm.s390.vlbb(ptr %{{.*}}, i32 0) // CHECK-ASM: vlbb @@ -867,6 +915,10 @@ void test_core(void) { vd = vec_splats(d); // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> zeroinitializer // CHECK-ASM: vlrepg + vslll = vec_splats(slll); + // CHECK: insertelement <1 x i128> poison, i128 %{{.*}}, i64 0 + vulll = vec_splats(ulll); + // CHECK: insertelement <1 x i128> poison, i128 %{{.*}}, i64 0 vsl = vec_extend_s64(vsc); // CHECK-ASM: vsegb @@ -982,6 +1034,15 @@ void test_core(void) { vbi = vec_pack(vbl, vbl); // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 1, i32 3, i32 5, i32 7> // CHECK-ASM: vpkg + vsl = vec_pack(vslll, vslll); + // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> <i32 1, i32 3> + // CHECK-ASM: vmrlg + vul = vec_pack(vulll, vulll); + // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> <i32 1, i32 3> + // CHECK-ASM: vmrlg + vbl = vec_pack(vblll, vblll); + // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> <i32 1, i32 3> + // CHECK-ASM: vmrlg vsc = vec_packs(vss, vss); // CHECK: call <16 x i8> @llvm.s390.vpksh(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) @@ -2362,6 +2423,105 @@ void test_compare(void) { void test_integer(void) { // CHECK-ASM-LABEL: test_integer + vsc = vec_and(vsc, vsc); + // CHECK-ASM: vn + vuc = vec_and(vuc, vuc); + // CHECK-ASM: vn + vbc = vec_and(vbc, vbc); + // CHECK-ASM: vn + vss = vec_and(vss, vss); + // CHECK-ASM: vn + vus = vec_and(vus, vus); + // CHECK-ASM: vn + vbs = vec_and(vbs, vbs); + // CHECK-ASM: vn + vsi = vec_and(vsi, vsi); + // CHECK-ASM: vn + vui = vec_and(vui, vui); + // CHECK-ASM: vn + vbi = vec_and(vbi, vbi); + // CHECK-ASM: vn + vsl = vec_and(vsl, vsl); + // CHECK-ASM: vn + vul = vec_and(vul, vul); + // CHECK-ASM: vn + vbl = vec_and(vbl, vbl); + // CHECK-ASM: vn + vslll = vec_and(vslll, vslll); + // CHECK-ASM: vn + vulll = vec_and(vulll, vulll); + // CHECK-ASM: vn + vblll = vec_and(vblll, vblll); + // CHECK-ASM: vn + vd = vec_and(vd, vd); + // CHECK-ASM: vn + + vsc = vec_or(vsc, vsc); + // CHECK-ASM: vo + vuc = vec_or(vuc, vuc); + // CHECK-ASM: vo + vbc = vec_or(vbc, vbc); + // CHECK-ASM: vo + vss = vec_or(vss, vss); + // CHECK-ASM: vo + vus = vec_or(vus, vus); + // CHECK-ASM: vo + vbs = vec_or(vbs, vbs); + // CHECK-ASM: vo + vsi = vec_or(vsi, vsi); + // CHECK-ASM: vo + vui = vec_or(vui, vui); + // CHECK-ASM: vo + vbi = vec_or(vbi, vbi); + // CHECK-ASM: vo + vsl = vec_or(vsl, vsl); + // CHECK-ASM: vo + vul = vec_or(vul, vul); + // CHECK-ASM: vo + vbl = vec_or(vbl, vbl); + // CHECK-ASM: vo + vslll = vec_or(vslll, vslll); + // CHECK-ASM: vo + vulll = 
vec_or(vulll, vulll); + // CHECK-ASM: vo + vblll = vec_or(vblll, vblll); + // CHECK-ASM: vo + vd = vec_or(vd, vd); + // CHECK-ASM: vo + + vsc = vec_xor(vsc, vsc); + // CHECK-ASM: vx + vuc = vec_xor(vuc, vuc); + // CHECK-ASM: vx + vbc = vec_xor(vbc, vbc); + // CHECK-ASM: vx + vss = vec_xor(vss, vss); + // CHECK-ASM: vx + vus = vec_xor(vus, vus); + // CHECK-ASM: vx + vbs = vec_xor(vbs, vbs); + // CHECK-ASM: vx + vsi = vec_xor(vsi, vsi); + // CHECK-ASM: vx + vui = vec_xor(vui, vui); + // CHECK-ASM: vx + vbi = vec_xor(vbi, vbi); + // CHECK-ASM: vx + vsl = vec_xor(vsl, vsl); + // CHECK-ASM: vx + vul = vec_xor(vul, vul); + // CHECK-ASM: vx + vbl = vec_xor(vbl, vbl); + // CHECK-ASM: vx + vslll = vec_xor(vslll, vslll); + // CHECK-ASM: vx + vulll = vec_xor(vulll, vulll); + // CHECK-ASM: vx + vblll = vec_xor(vblll, vblll); + // CHECK-ASM: vx + vd = vec_xor(vd, vd); + // CHECK-ASM: vx + vsc = vec_andc(vsc, vsc); // CHECK-ASM: vnc vsc = vec_andc(vsc, vbc); @@ -2418,6 +2578,12 @@ void test_integer(void) { // CHECK-ASM: vnc vbl = vec_andc(vbl, vbl); // CHECK-ASM: vnc + vslll = vec_andc(vslll, vslll); + // CHECK-ASM: vnc + vulll = vec_andc(vulll, vulll); + // CHECK-ASM: vnc + vblll = vec_andc(vblll, vblll); + // CHECK-ASM: vnc vd = vec_andc(vd, vd); // CHECK-ASM: vnc vd = vec_andc(vd, vbl); @@ -2481,6 +2647,12 @@ void test_integer(void) { // CHECK-ASM: vno vbl = vec_nor(vbl, vbl); // CHECK-ASM: vno + vslll = vec_nor(vslll, vslll); + // CHECK-ASM: vno + vulll = vec_nor(vulll, vulll); + // CHECK-ASM: vno + vblll = vec_nor(vblll, vblll); + // CHECK-ASM: vno vd = vec_nor(vd, vd); // CHECK-ASM: vno vd = vec_nor(vd, vbl); @@ -2770,6 +2942,12 @@ void test_integer(void) { vbl = vec_sll(vbl, vui); // CHECK: call <16 x i8> @llvm.s390.vsl(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vsl + vslll = vec_sll(vslll, vuc); + // CHECK: call <16 x i8> @llvm.s390.vsl(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK-ASM: vsl + vulll = vec_sll(vulll, vuc); + // CHECK: call <16 x i8> @llvm.s390.vsl(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK-ASM: vsl vsc = vec_slb(vsc, vsc); // CHECK: call <16 x i8> @llvm.s390.vslb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) @@ -2789,42 +2967,69 @@ void test_integer(void) { vss = vec_slb(vss, vus); // CHECK: call <16 x i8> @llvm.s390.vslb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vslb + vss = vec_slb(vss, vuc); + // CHECK: call <16 x i8> @llvm.s390.vslb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK-ASM: vslb vus = vec_slb(vus, vss); // CHECK: call <16 x i8> @llvm.s390.vslb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vslb vus = vec_slb(vus, vus); // CHECK: call <16 x i8> @llvm.s390.vslb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vslb + vus = vec_slb(vus, vuc); + // CHECK: call <16 x i8> @llvm.s390.vslb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK-ASM: vslb vsi = vec_slb(vsi, vsi); // CHECK: call <16 x i8> @llvm.s390.vslb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vslb vsi = vec_slb(vsi, vui); // CHECK: call <16 x i8> @llvm.s390.vslb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vslb + vsi = vec_slb(vsi, vuc); + // CHECK: call <16 x i8> @llvm.s390.vslb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK-ASM: vslb vui = vec_slb(vui, vsi); // CHECK: call <16 x i8> @llvm.s390.vslb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vslb vui = vec_slb(vui, vui); // CHECK: call <16 x i8> @llvm.s390.vslb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vslb + vui = vec_slb(vui, vuc); + // CHECK: call <16 x i8> @llvm.s390.vslb(<16 x i8> %{{.*}}, 
<16 x i8> %{{.*}}) + // CHECK-ASM: vslb vsl = vec_slb(vsl, vsl); // CHECK: call <16 x i8> @llvm.s390.vslb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vslb vsl = vec_slb(vsl, vul); // CHECK: call <16 x i8> @llvm.s390.vslb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vslb + vsl = vec_slb(vsl, vuc); + // CHECK: call <16 x i8> @llvm.s390.vslb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK-ASM: vslb vul = vec_slb(vul, vsl); // CHECK: call <16 x i8> @llvm.s390.vslb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vslb vul = vec_slb(vul, vul); // CHECK: call <16 x i8> @llvm.s390.vslb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vslb + vul = vec_slb(vul, vuc); + // CHECK: call <16 x i8> @llvm.s390.vslb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK-ASM: vslb + vslll = vec_slb(vslll, vuc); + // CHECK: call <16 x i8> @llvm.s390.vslb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK-ASM: vslb + vulll = vec_slb(vulll, vuc); + // CHECK: call <16 x i8> @llvm.s390.vslb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK-ASM: vslb vd = vec_slb(vd, vsl); // CHECK: call <16 x i8> @llvm.s390.vslb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vslb vd = vec_slb(vd, vul); // CHECK: call <16 x i8> @llvm.s390.vslb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vslb + vd = vec_slb(vd, vuc); + // CHECK: call <16 x i8> @llvm.s390.vslb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK-ASM: vslb vsc = vec_sld(vsc, vsc, 0); // CHECK: call <16 x i8> @llvm.s390.vsldb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 0) @@ -2898,6 +3103,18 @@ void test_integer(void) { vbl = vec_sld(vbl, vbl, 15); // CHECK: call <16 x i8> @llvm.s390.vsldb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 15) // CHECK-ASM: vsldb + vslll = vec_sld(vslll, vslll, 0); + // CHECK: call <16 x i8> @llvm.s390.vsldb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 0) + // CHECK-ASM: vsldb + vslll = vec_sld(vslll, vslll, 15); + // CHECK: call <16 x i8> @llvm.s390.vsldb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 15) + // CHECK-ASM: vsldb + vulll = vec_sld(vulll, vulll, 0); + // CHECK: call <16 x i8> @llvm.s390.vsldb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 0) + // CHECK-ASM: vsldb + vulll = vec_sld(vulll, vulll, 15); + // CHECK: call <16 x i8> @llvm.s390.vsldb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 15) + // CHECK-ASM: vsldb vd = vec_sld(vd, vd, 0); // CHECK: call <16 x i8> @llvm.s390.vsldb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 0) // CHECK-ASM: vsldb @@ -2953,6 +3170,18 @@ void test_integer(void) { vul = vec_sldw(vul, vul, 3); // CHECK: call <16 x i8> @llvm.s390.vsldb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 12) // CHECK-ASM: vsldb + vslll = vec_sldw(vslll, vslll, 0); + // CHECK: call <16 x i8> @llvm.s390.vsldb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 0) + // CHECK-ASM: vsldb + vslll = vec_sldw(vslll, vslll, 3); + // CHECK: call <16 x i8> @llvm.s390.vsldb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 12) + // CHECK-ASM: vsldb + vulll = vec_sldw(vulll, vulll, 0); + // CHECK: call <16 x i8> @llvm.s390.vsldb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 0) + // CHECK-ASM: vsldb + vulll = vec_sldw(vulll, vulll, 3); + // CHECK: call <16 x i8> @llvm.s390.vsldb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 12) + // CHECK-ASM: vsldb vd = vec_sldw(vd, vd, 0); // CHECK: call <16 x i8> @llvm.s390.vsldb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 0) // CHECK-ASM: vsldb @@ -3068,6 +3297,12 @@ void test_integer(void) { vbl = vec_sral(vbl, vui); // CHECK: call <16 x i8> @llvm.s390.vsra(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vsra + vslll = 
vec_sral(vslll, vuc); + // CHECK: call <16 x i8> @llvm.s390.vsra(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK-ASM: vsra + vulll = vec_sral(vulll, vuc); + // CHECK: call <16 x i8> @llvm.s390.vsra(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK-ASM: vsra vsc = vec_srab(vsc, vsc); // CHECK: call <16 x i8> @llvm.s390.vsrab(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) @@ -3087,42 +3322,69 @@ void test_integer(void) { vss = vec_srab(vss, vus); // CHECK: call <16 x i8> @llvm.s390.vsrab(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vsrab + vss = vec_srab(vss, vuc); + // CHECK: call <16 x i8> @llvm.s390.vsrab(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK-ASM: vsrab vus = vec_srab(vus, vss); // CHECK: call <16 x i8> @llvm.s390.vsrab(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vsrab vus = vec_srab(vus, vus); // CHECK: call <16 x i8> @llvm.s390.vsrab(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vsrab + vus = vec_srab(vus, vuc); + // CHECK: call <16 x i8> @llvm.s390.vsrab(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK-ASM: vsrab vsi = vec_srab(vsi, vsi); // CHECK: call <16 x i8> @llvm.s390.vsrab(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vsrab vsi = vec_srab(vsi, vui); // CHECK: call <16 x i8> @llvm.s390.vsrab(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vsrab + vsi = vec_srab(vsi, vuc); + // CHECK: call <16 x i8> @llvm.s390.vsrab(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK-ASM: vsrab vui = vec_srab(vui, vsi); // CHECK: call <16 x i8> @llvm.s390.vsrab(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vsrab vui = vec_srab(vui, vui); // CHECK: call <16 x i8> @llvm.s390.vsrab(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vsrab + vui = vec_srab(vui, vuc); + // CHECK: call <16 x i8> @llvm.s390.vsrab(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK-ASM: vsrab vsl = vec_srab(vsl, vsl); // CHECK: call <16 x i8> @llvm.s390.vsrab(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vsrab vsl = vec_srab(vsl, vul); // CHECK: call <16 x i8> @llvm.s390.vsrab(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vsrab + vsl = vec_srab(vsl, vuc); + // CHECK: call <16 x i8> @llvm.s390.vsrab(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK-ASM: vsrab vul = vec_srab(vul, vsl); // CHECK: call <16 x i8> @llvm.s390.vsrab(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vsrab vul = vec_srab(vul, vul); // CHECK: call <16 x i8> @llvm.s390.vsrab(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vsrab + vul = vec_srab(vul, vuc); + // CHECK: call <16 x i8> @llvm.s390.vsrab(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK-ASM: vsrab + vslll = vec_srab(vslll, vuc); + // CHECK: call <16 x i8> @llvm.s390.vsrab(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK-ASM: vsrab + vulll = vec_srab(vulll, vuc); + // CHECK: call <16 x i8> @llvm.s390.vsrab(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK-ASM: vsrab vd = vec_srab(vd, vsl); // CHECK: call <16 x i8> @llvm.s390.vsrab(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vsrab vd = vec_srab(vd, vul); // CHECK: call <16 x i8> @llvm.s390.vsrab(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vsrab + vd = vec_srab(vd, vuc); + // CHECK: call <16 x i8> @llvm.s390.vsrab(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK-ASM: vsrab vsc = vec_srl(vsc, vuc); // CHECK: call <16 x i8> @llvm.s390.vsrl(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) @@ -3232,6 +3494,12 @@ void test_integer(void) { vbl = vec_srl(vbl, vui); // CHECK: call <16 x i8> @llvm.s390.vsrl(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vsrl + vslll = vec_srl(vslll, vuc); + // 
CHECK: call <16 x i8> @llvm.s390.vsrl(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK-ASM: vsrl + vulll = vec_srl(vulll, vuc); + // CHECK: call <16 x i8> @llvm.s390.vsrl(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK-ASM: vsrl vsc = vec_srb(vsc, vsc); // CHECK: call <16 x i8> @llvm.s390.vsrlb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) @@ -3251,42 +3519,69 @@ void test_integer(void) { vss = vec_srb(vss, vus); // CHECK: call <16 x i8> @llvm.s390.vsrlb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vsrlb + vss = vec_srb(vss, vuc); + // CHECK: call <16 x i8> @llvm.s390.vsrlb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK-ASM: vsrlb vus = vec_srb(vus, vss); // CHECK: call <16 x i8> @llvm.s390.vsrlb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vsrlb vus = vec_srb(vus, vus); // CHECK: call <16 x i8> @llvm.s390.vsrlb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vsrlb + vus = vec_srb(vus, vuc); + // CHECK: call <16 x i8> @llvm.s390.vsrlb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK-ASM: vsrlb vsi = vec_srb(vsi, vsi); // CHECK: call <16 x i8> @llvm.s390.vsrlb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vsrlb vsi = vec_srb(vsi, vui); // CHECK: call <16 x i8> @llvm.s390.vsrlb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vsrlb + vsi = vec_srb(vsi, vuc); + // CHECK: call <16 x i8> @llvm.s390.vsrlb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK-ASM: vsrlb vui = vec_srb(vui, vsi); // CHECK: call <16 x i8> @llvm.s390.vsrlb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vsrlb vui = vec_srb(vui, vui); // CHECK: call <16 x i8> @llvm.s390.vsrlb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vsrlb + vui = vec_srb(vui, vuc); + // CHECK: call <16 x i8> @llvm.s390.vsrlb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK-ASM: vsrlb vsl = vec_srb(vsl, vsl); // CHECK: call <16 x i8> @llvm.s390.vsrlb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vsrlb vsl = vec_srb(vsl, vul); // CHECK: call <16 x i8> @llvm.s390.vsrlb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vsrlb + vsl = vec_srb(vsl, vuc); + // CHECK: call <16 x i8> @llvm.s390.vsrlb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK-ASM: vsrlb vul = vec_srb(vul, vsl); // CHECK: call <16 x i8> @llvm.s390.vsrlb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vsrlb vul = vec_srb(vul, vul); // CHECK: call <16 x i8> @llvm.s390.vsrlb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vsrlb + vul = vec_srb(vul, vuc); + // CHECK: call <16 x i8> @llvm.s390.vsrlb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK-ASM: vsrlb + vslll = vec_srb(vslll, vuc); + // CHECK: call <16 x i8> @llvm.s390.vsrlb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK-ASM: vsrlb + vulll = vec_srb(vulll, vuc); + // CHECK: call <16 x i8> @llvm.s390.vsrlb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK-ASM: vsrlb vd = vec_srb(vd, vsl); // CHECK: call <16 x i8> @llvm.s390.vsrlb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vsrlb vd = vec_srb(vd, vul); // CHECK: call <16 x i8> @llvm.s390.vsrlb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vsrlb + vd = vec_srb(vd, vuc); + // CHECK: call <16 x i8> @llvm.s390.vsrlb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK-ASM: vsrlb vsc = vec_abs(vsc); // CHECK-ASM: vlcb @@ -3345,6 +3640,10 @@ void test_integer(void) { // CHECK-ASM: vmxlg vul = vec_max(vbl, vul); // CHECK-ASM: vmxlg + vslll = vec_max(vslll, vslll); + // (emulated) + vulll = vec_max(vulll, vulll); + // (emulated) vd = vec_max(vd, vd); // (emulated) @@ -3396,6 +3695,10 @@ void test_integer(void) { // CHECK-ASM: vmnlg vul = 
vec_min(vbl, vul); // CHECK-ASM: vmnlg + vslll = vec_min(vslll, vslll); + // (emulated) + vulll = vec_min(vulll, vulll); + // (emulated) vd = vec_min(vd, vd); // (emulated) @@ -3411,6 +3714,16 @@ void test_integer(void) { vul = vec_addc(vul, vul); // CHECK: call <2 x i64> @llvm.s390.vaccg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}) // CHECK-ASM: vaccg + vulll = vec_addc(vulll, vulll); + // CHECK: call i128 @llvm.s390.vaccq(i128 %{{.*}}, i128 %{{.*}}) + // CHECK-ASM: vaccq + + vulll = vec_adde(vulll, vulll, vulll); + // CHECK: call i128 @llvm.s390.vacq(i128 %{{.*}}, i128 %{{.*}}, i128 %{{.*}}) + // CHECK-ASM: vacq + vulll = vec_addec(vulll, vulll, vulll); + // CHECK: call i128 @llvm.s390.vacccq(i128 %{{.*}}, i128 %{{.*}}, i128 %{{.*}}) + // CHECK-ASM: vacccq vuc = vec_add_u128(vuc, vuc); // CHECK-ASM: vaq @@ -3462,6 +3775,9 @@ void test_integer(void) { vul = vec_gfmsum(vui, vui); // CHECK: call <2 x i64> @llvm.s390.vgfmf(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}) // CHECK-ASM: vgfmf + vulll = vec_gfmsum(vul, vul); + // CHECK: call i128 @llvm.s390.vgfmg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}) + // CHECK-ASM: vgfmg vuc = vec_gfmsum_128(vul, vul); // CHECK: call i128 @llvm.s390.vgfmg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}) // CHECK-ASM: vgfmg @@ -3475,6 +3791,9 @@ void test_integer(void) { vul = vec_gfmsum_accum(vui, vui, vul); // CHECK: call <2 x i64> @llvm.s390.vgfmaf(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <2 x i64> %{{.*}}) // CHECK-ASM: vgfmaf + vulll = vec_gfmsum_accum(vul, vul, vulll); + // CHECK: call i128 @llvm.s390.vgfmag(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, i128 %{{.*}}) + // CHECK-ASM: vgfmag vuc = vec_gfmsum_accum_128(vul, vul, vuc); // CHECK: call i128 @llvm.s390.vgfmag(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, i128 %{{.*}}) // CHECK-ASM: vgfmag @@ -3630,6 +3949,16 @@ void test_integer(void) { vul = vec_subc(vul, vul); // CHECK: call <2 x i64> @llvm.s390.vscbig(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}) // CHECK-ASM: vscbig + vulll = vec_subc(vulll, vulll); + // CHECK: call i128 @llvm.s390.vscbiq(i128 %{{.*}}, i128 %{{.*}}) + // CHECK-ASM: vscbiq + + vulll = vec_sube(vulll, vulll, vulll); + // CHECK: call i128 @llvm.s390.vsbiq(i128 %{{.*}}, i128 %{{.*}}, i128 %{{.*}}) + // CHECK-ASM: vsbiq + vulll = vec_subec(vulll, vulll, vulll); + // CHECK: call i128 @llvm.s390.vsbcbiq(i128 %{{.*}}, i128 %{{.*}}, i128 %{{.*}}) + // CHECK-ASM: vsbcbiq vuc = vec_sub_u128(vuc, vuc); // CHECK-ASM: vsq @@ -3655,6 +3984,12 @@ void test_integer(void) { vul = vec_sum2(vui, vui); // CHECK: call <2 x i64> @llvm.s390.vsumgf(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}) // CHECK-ASM: vsumgf + vulll = vec_sum(vui, vui); + // CHECK: call i128 @llvm.s390.vsumqf(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK-ASM: vsumqf + vulll = vec_sum(vul, vul); + // CHECK: call i128 @llvm.s390.vsumqg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}) + // CHECK-ASM: vsumqg vuc = vec_sum_u128(vui, vui); // CHECK: call i128 @llvm.s390.vsumqf(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}) // CHECK-ASM: vsumqf @@ -3686,6 +4021,12 @@ void test_integer(void) { idx = vec_test_mask(vul, vul); // CHECK: call i32 @llvm.s390.vtm(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vtm + idx = vec_test_mask(vslll, vulll); + // CHECK: call i32 @llvm.s390.vtm(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK-ASM: vtm + idx = vec_test_mask(vulll, vulll); + // CHECK: call i32 @llvm.s390.vtm(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK-ASM: vtm idx = vec_test_mask(vd, vul); // CHECK: call i32 @llvm.s390.vtm(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vtm diff --git 
clang/test/CodeGen/SystemZ/builtins-systemz-zvector2-error.c clang/test/CodeGen/SystemZ/builtins-systemz-zvector2-error.c index 127b0f67e85c..0f2841d99c3a 100644 --- clang/test/CodeGen/SystemZ/builtins-systemz-zvector2-error.c +++ clang/test/CodeGen/SystemZ/builtins-systemz-zvector2-error.c @@ -9,10 +9,12 @@ volatile vector signed char vsc; volatile vector signed short vss; volatile vector signed int vsi; volatile vector signed long long vsl; +volatile vector signed __int128 vslll; volatile vector unsigned char vuc; volatile vector unsigned short vus; volatile vector unsigned int vui; volatile vector unsigned long long vul; +volatile vector unsigned __int128 vulll; volatile vector bool char vbc; volatile vector bool short vbs; volatile vector bool int vbi; @@ -120,12 +122,19 @@ void test_core(void) { void test_integer(void) { vf = vec_sld(vf, vf, idx); // expected-error {{no matching function}} expected-error {{argument to '__builtin_s390_vsldb' must be a constant integer}} - // expected-note@vecintrin.h:* 13 {{candidate function not viable}} + // expected-note@vecintrin.h:* 15 {{candidate function not viable}} // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 15}} vd = vec_sld(vd, vd, idx); // expected-error {{no matching function}} expected-error {{argument to '__builtin_s390_vsldb' must be a constant integer}} - // expected-note@vecintrin.h:* 13 {{candidate function not viable}} + // expected-note@vecintrin.h:* 15 {{candidate function not viable}} // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 15}} + vulll = vec_msum(vul, vul, vulll, idx); // expected-error {{no matching function}} expected-error {{argument to '__builtin_s390_vmslg' must be a constant integer}} + // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 15}} + vulll = vec_msum(vul, vul, vulll, -1); // expected-error {{no matching function}} expected-error {{argument value -1 is outside the valid range [0, 15]}} + // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 15}} + vulll = vec_msum(vul, vul, vulll, 16); // expected-error {{no matching function}} expected-error {{argument value 16 is outside the valid range [0, 15]}} + // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 15}} + vuc = vec_msum_u128(vul, vul, vuc, idx); // expected-error {{no matching function}} expected-error {{argument to '__builtin_s390_vmslg' must be a constant integer}} // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 15}} vuc = vec_msum_u128(vul, vul, vuc, -1); // expected-error {{no matching function}} expected-error {{argument value -1 is outside the valid range [0, 15]}} diff --git clang/test/CodeGen/SystemZ/builtins-systemz-zvector2.c clang/test/CodeGen/SystemZ/builtins-systemz-zvector2.c index 15e72ecf51da..60df95817a32 100644 --- clang/test/CodeGen/SystemZ/builtins-systemz-zvector2.c +++ clang/test/CodeGen/SystemZ/builtins-systemz-zvector2.c @@ -12,14 +12,17 @@ volatile vector signed char vsc; volatile vector signed short vss; volatile vector signed int vsi; volatile vector signed long long vsl; +volatile vector signed __int128 vslll; volatile vector unsigned char vuc; volatile vector unsigned short vus; volatile vector unsigned int vui; volatile vector unsigned long long vul; +volatile vector unsigned __int128 vulll; volatile vector bool char vbc; volatile vector bool short vbs; volatile vector bool int vbi; volatile vector bool long long vbl; +volatile vector bool __int128 vblll; volatile vector float vf; volatile 
vector double vd; @@ -122,6 +125,10 @@ void test_core(void) { // CHECK: call <16 x i8> @llvm.s390.vperm(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vperm + vul = vec_bperm(vulll, vuc); + // CHECK: call <2 x i64> @llvm.s390.vbperm(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK-ASM: vbperm + vul = vec_bperm_u128(vuc, vuc); // CHECK: call <2 x i64> @llvm.s390.vbperm(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vbperm @@ -220,6 +227,12 @@ void test_core(void) { // CHECK: call void @llvm.s390.vstl(<16 x i8> %{{.*}}, i32 %{{.*}}, ptr %{{.*}}) // CHECK-ASM: vstl + vsc = vec_load_len_r(cptrsc, 0); + // CHECK: call <16 x i8> @llvm.s390.vlrl(i32 0, ptr %{{.*}}) + // CHECK-ASM: vlrl %{{.*}}, 0(%{{.*}}), 0 + vsc = vec_load_len_r(cptrsc, idx); + // CHECK: call <16 x i8> @llvm.s390.vlrl(i32 %{{.*}}, ptr %{{.*}}) + // CHECK-ASM: vlrlr vuc = vec_load_len_r(cptruc, 0); // CHECK: call <16 x i8> @llvm.s390.vlrl(i32 0, ptr %{{.*}}) // CHECK-ASM: vlrl %{{.*}}, 0(%{{.*}}), 0 @@ -227,6 +240,12 @@ void test_core(void) { // CHECK: call <16 x i8> @llvm.s390.vlrl(i32 %{{.*}}, ptr %{{.*}}) // CHECK-ASM: vlrlr + vec_store_len_r(vsc, ptrsc, 0); + // CHECK: call void @llvm.s390.vstrl(<16 x i8> %{{.*}}, i32 0, ptr %{{.*}}) + // CHECK-ASM: vstrl %{{.*}}, 0(%{{.*}}), 0 + vec_store_len_r(vsc, ptrsc, idx); + // CHECK: call void @llvm.s390.vstrl(<16 x i8> %{{.*}}, i32 %{{.*}}, ptr %{{.*}}) + // CHECK-ASM: vstrlr vec_store_len_r(vuc, ptruc, 0); // CHECK: call void @llvm.s390.vstrl(<16 x i8> %{{.*}}, i32 0, ptr %{{.*}}) // CHECK-ASM: vstrl %{{.*}}, 0(%{{.*}}), 0 @@ -479,6 +498,21 @@ void test_compare(void) { void test_integer(void) { // CHECK-ASM-LABEL: test_integer + vf = vec_and(vf, vf); + // CHECK-ASM: vn + vd = vec_and(vd, vd); + // CHECK-ASM: vn + + vf = vec_or(vf, vf); + // CHECK-ASM: vo + vd = vec_or(vd, vd); + // CHECK-ASM: vo + + vf = vec_xor(vf, vf); + // CHECK-ASM: vx + vd = vec_xor(vd, vd); + // CHECK-ASM: vx + vf = vec_andc(vf, vf); // CHECK-ASM: vnc vd = vec_andc(vd, vd); @@ -513,6 +547,12 @@ void test_integer(void) { // CHECK-ASM: vnn vbl = vec_nand(vbl, vbl); // CHECK-ASM: vnn + vslll = vec_nand(vslll, vslll); + // CHECK-ASM: vnn + vulll = vec_nand(vulll, vulll); + // CHECK-ASM: vnn + vblll = vec_nand(vblll, vblll); + // CHECK-ASM: vnn vf = vec_nand(vf, vf); // CHECK-ASM: vnn vd = vec_nand(vd, vd); @@ -542,6 +582,12 @@ void test_integer(void) { // CHECK-ASM: voc vbl = vec_orc(vbl, vbl); // CHECK-ASM: voc + vslll = vec_orc(vslll, vslll); + // CHECK-ASM: voc + vulll = vec_orc(vulll, vulll); + // CHECK-ASM: voc + vblll = vec_orc(vblll, vblll); + // CHECK-ASM: voc vf = vec_orc(vf, vf); // CHECK-ASM: voc vd = vec_orc(vd, vd); @@ -571,6 +617,12 @@ void test_integer(void) { // CHECK-ASM: vnx vbl = vec_eqv(vbl, vbl); // CHECK-ASM: vnx + vslll = vec_eqv(vslll, vslll); + // CHECK-ASM: vnx + vulll = vec_eqv(vulll, vulll); + // CHECK-ASM: vnx + vblll = vec_eqv(vblll, vblll); + // CHECK-ASM: vnx vf = vec_eqv(vf, vf); // CHECK-ASM: vnx vd = vec_eqv(vd, vd); @@ -607,12 +659,18 @@ void test_integer(void) { vf = vec_slb(vf, vui); // CHECK: call <16 x i8> @llvm.s390.vslb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vslb + vf = vec_slb(vf, vuc); + // CHECK: call <16 x i8> @llvm.s390.vslb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK-ASM: vslb vd = vec_slb(vd, vsl); // CHECK: call <16 x i8> @llvm.s390.vslb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vslb vd = vec_slb(vd, vul); // CHECK: call <16 x i8> @llvm.s390.vslb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vslb + vd 
= vec_slb(vd, vuc); + // CHECK: call <16 x i8> @llvm.s390.vslb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK-ASM: vslb vf = vec_sld(vf, vf, 0); // CHECK: call <16 x i8> @llvm.s390.vsldb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 0) @@ -633,12 +691,18 @@ void test_integer(void) { vf = vec_srab(vf, vui); // CHECK: call <16 x i8> @llvm.s390.vsrab(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vsrab + vf = vec_srab(vf, vuc); + // CHECK: call <16 x i8> @llvm.s390.vsrab(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK-ASM: vsrab vd = vec_srab(vd, vsl); // CHECK: call <16 x i8> @llvm.s390.vsrab(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vsrab vd = vec_srab(vd, vul); // CHECK: call <16 x i8> @llvm.s390.vsrab(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vsrab + vd = vec_srab(vd, vuc); + // CHECK: call <16 x i8> @llvm.s390.vsrab(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK-ASM: vsrab vf = vec_srb(vf, vsi); // CHECK: call <16 x i8> @llvm.s390.vsrlb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) @@ -646,12 +710,18 @@ void test_integer(void) { vf = vec_srb(vf, vui); // CHECK: call <16 x i8> @llvm.s390.vsrlb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vsrlb + vf = vec_srb(vf, vuc); + // CHECK: call <16 x i8> @llvm.s390.vsrlb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK-ASM: vsrlb vd = vec_srb(vd, vsl); // CHECK: call <16 x i8> @llvm.s390.vsrlb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vsrlb vd = vec_srb(vd, vul); // CHECK: call <16 x i8> @llvm.s390.vsrlb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vsrlb + vd = vec_srb(vd, vuc); + // CHECK: call <16 x i8> @llvm.s390.vsrlb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK-ASM: vsrlb idx = vec_test_mask(vf, vui); // CHECK: call i32 @llvm.s390.vtm(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) @@ -660,6 +730,19 @@ void test_integer(void) { // CHECK: call i32 @llvm.s390.vtm(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: vtm + vulll = vec_msum(vul, vul, vulll, 0); + // CHECK: call i128 @llvm.s390.vmslg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, i128 %{{.*}}, i32 0) + // CHECK-ASM: vmslg + vulll = vec_msum(vul, vul, vulll, 4); + // CHECK: call i128 @llvm.s390.vmslg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, i128 %{{.*}}, i32 4) + // CHECK-ASM: vmslg + vulll = vec_msum(vul, vul, vulll, 8); + // CHECK: call i128 @llvm.s390.vmslg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, i128 %{{.*}}, i32 8) + // CHECK-ASM: vmslg + vulll = vec_msum(vul, vul, vulll, 12); + // CHECK: call i128 @llvm.s390.vmslg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, i128 %{{.*}}, i32 12) + // CHECK-ASM: vmslg + vuc = vec_msum_u128(vul, vul, vuc, 0); // CHECK: call i128 @llvm.s390.vmslg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, i128 %{{.*}}, i32 0) // CHECK-ASM: vmslg diff --git clang/test/CodeGen/SystemZ/builtins-systemz-zvector3-error.c clang/test/CodeGen/SystemZ/builtins-systemz-zvector3-error.c index 135e586f38ae..83af380f627d 100644 --- clang/test/CodeGen/SystemZ/builtins-systemz-zvector3-error.c +++ clang/test/CodeGen/SystemZ/builtins-systemz-zvector3-error.c @@ -9,10 +9,12 @@ volatile vector signed char vsc; volatile vector signed short vss; volatile vector signed int vsi; volatile vector signed long long vsl; +volatile vector signed __int128 vslll; volatile vector unsigned char vuc; volatile vector unsigned short vus; volatile vector unsigned int vui; volatile vector unsigned long long vul; +volatile vector unsigned __int128 vulll; volatile vector bool char vbc; volatile vector bool short vbs; volatile vector bool int vbi; @@ -62,83 +64,99 @@ int cc; void 
test_integer(void) { vsc = vec_sldb(vsc, vsc, idx); // expected-error {{no matching function}} \ // expected-error {{argument to '__builtin_s390_vsld' must be a constant integer}} - // expected-note@vecintrin.h:* 9 {{candidate function not viable}} + // expected-note@vecintrin.h:* 11 {{candidate function not viable}} // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 7}} vuc = vec_sldb(vuc, vuc, idx); // expected-error {{no matching function}} \ // expected-error {{argument to '__builtin_s390_vsld' must be a constant integer}} - // expected-note@vecintrin.h:* 9 {{candidate function not viable}} + // expected-note@vecintrin.h:* 11 {{candidate function not viable}} // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 7}} vss = vec_sldb(vss, vss, idx); // expected-error {{no matching function}} \ // expected-error {{argument to '__builtin_s390_vsld' must be a constant integer}} - // expected-note@vecintrin.h:* 9 {{candidate function not viable}} + // expected-note@vecintrin.h:* 11 {{candidate function not viable}} // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 7}} vus = vec_sldb(vus, vus, idx); // expected-error {{no matching function}} \ // expected-error {{argument to '__builtin_s390_vsld' must be a constant integer}} - // expected-note@vecintrin.h:* 9 {{candidate function not viable}} + // expected-note@vecintrin.h:* 11 {{candidate function not viable}} // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 7}} vsi = vec_sldb(vsi, vsi, idx); // expected-error {{no matching function}} \ // expected-error {{argument to '__builtin_s390_vsld' must be a constant integer}} - // expected-note@vecintrin.h:* 9 {{candidate function not viable}} + // expected-note@vecintrin.h:* 11 {{candidate function not viable}} // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 7}} vui = vec_sldb(vui, vui, idx); // expected-error {{no matching function}} \ // expected-error {{argument to '__builtin_s390_vsld' must be a constant integer}} - // expected-note@vecintrin.h:* 9 {{candidate function not viable}} + // expected-note@vecintrin.h:* 11 {{candidate function not viable}} // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 7}} vsl = vec_sldb(vsl, vsl, idx); // expected-error {{no matching function}} \ // expected-error {{argument to '__builtin_s390_vsld' must be a constant integer}} - // expected-note@vecintrin.h:* 9 {{candidate function not viable}} + // expected-note@vecintrin.h:* 11 {{candidate function not viable}} // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 7}} vul = vec_sldb(vul, vul, idx); // expected-error {{no matching function}} \ // expected-error {{argument to '__builtin_s390_vsld' must be a constant integer}} - // expected-note@vecintrin.h:* 9 {{candidate function not viable}} + // expected-note@vecintrin.h:* 11 {{candidate function not viable}} + // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 7}} + vslll = vec_sldb(vslll, vslll, idx); // expected-error {{no matching function}} \ + // expected-error {{argument to '__builtin_s390_vsld' must be a constant integer}} + // expected-note@vecintrin.h:* 11 {{candidate function not viable}} + // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 7}} + vulll = vec_sldb(vulll, vulll, idx); // expected-error {{no matching function}} \ + // expected-error {{argument to '__builtin_s390_vsld' must be a constant integer}} + // expected-note@vecintrin.h:* 11 {{candidate 
function not viable}} // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 7}} vf = vec_sldb(vf, vf, idx); // expected-error {{no matching function}} \ // expected-error {{argument to '__builtin_s390_vsld' must be a constant integer}} - // expected-note@vecintrin.h:* 9 {{candidate function not viable}} + // expected-note@vecintrin.h:* 11 {{candidate function not viable}} // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 7}} vd = vec_sldb(vd, vd, idx); // expected-error {{no matching function}} \ // expected-error {{argument to '__builtin_s390_vsld' must be a constant integer}} - // expected-note@vecintrin.h:* 9 {{candidate function not viable}} + // expected-note@vecintrin.h:* 11 {{candidate function not viable}} // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 7}} vsc = vec_srdb(vsc, vsc, idx); // expected-error {{no matching function}} \ // expected-error {{argument to '__builtin_s390_vsrd' must be a constant integer}} - // expected-note@vecintrin.h:* 9 {{candidate function not viable}} + // expected-note@vecintrin.h:* 11 {{candidate function not viable}} // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 7}} vuc = vec_srdb(vuc, vuc, idx); // expected-error {{no matching function}} \ // expected-error {{argument to '__builtin_s390_vsrd' must be a constant integer}} - // expected-note@vecintrin.h:* 9 {{candidate function not viable}} + // expected-note@vecintrin.h:* 11 {{candidate function not viable}} // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 7}} vss = vec_srdb(vss, vss, idx); // expected-error {{no matching function}} \ // expected-error {{argument to '__builtin_s390_vsrd' must be a constant integer}} - // expected-note@vecintrin.h:* 9 {{candidate function not viable}} + // expected-note@vecintrin.h:* 11 {{candidate function not viable}} // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 7}} vus = vec_srdb(vus, vus, idx); // expected-error {{no matching function}} \ // expected-error {{argument to '__builtin_s390_vsrd' must be a constant integer}} - // expected-note@vecintrin.h:* 9 {{candidate function not viable}} + // expected-note@vecintrin.h:* 11 {{candidate function not viable}} // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 7}} vsi = vec_srdb(vsi, vsi, idx); // expected-error {{no matching function}} \ // expected-error {{argument to '__builtin_s390_vsrd' must be a constant integer}} - // expected-note@vecintrin.h:* 9 {{candidate function not viable}} + // expected-note@vecintrin.h:* 11 {{candidate function not viable}} // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 7}} vui = vec_srdb(vui, vui, idx); // expected-error {{no matching function}} \ // expected-error {{argument to '__builtin_s390_vsrd' must be a constant integer}} - // expected-note@vecintrin.h:* 9 {{candidate function not viable}} + // expected-note@vecintrin.h:* 11 {{candidate function not viable}} // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 7}} vsl = vec_srdb(vsl, vsl, idx); // expected-error {{no matching function}} \ // expected-error {{argument to '__builtin_s390_vsrd' must be a constant integer}} - // expected-note@vecintrin.h:* 9 {{candidate function not viable}} + // expected-note@vecintrin.h:* 11 {{candidate function not viable}} // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 7}} vul = vec_srdb(vul, vul, idx); // expected-error {{no matching function}} \ // expected-error 
{{argument to '__builtin_s390_vsrd' must be a constant integer}} - // expected-note@vecintrin.h:* 9 {{candidate function not viable}} + // expected-note@vecintrin.h:* 11 {{candidate function not viable}} + // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 7}} + vslll = vec_srdb(vslll, vslll, idx); // expected-error {{no matching function}} \ + // expected-error {{argument to '__builtin_s390_vsrd' must be a constant integer}} + // expected-note@vecintrin.h:* 11 {{candidate function not viable}} + // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 7}} + vulll = vec_srdb(vulll, vulll, idx); // expected-error {{no matching function}} \ + // expected-error {{argument to '__builtin_s390_vsrd' must be a constant integer}} + // expected-note@vecintrin.h:* 11 {{candidate function not viable}} // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 7}} vf = vec_srdb(vf, vf, idx); // expected-error {{no matching function}} \ // expected-error {{argument to '__builtin_s390_vsrd' must be a constant integer}} - // expected-note@vecintrin.h:* 9 {{candidate function not viable}} + // expected-note@vecintrin.h:* 11 {{candidate function not viable}} // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 7}} vd = vec_srdb(vd, vd, idx); // expected-error {{no matching function}} \ // expected-error {{argument to '__builtin_s390_vsrd' must be a constant integer}} - // expected-note@vecintrin.h:* 9 {{candidate function not viable}} + // expected-note@vecintrin.h:* 11 {{candidate function not viable}} // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 7}} } diff --git clang/test/CodeGen/SystemZ/builtins-systemz-zvector3.c clang/test/CodeGen/SystemZ/builtins-systemz-zvector3.c index 1b0520d471f9..ce8b31512723 100644 --- clang/test/CodeGen/SystemZ/builtins-systemz-zvector3.c +++ clang/test/CodeGen/SystemZ/builtins-systemz-zvector3.c @@ -12,10 +12,12 @@ volatile vector signed char vsc; volatile vector signed short vss; volatile vector signed int vsi; volatile vector signed long long vsl; +volatile vector signed __int128 vslll; volatile vector unsigned char vuc; volatile vector unsigned short vus; volatile vector unsigned int vui; volatile vector unsigned long long vul; +volatile vector unsigned __int128 vulll; volatile vector bool char vbc; volatile vector bool short vbs; volatile vector bool int vbi; @@ -39,10 +41,12 @@ const signed char * volatile cptrsc; const signed short * volatile cptrss; const signed int * volatile cptrsi; const signed long long * volatile cptrsl; +const signed __int128 * volatile cptrslll; const unsigned char * volatile cptruc; const unsigned short * volatile cptrus; const unsigned int * volatile cptrui; const unsigned long long * volatile cptrul; +const unsigned __int128 * volatile cptrulll; const float * volatile cptrf; const double * volatile cptrd; @@ -51,10 +55,12 @@ signed char * volatile ptrsc; signed short * volatile ptrss; signed int * volatile ptrsi; signed long long * volatile ptrsl; +signed __int128 * volatile ptrslll; unsigned char * volatile ptruc; unsigned short * volatile ptrus; unsigned int * volatile ptrui; unsigned long long * volatile ptrul; +unsigned __int128 * volatile ptrulll; float * volatile ptrf; double * volatile ptrd; @@ -85,6 +91,10 @@ void test_core(void) { // CHECK-ASM: vlbrg vul += vec_revb(vec_xl(idx, cptrul)); // CHECK-ASM: vlbrg + vslll += vec_revb(vec_xl(idx, cptrslll)); + // CHECK-ASM: vlbrq + vulll += vec_revb(vec_xl(idx, cptrulll)); + // CHECK-ASM: vlbrq vf 
+= vec_revb(vec_xl(idx, cptrf)); // CHECK-ASM: vlbrf vd += vec_revb(vec_xl(idx, cptrd)); @@ -102,6 +112,10 @@ void test_core(void) { // CHECK-ASM: vstbrg vec_xst(vec_revb(vul), idx, ptrul); // CHECK-ASM: vstbrg + vec_xst(vec_revb(vslll), idx, ptrslll); + // CHECK-ASM: vstbrq + vec_xst(vec_revb(vulll), idx, ptrulll); + // CHECK-ASM: vstbrq vec_xst(vec_revb(vf), idx, ptrf); // CHECK-ASM: vstbrf vec_xst(vec_revb(vd), idx, ptrd); @@ -301,6 +315,18 @@ void test_integer(void) { vul = vec_sldb(vul, vul, 7); // CHECK: call <16 x i8> @llvm.s390.vsld(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 7) // CHECK-ASM: vsld + vslll = vec_sldb(vslll, vslll, 0); + // CHECK: call <16 x i8> @llvm.s390.vsld(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 0) + // CHECK-ASM: vsld + vslll = vec_sldb(vslll, vslll, 7); + // CHECK: call <16 x i8> @llvm.s390.vsld(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 7) + // CHECK-ASM: vsld + vulll = vec_sldb(vulll, vulll, 0); + // CHECK: call <16 x i8> @llvm.s390.vsld(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 0) + // CHECK-ASM: vsld + vulll = vec_sldb(vulll, vulll, 7); + // CHECK: call <16 x i8> @llvm.s390.vsld(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 7) + // CHECK-ASM: vsld vf = vec_sldb(vf, vf, 0); // CHECK: call <16 x i8> @llvm.s390.vsld(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 0) // CHECK-ASM: vsld @@ -362,6 +388,18 @@ void test_integer(void) { vul = vec_srdb(vul, vul, 7); // CHECK: call <16 x i8> @llvm.s390.vsrd(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 7) // CHECK-ASM: vsrd + vslll = vec_srdb(vslll, vslll, 0); + // CHECK: call <16 x i8> @llvm.s390.vsrd(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 0) + // CHECK-ASM: vsrd + vslll = vec_srdb(vslll, vslll, 7); + // CHECK: call <16 x i8> @llvm.s390.vsrd(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 7) + // CHECK-ASM: vsrd + vulll = vec_srdb(vulll, vulll, 0); + // CHECK: call <16 x i8> @llvm.s390.vsrd(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 0) + // CHECK-ASM: vsrd + vulll = vec_srdb(vulll, vulll, 7); + // CHECK: call <16 x i8> @llvm.s390.vsrd(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 7) + // CHECK-ASM: vsrd vf = vec_srdb(vf, vf, 0); // CHECK: call <16 x i8> @llvm.s390.vsrd(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 0) // CHECK-ASM: vsrd diff --git clang/test/CodeGen/SystemZ/builtins-systemz-zvector5-error.c clang/test/CodeGen/SystemZ/builtins-systemz-zvector5-error.c new file mode 100644 index 000000000000..9f4844efd631 --- /dev/null +++ clang/test/CodeGen/SystemZ/builtins-systemz-zvector5-error.c @@ -0,0 +1,124 @@ +// REQUIRES: systemz-registered-target +// RUN: %clang_cc1 -target-cpu arch15 -triple s390x-linux-gnu \ +// RUN: -fzvector -flax-vector-conversions=none \ +// RUN: -Wall -Wno-unused -Werror -fsyntax-only -verify %s + +#include <vecintrin.h> + +volatile vector signed char vsc; +volatile vector signed short vss; +volatile vector signed int vsi; +volatile vector signed long long vsl; +volatile vector signed __int128 vslll; +volatile vector unsigned char vuc; +volatile vector unsigned short vus; +volatile vector unsigned int vui; +volatile vector unsigned long long vul; +volatile vector unsigned __int128 vulll; +volatile vector bool char vbc; +volatile vector bool short vbs; +volatile vector bool int vbi; +volatile vector bool long long vbl; +volatile vector bool __int128 vblll; +volatile vector double vd; + +volatile signed char sc; +volatile signed short ss; +volatile signed int si; +volatile signed long long sl; +volatile unsigned char uc; +volatile unsigned short us; +volatile unsigned int ui; +volatile unsigned long long ul; 
+volatile double d; + +const void * volatile cptr; +const signed char * volatile cptrsc; +const signed short * volatile cptrss; +const signed int * volatile cptrsi; +const signed long long * volatile cptrsl; +const unsigned char * volatile cptruc; +const unsigned short * volatile cptrus; +const unsigned int * volatile cptrui; +const unsigned long long * volatile cptrul; +const float * volatile cptrf; +const double * volatile cptrd; + +void * volatile ptr; +signed char * volatile ptrsc; +signed short * volatile ptrss; +signed int * volatile ptrsi; +signed long long * volatile ptrsl; +unsigned char * volatile ptruc; +unsigned short * volatile ptrus; +unsigned int * volatile ptrui; +unsigned long long * volatile ptrul; +float * volatile ptrf; +double * volatile ptrd; + +volatile unsigned int len; +volatile int idx; +int cc; + +void test_integer(void) { + vsc = vec_evaluate(vsc, vsc, vsc, idx); // expected-error {{no matching function}} \ + // expected-error {{argument to '__builtin_s390_veval' must be a constant integer}} \ + // expected-note@vecintrin.h:* 14 {{candidate function not viable}} + // expected-note@vecintrin.h:* 1 {{must be a constant integer}} + vuc = vec_evaluate(vuc, vuc, vuc, idx); // expected-error {{no matching function}} \ + // expected-error {{argument to '__builtin_s390_veval' must be a constant integer}} \ + // expected-note@vecintrin.h:* 13 {{candidate function not viable}} + // expected-note@vecintrin.h:* 2 {{must be a constant integer}} + vbc = vec_evaluate(vbc, vbc, vbc, idx); // expected-error {{no matching function}} \ + // expected-error {{argument to '__builtin_s390_veval' must be a constant integer}} \ + // expected-note@vecintrin.h:* 13 {{candidate function not viable}} + // expected-note@vecintrin.h:* 2 {{must be a constant integer}} + vss = vec_evaluate(vss, vss, vss, idx); // expected-error {{no matching function}} \ + // expected-error {{argument to '__builtin_s390_veval' must be a constant integer}} \ + // expected-note@vecintrin.h:* 14 {{candidate function not viable}} + // expected-note@vecintrin.h:* 1 {{must be a constant integer}} + vus = vec_evaluate(vus, vus, vus, idx); // expected-error {{no matching function}} \ + // expected-error {{argument to '__builtin_s390_veval' must be a constant integer}} \ + // expected-note@vecintrin.h:* 13 {{candidate function not viable}} + // expected-note@vecintrin.h:* 2 {{must be a constant integer}} + vbs = vec_evaluate(vbs, vbs, vbs, idx); // expected-error {{no matching function}} \ + // expected-error {{argument to '__builtin_s390_veval' must be a constant integer}} \ + // expected-note@vecintrin.h:* 13 {{candidate function not viable}} + // expected-note@vecintrin.h:* 2 {{must be a constant integer}} + vsi = vec_evaluate(vsi, vsi, vsi, idx); // expected-error {{no matching function}} \ + // expected-error {{argument to '__builtin_s390_veval' must be a constant integer}} \ + // expected-note@vecintrin.h:* 14 {{candidate function not viable}} + // expected-note@vecintrin.h:* 1 {{must be a constant integer}} + vui = vec_evaluate(vui, vui, vui, idx); // expected-error {{no matching function}} \ + // expected-error {{argument to '__builtin_s390_veval' must be a constant integer}} \ + // expected-note@vecintrin.h:* 13 {{candidate function not viable}} + // expected-note@vecintrin.h:* 2 {{must be a constant integer}} + vbi = vec_evaluate(vbi, vbi, vbi, idx); // expected-error {{no matching function}} \ + // expected-error {{argument to '__builtin_s390_veval' must be a constant integer}} \ + // 
expected-note@vecintrin.h:* 13 {{candidate function not viable}} + // expected-note@vecintrin.h:* 2 {{must be a constant integer}} + vsl = vec_evaluate(vsl, vsl, vsl, idx); // expected-error {{no matching function}} \ + // expected-error {{argument to '__builtin_s390_veval' must be a constant integer}} \ + // expected-note@vecintrin.h:* 14 {{candidate function not viable}} + // expected-note@vecintrin.h:* 1 {{must be a constant integer}} + vul = vec_evaluate(vul, vul, vul, idx); // expected-error {{no matching function}} \ + // expected-error {{argument to '__builtin_s390_veval' must be a constant integer}} \ + // expected-note@vecintrin.h:* 13 {{candidate function not viable}} + // expected-note@vecintrin.h:* 2 {{must be a constant integer}} + vbl = vec_evaluate(vbl, vbl, vbl, idx); // expected-error {{no matching function}} \ + // expected-error {{argument to '__builtin_s390_veval' must be a constant integer}} \ + // expected-note@vecintrin.h:* 13 {{candidate function not viable}} + // expected-note@vecintrin.h:* 2 {{must be a constant integer}} + vslll = vec_evaluate(vslll, vslll, vslll, idx); // expected-error {{no matching function}} \ + // expected-error {{argument to '__builtin_s390_veval' must be a constant integer}} \ + // expected-note@vecintrin.h:* 14 {{candidate function not viable}} + // expected-note@vecintrin.h:* 1 {{must be a constant integer}} + vulll = vec_evaluate(vulll, vulll, vulll, idx); // expected-error {{no matching function}} \ + // expected-error {{argument to '__builtin_s390_veval' must be a constant integer}} \ + // expected-note@vecintrin.h:* 13 {{candidate function not viable}} + // expected-note@vecintrin.h:* 2 {{must be a constant integer}} + vblll = vec_evaluate(vblll, vblll, vblll, idx); // expected-error {{no matching function}} \ + // expected-error {{argument to '__builtin_s390_veval' must be a constant integer}} \ + // expected-note@vecintrin.h:* 13 {{candidate function not viable}} + // expected-note@vecintrin.h:* 2 {{must be a constant integer}} +} diff --git clang/test/CodeGen/SystemZ/builtins-systemz-zvector5.c clang/test/CodeGen/SystemZ/builtins-systemz-zvector5.c new file mode 100644 index 000000000000..7a29dbf552e0 --- /dev/null +++ clang/test/CodeGen/SystemZ/builtins-systemz-zvector5.c @@ -0,0 +1,429 @@ +// REQUIRES: systemz-registered-target +// RUN: %clang_cc1 -target-cpu arch15 -triple s390x-linux-gnu \ +// RUN: -O2 -fzvector -flax-vector-conversions=none \ +// RUN: -Wall -Wno-unused -Werror -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -target-cpu arch15 -triple s390x-linux-gnu \ +// RUN: -O2 -fzvector -flax-vector-conversions=none \ +// RUN: -Wall -Wno-unused -Werror -S %s -o - | FileCheck %s --check-prefix=CHECK-ASM + +#include <vecintrin.h> + +volatile vector signed char vsc; +volatile vector signed short vss; +volatile vector signed int vsi; +volatile vector signed long long vsl; +volatile vector signed __int128 vslll; +volatile vector unsigned char vuc; +volatile vector unsigned short vus; +volatile vector unsigned int vui; +volatile vector unsigned long long vul; +volatile vector unsigned __int128 vulll; +volatile vector bool char vbc; +volatile vector bool short vbs; +volatile vector bool int vbi; +volatile vector bool long long vbl; +volatile vector bool __int128 vblll; +volatile vector float vf; +volatile vector double vd; + +volatile int idx; +int cc; + +void test_core(void) { + // CHECK-ASM-LABEL: test_core + + vuc = vec_gen_element_masks_8(vus); + // CHECK: call <16 x i8> @llvm.s390.vgemb(<8 x i16> %{{.*}}) + // 
CHECK-ASM: vgemb + vus = vec_gen_element_masks_16(vuc); + // CHECK: call <8 x i16> @llvm.s390.vgemh(<16 x i8> %{{.*}}) + // CHECK-ASM: vgemh + vui = vec_gen_element_masks_32(vuc); + // CHECK: call <4 x i32> @llvm.s390.vgemf(<16 x i8> %{{.*}}) + // CHECK-ASM: vgemf + vul = vec_gen_element_masks_64(vuc); + // CHECK: call <2 x i64> @llvm.s390.vgemg(<16 x i8> %{{.*}}) + // CHECK-ASM: vgemg + vulll = vec_gen_element_masks_128(vuc); + // CHECK: call i128 @llvm.s390.vgemq(<16 x i8> %{{.*}}) + // CHECK-ASM: vgemq + + vsc = vec_blend(vsc, vsc, vsc); + // CHECK-ASM: vblendb + vbc = vec_blend(vbc, vbc, vsc); + // CHECK-ASM: vblendb + vuc = vec_blend(vuc, vuc, vsc); + // CHECK-ASM: vblendb + vss = vec_blend(vss, vss, vss); + // CHECK-ASM: vblendh + vbs = vec_blend(vbs, vbs, vss); + // CHECK-ASM: vblendh + vus = vec_blend(vus, vus, vss); + // CHECK-ASM: vblendh + vsi = vec_blend(vsi, vsi, vsi); + // CHECK-ASM: vblendf + vbi = vec_blend(vbi, vbi, vsi); + // CHECK-ASM: vblendf + vui = vec_blend(vui, vui, vsi); + // CHECK-ASM: vblendf + vsl = vec_blend(vsl, vsl, vsl); + // CHECK-ASM: vblendg + vul = vec_blend(vul, vul, vsl); + // CHECK-ASM: vblendg + vbl = vec_blend(vbl, vbl, vsl); + // CHECK-ASM: vblendg + vslll = vec_blend(vslll, vslll, vslll); + // CHECK-ASM: vblendq + vblll = vec_blend(vblll, vblll, vslll); + // CHECK-ASM: vblendq + vulll = vec_blend(vulll, vulll, vslll); + // CHECK-ASM: vblendq + vf = vec_blend(vf, vf, vsi); + // CHECK-ASM: vblendf + vd = vec_blend(vd, vd, vsl); + // CHECK-ASM: vblendg + + vslll = vec_unpackh(vsl); + // CHECK: call i128 @llvm.s390.vuphg(<2 x i64> %{{.*}}) + // CHECK-ASM: vuphg + vulll = vec_unpackh(vul); + // CHECK: call i128 @llvm.s390.vuplhg(<2 x i64> %{{.*}}) + // CHECK-ASM: vuplhg + vslll = vec_unpackl(vsl); + // CHECK: call i128 @llvm.s390.vuplg(<2 x i64> %{{.*}}) + // CHECK-ASM: vuplg + vulll = vec_unpackl(vul); + // CHECK: call i128 @llvm.s390.vupllg(<2 x i64> %{{.*}}) + // CHECK-ASM: vupllg +} + +void test_compare(void) { + // CHECK-ASM-LABEL: test_compare + + vblll = vec_cmpeq(vslll, vslll); + // CHECK: icmp eq <1 x i128> %{{.*}}, %{{.*}} + // CHECK-ASM: vceqq + vblll = vec_cmpeq(vulll, vulll); + // CHECK: icmp eq <1 x i128> %{{.*}}, %{{.*}} + // CHECK-ASM: vceqq + vblll = vec_cmpeq(vblll, vblll); + // CHECK: icmp eq <1 x i128> %{{.*}}, %{{.*}} + // CHECK-ASM: vceqq + + vblll = vec_cmpge(vslll, vslll); + // CHECK: icmp sge <1 x i128> %{{.*}}, %{{.*}} + // CHECK-ASM: vchq + vblll = vec_cmpge(vulll, vulll); + // CHECK: icmp uge <1 x i128> %{{.*}}, %{{.*}} + // CHECK-ASM: vchlq + + vblll = vec_cmpgt(vslll, vslll); + // CHECK: icmp sgt <1 x i128> %{{.*}}, %{{.*}} + // CHECK-ASM: vchq + vblll = vec_cmpgt(vulll, vulll); + // CHECK: icmp ugt <1 x i128> %{{.*}}, %{{.*}} + // CHECK-ASM: vchlq + + vblll = vec_cmple(vslll, vslll); + // CHECK: icmp sle <1 x i128> %{{.*}}, %{{.*}} + // CHECK-ASM: vchq + vblll = vec_cmple(vulll, vulll); + // CHECK: icmp ule <1 x i128> %{{.*}}, %{{.*}} + // CHECK-ASM: vchlq + + vblll = vec_cmplt(vslll, vslll); + // CHECK: icmp slt <1 x i128> %{{.*}}, %{{.*}} + // CHECK-ASM: vchq + vblll = vec_cmplt(vulll, vulll); + // CHECK: icmp ult <1 x i128> %{{.*}}, %{{.*}} + // CHECK-ASM: vchlq + + idx = vec_all_eq(vslll, vslll); + // CHECK: call { i128, i32 } @llvm.s390.vceqqs(i128 %{{.*}}, i128 %{{.*}}) + // CHECK-ASM: vceqqs + idx = vec_all_eq(vulll, vulll); + // CHECK: call { i128, i32 } @llvm.s390.vceqqs(i128 %{{.*}}, i128 %{{.*}}) + // CHECK-ASM: vceqqs + idx = vec_all_eq(vblll, vblll); + // CHECK: call { i128, i32 } @llvm.s390.vceqqs(i128 
%{{.*}}, i128 %{{.*}}) + // CHECK-ASM: vceqqs + + idx = vec_all_ne(vslll, vslll); + // CHECK: call { i128, i32 } @llvm.s390.vceqqs(i128 %{{.*}}, i128 %{{.*}}) + // CHECK-ASM: vceqqs + idx = vec_all_ne(vulll, vulll); + // CHECK: call { i128, i32 } @llvm.s390.vceqqs(i128 %{{.*}}, i128 %{{.*}}) + // CHECK-ASM: vceqqs + idx = vec_all_ne(vblll, vblll); + // CHECK: call { i128, i32 } @llvm.s390.vceqqs(i128 %{{.*}}, i128 %{{.*}}) + // CHECK-ASM: vceqqs + + idx = vec_all_ge(vslll, vslll); + // CHECK: call { i128, i32 } @llvm.s390.vchqs(i128 %{{.*}}, i128 %{{.*}}) + // CHECK-ASM: vchqs + idx = vec_all_ge(vulll, vulll); + // CHECK: call { i128, i32 } @llvm.s390.vchlqs(i128 %{{.*}}, i128 %{{.*}}) + // CHECK-ASM: vchlqs + + idx = vec_all_gt(vslll, vslll); + // CHECK: call { i128, i32 } @llvm.s390.vchqs(i128 %{{.*}}, i128 %{{.*}}) + // CHECK-ASM: vchqs + idx = vec_all_gt(vulll, vulll); + // CHECK: call { i128, i32 } @llvm.s390.vchlqs(i128 %{{.*}}, i128 %{{.*}}) + // CHECK-ASM: vchlqs + + idx = vec_all_le(vslll, vslll); + // CHECK: call { i128, i32 } @llvm.s390.vchqs(i128 %{{.*}}, i128 %{{.*}}) + // CHECK-ASM: vchqs + idx = vec_all_le(vulll, vulll); + // CHECK: call { i128, i32 } @llvm.s390.vchlqs(i128 %{{.*}}, i128 %{{.*}}) + // CHECK-ASM: vchlqs + + idx = vec_all_lt(vslll, vslll); + // CHECK: call { i128, i32 } @llvm.s390.vchqs(i128 %{{.*}}, i128 %{{.*}}) + // CHECK-ASM: vchqs + idx = vec_all_lt(vulll, vulll); + // CHECK: call { i128, i32 } @llvm.s390.vchlqs(i128 %{{.*}}, i128 %{{.*}}) + // CHECK-ASM: vchlqs + + idx = vec_any_eq(vslll, vslll); + // CHECK: call { i128, i32 } @llvm.s390.vceqqs(i128 %{{.*}}, i128 %{{.*}}) + // CHECK-ASM: vceqqs + idx = vec_any_eq(vulll, vulll); + // CHECK: call { i128, i32 } @llvm.s390.vceqqs(i128 %{{.*}}, i128 %{{.*}}) + // CHECK-ASM: vceqqs + idx = vec_any_eq(vblll, vblll); + // CHECK: call { i128, i32 } @llvm.s390.vceqqs(i128 %{{.*}}, i128 %{{.*}}) + // CHECK-ASM: vceqqs + + idx = vec_any_ne(vslll, vslll); + // CHECK: call { i128, i32 } @llvm.s390.vceqqs(i128 %{{.*}}, i128 %{{.*}}) + // CHECK-ASM: vceqqs + idx = vec_any_ne(vulll, vulll); + // CHECK: call { i128, i32 } @llvm.s390.vceqqs(i128 %{{.*}}, i128 %{{.*}}) + // CHECK-ASM: vceqqs + idx = vec_any_ne(vblll, vblll); + // CHECK: call { i128, i32 } @llvm.s390.vceqqs(i128 %{{.*}}, i128 %{{.*}}) + // CHECK-ASM: vceqqs + + idx = vec_any_ge(vslll, vslll); + // CHECK: call { i128, i32 } @llvm.s390.vchqs(i128 %{{.*}}, i128 %{{.*}}) + // CHECK-ASM: vchqs + idx = vec_any_ge(vulll, vulll); + // CHECK: call { i128, i32 } @llvm.s390.vchlqs(i128 %{{.*}}, i128 %{{.*}}) + // CHECK-ASM: vchlqs + + idx = vec_any_gt(vslll, vslll); + // CHECK: call { i128, i32 } @llvm.s390.vchqs(i128 %{{.*}}, i128 %{{.*}}) + // CHECK-ASM: vchqs + idx = vec_any_gt(vulll, vulll); + // CHECK: call { i128, i32 } @llvm.s390.vchlqs(i128 %{{.*}}, i128 %{{.*}}) + // CHECK-ASM: vchlqs + + idx = vec_any_le(vslll, vslll); + // CHECK: call { i128, i32 } @llvm.s390.vchqs(i128 %{{.*}}, i128 %{{.*}}) + // CHECK-ASM: vchqs + idx = vec_any_le(vulll, vulll); + // CHECK: call { i128, i32 } @llvm.s390.vchlqs(i128 %{{.*}}, i128 %{{.*}}) + // CHECK-ASM: vchlqs + + idx = vec_any_lt(vslll, vslll); + // CHECK: call { i128, i32 } @llvm.s390.vchqs(i128 %{{.*}}, i128 %{{.*}}) + // CHECK-ASM: vchqs + idx = vec_any_lt(vulll, vulll); + // CHECK: call { i128, i32 } @llvm.s390.vchlqs(i128 %{{.*}}, i128 %{{.*}}) + // CHECK-ASM: vchlqs +} + +void test_integer(void) { + // CHECK-ASM-LABEL: test_integer + + vulll = vec_cntlz(vulll); + // CHECK: call range(i128 0, 129) i128 
@llvm.ctlz.i128(i128 %{{.*}}, i1 false)
+  // CHECK-ASM: vclzq
+  vulll = vec_cnttz(vulll);
+  // CHECK: call range(i128 0, 129) i128 @llvm.cttz.i128(i128 %{{.*}}, i1 false)
+  // CHECK-ASM: vctzq
+
+  vslll = vec_abs(vslll);
+  // CHECK-ASM: vlcq
+
+  vslll = vec_avg(vslll, vslll);
+  // CHECK: call i128 @llvm.s390.vavgq(i128 %{{.*}}, i128 %{{.*}})
+  // CHECK-ASM: vavgq
+  vulll = vec_avg(vulll, vulll);
+  // CHECK: call i128 @llvm.s390.vavglq(i128 %{{.*}}, i128 %{{.*}})
+  // CHECK-ASM: vavglq
+
+  vsc = vec_evaluate(vsc, vsc, vsc, 0);
+  // CHECK: call <16 x i8> @llvm.s390.veval(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 0)
+  // CHECK-ASM: veval
+  vsc = vec_evaluate(vsc, vsc, vsc, 255);
+  // CHECK: call <16 x i8> @llvm.s390.veval(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 255)
+  // CHECK-ASM: veval
+  vuc = vec_evaluate(vuc, vuc, vuc, 0);
+  // CHECK: call <16 x i8> @llvm.s390.veval(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 0)
+  // CHECK-ASM: veval
+  vuc = vec_evaluate(vuc, vuc, vuc, 255);
+  // CHECK: call <16 x i8> @llvm.s390.veval(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 255)
+  // CHECK-ASM: veval
+  vbc = vec_evaluate(vbc, vbc, vbc, 0);
+  // CHECK: call <16 x i8> @llvm.s390.veval(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 0)
+  // CHECK-ASM: veval
+  vbc = vec_evaluate(vbc, vbc, vbc, 255);
+  // CHECK: call <16 x i8> @llvm.s390.veval(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 255)
+  // CHECK-ASM: veval
+  vss = vec_evaluate(vss, vss, vss, 0);
+  // CHECK: call <16 x i8> @llvm.s390.veval(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 0)
+  // CHECK-ASM: veval
+  vss = vec_evaluate(vss, vss, vss, 255);
+  // CHECK: call <16 x i8> @llvm.s390.veval(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 255)
+  // CHECK-ASM: veval
+  vus = vec_evaluate(vus, vus, vus, 0);
+  // CHECK: call <16 x i8> @llvm.s390.veval(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 0)
+  // CHECK-ASM: veval
+  vus = vec_evaluate(vus, vus, vus, 255);
+  // CHECK: call <16 x i8> @llvm.s390.veval(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 255)
+  // CHECK-ASM: veval
+  vbs = vec_evaluate(vbs, vbs, vbs, 0);
+  // CHECK: call <16 x i8> @llvm.s390.veval(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 0)
+  // CHECK-ASM: veval
+  vbs = vec_evaluate(vbs, vbs, vbs, 255);
+  // CHECK: call <16 x i8> @llvm.s390.veval(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 255)
+  // CHECK-ASM: veval
+  vsi = vec_evaluate(vsi, vsi, vsi, 0);
+  // CHECK: call <16 x i8> @llvm.s390.veval(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 0)
+  // CHECK-ASM: veval
+  vsi = vec_evaluate(vsi, vsi, vsi, 255);
+  // CHECK: call <16 x i8> @llvm.s390.veval(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 255)
+  // CHECK-ASM: veval
+  vui = vec_evaluate(vui, vui, vui, 0);
+  // CHECK: call <16 x i8> @llvm.s390.veval(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 0)
+  // CHECK-ASM: veval
+  vui = vec_evaluate(vui, vui, vui, 255);
+  // CHECK: call <16 x i8> @llvm.s390.veval(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 255)
+  // CHECK-ASM: veval
+  vbi = vec_evaluate(vbi, vbi, vbi, 0);
+  // CHECK: call <16 x i8> @llvm.s390.veval(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 0)
+  // CHECK-ASM: veval
+  vbi = vec_evaluate(vbi, vbi, vbi, 255);
+  // CHECK: call <16 x i8> @llvm.s390.veval(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 255)
+  // CHECK-ASM: veval
+  vsl = vec_evaluate(vsl, vsl, vsl, 0);
+  // CHECK: call <16 x i8> @llvm.s390.veval(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 0)
+  // CHECK-ASM: veval
+  vsl = vec_evaluate(vsl, vsl, vsl, 255);
+  // CHECK: call <16 x i8> @llvm.s390.veval(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 255)
+  // CHECK-ASM: veval
+  vul = vec_evaluate(vul, vul, vul, 0);
+  // CHECK: call <16 x i8> @llvm.s390.veval(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 0)
+  // CHECK-ASM: veval
+  vul = vec_evaluate(vul, vul, vul, 255);
+  // CHECK: call <16 x i8> @llvm.s390.veval(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 255)
+  // CHECK-ASM: veval
+  vbl = vec_evaluate(vbl, vbl, vbl, 0);
+  // CHECK: call <16 x i8> @llvm.s390.veval(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 0)
+  // CHECK-ASM: veval
+  vbl = vec_evaluate(vbl, vbl, vbl, 255);
+  // CHECK: call <16 x i8> @llvm.s390.veval(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 255)
+  // CHECK-ASM: veval
+  vslll = vec_evaluate(vslll, vslll, vslll, 0);
+  // CHECK: call <16 x i8> @llvm.s390.veval(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 0)
+  // CHECK-ASM: veval
+  vslll = vec_evaluate(vslll, vslll, vslll, 255);
+  // CHECK: call <16 x i8> @llvm.s390.veval(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 255)
+  // CHECK-ASM: veval
+  vulll = vec_evaluate(vulll, vulll, vulll, 0);
+  // CHECK: call <16 x i8> @llvm.s390.veval(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 0)
+  // CHECK-ASM: veval
+  vulll = vec_evaluate(vulll, vulll, vulll, 255);
+  // CHECK: call <16 x i8> @llvm.s390.veval(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 255)
+  // CHECK-ASM: veval
+  vblll = vec_evaluate(vblll, vblll, vblll, 0);
+  // CHECK: call <16 x i8> @llvm.s390.veval(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 0)
+  // CHECK-ASM: veval
+  vblll = vec_evaluate(vblll, vblll, vblll, 255);
+  // CHECK: call <16 x i8> @llvm.s390.veval(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 255)
+  // CHECK-ASM: veval
+
+  vslll = vec_max(vslll, vslll);
+  // CHECK-ASM: vmxq
+  vulll = vec_max(vulll, vulll);
+  // CHECK-ASM: vmxlq
+  vslll = vec_min(vslll, vslll);
+  // CHECK-ASM: vmnq
+  vulll = vec_min(vulll, vulll);
+  // CHECK-ASM: vmnlq
+
+  vsl = vec_mladd(vsl, vsl, vsl);
+  // CHECK-ASM: vmalg
+  vsl = vec_mladd(vul, vsl, vsl);
+  // CHECK-ASM: vmalg
+  vsl = vec_mladd(vsl, vul, vul);
+  // CHECK-ASM: vmalg
+  vul = vec_mladd(vul, vul, vul);
+  // CHECK-ASM: vmalg
+  vslll = vec_mladd(vslll, vslll, vslll);
+  // CHECK-ASM: vmalq
+  vslll = vec_mladd(vulll, vslll, vslll);
+  // CHECK-ASM: vmalq
+  vslll = vec_mladd(vslll, vulll, vulll);
+  // CHECK-ASM: vmalq
+  vulll = vec_mladd(vulll, vulll, vulll);
+  // CHECK-ASM: vmalq
+
+  vsl = vec_mhadd(vsl, vsl, vsl);
+  // CHECK: call <2 x i64> @llvm.s390.vmahg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}})
+  // CHECK-ASM: vmahg
+  vul = vec_mhadd(vul, vul, vul);
+  // CHECK: call <2 x i64> @llvm.s390.vmalhg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}})
+  // CHECK-ASM: vmalhg
+  vslll = vec_mhadd(vslll, vslll, vslll);
+  // CHECK: call i128 @llvm.s390.vmahq(i128 %{{.*}}, i128 %{{.*}}, i128 %{{.*}})
+  // CHECK-ASM: vmahq
+  vulll = vec_mhadd(vulll, vulll, vulll);
+  // CHECK: call i128 @llvm.s390.vmalhq(i128 %{{.*}}, i128 %{{.*}}, i128 %{{.*}})
+  // CHECK-ASM: vmalhq
+
+  vslll = vec_meadd(vsl, vsl, vslll);
+  // CHECK: call i128 @llvm.s390.vmaeg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, i128 %{{.*}})
+  // CHECK-ASM: vmaeg
+  vulll = vec_meadd(vul, vul, vulll);
+  // CHECK: call i128 @llvm.s390.vmaleg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, i128 %{{.*}})
+  // CHECK-ASM: vmaleg
+
+  vslll = vec_moadd(vsl, vsl, vslll);
+  // CHECK: call i128 @llvm.s390.vmaog(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, i128 %{{.*}})
+  // CHECK-ASM: vmaog
+  vulll = vec_moadd(vul, vul, vulll);
+  // CHECK: call i128 @llvm.s390.vmalog(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, i128 %{{.*}})
+  // CHECK-ASM: vmalog
+
+  vsl = vec_mulh(vsl, vsl);
+  // CHECK: call <2 x i64> @llvm.s390.vmhg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
+  // CHECK-ASM: vmhg
+  vul = vec_mulh(vul, vul);
+  // CHECK: call <2 x i64> @llvm.s390.vmlhg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
+  // CHECK-ASM: vmlhg
+  vslll = vec_mulh(vslll, vslll);
+  // CHECK: call i128 @llvm.s390.vmhq(i128 %{{.*}}, i128 %{{.*}})
+  // CHECK-ASM: vmhq
+  vulll = vec_mulh(vulll, vulll);
+  // CHECK: call i128 @llvm.s390.vmlhq(i128 %{{.*}}, i128 %{{.*}})
+  // CHECK-ASM: vmlhq
+
+  vslll = vec_mule(vsl, vsl);
+  // CHECK: call i128 @llvm.s390.vmeg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
+  // CHECK-ASM: vmeg
+  vulll = vec_mule(vul, vul);
+  // CHECK: call i128 @llvm.s390.vmleg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
+  // CHECK-ASM: vmleg
+
+  vslll = vec_mulo(vsl, vsl);
+  // CHECK: call i128 @llvm.s390.vmog(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
+  // CHECK-ASM: vmog
+  vulll = vec_mulo(vul, vul);
+  // CHECK: call i128 @llvm.s390.vmlog(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
+  // CHECK-ASM: vmlog
+}
+
diff --git clang/test/CodeGen/SystemZ/systemz-abi-vector.c clang/test/CodeGen/SystemZ/systemz-abi-vector.c
index 8361ccef2102..1e1926678ec3 100644
--- clang/test/CodeGen/SystemZ/systemz-abi-vector.c
+++ clang/test/CodeGen/SystemZ/systemz-abi-vector.c
@@ -18,6 +18,8 @@
 // RUN: -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-VECTOR %s
 // RUN: %clang_cc1 -no-enable-noundef-analysis -triple s390x-linux-gnu -target-cpu arch14 \
 // RUN: -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-VECTOR %s
+// RUN: %clang_cc1 -no-enable-noundef-analysis -triple s390x-linux-gnu -target-cpu arch15 \
+// RUN: -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-VECTOR %s
 
 // Vector types
 
diff --git clang/test/CodeGen/SystemZ/systemz-abi.c clang/test/CodeGen/SystemZ/systemz-abi.c
index fd2b5d450cc6..58081bdc6cc2 100644
--- clang/test/CodeGen/SystemZ/systemz-abi.c
+++ clang/test/CodeGen/SystemZ/systemz-abi.c
@@ -24,6 +24,11 @@
 // RUN: %clang_cc1 -no-enable-noundef-analysis -triple s390x-linux-gnu -target-cpu arch14 \
 // RUN: -emit-llvm -o - %s -mfloat-abi soft | FileCheck %s \
 // RUN: --check-prefixes=CHECK,SOFT-FLOAT
+// RUN: %clang_cc1 -no-enable-noundef-analysis -triple s390x-linux-gnu -target-cpu arch15 \
+// RUN: -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,HARD-FLOAT
+// RUN: %clang_cc1 -no-enable-noundef-analysis -triple s390x-linux-gnu -target-cpu arch15 \
+// RUN: -emit-llvm -o - %s -mfloat-abi soft | FileCheck %s \
+// RUN: --check-prefixes=CHECK,SOFT-FLOAT
 
 // Scalar types
 
diff --git clang/test/CodeGen/SystemZ/vec-abi-gnuattr-05.c clang/test/CodeGen/SystemZ/vec-abi-gnuattr-05.c
index c19fd17a9684..d0cc7615dd6b 100644
--- clang/test/CodeGen/SystemZ/vec-abi-gnuattr-05.c
+++ clang/test/CodeGen/SystemZ/vec-abi-gnuattr-05.c
@@ -11,7 +11,7 @@ typedef __attribute__((vector_size(16))) int v4i32;
 v4i32 (*bar)(int);
 
 static int foo() {
-  (*bar)(0)[0];
+  return (*bar)(0)[0];
 }
 
 int fun() { return foo(); }
diff --git clang/test/CodeGen/SystemZ/zvector.c
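To make concrete what the new arch15 builtins and the single-lane `vector __int128` types exercised above look like from the user side, here is a minimal sketch. It is not part of the patch: the file name and the driver flags are assumptions (the tests above invoke cc1 directly with `-target-cpu arch15`; `-march=arch15 -mzvector` is the assumed driver spelling), and the instruction mappings in the comments are taken from the CHECK-ASM lines in the tests.

  /* i128-vec-sketch.c (hypothetical). Assumed build:
     clang --target=s390x-linux-gnu -march=arch15 -mzvector -O2 -S i128-vec-sketch.c */
  #include <vecintrin.h>

  vector signed __int128 demo(vector signed __int128 a, vector signed __int128 b) {
    vector signed __int128 sum = a + b;           /* full 128-bit lane add */
    vector signed __int128 high = vec_mulh(a, b); /* high 128 bits of the product -> vmhq */
    return vec_avg(sum, high);                    /* signed rounded average -> vavgq */
  }

The operators (`+`, shifts, comparisons) lower to ordinary IR on `<1 x i128>`, as the zvector.c checks below show, while `vec_mulh`, `vec_avg`, and friends map to the `llvm.s390.*` intrinsics tested above.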
clang/test/CodeGen/SystemZ/zvector.c index cbf6a9a1a1bf..a0b654d9acc9 100644 --- clang/test/CodeGen/SystemZ/zvector.c +++ clang/test/CodeGen/SystemZ/zvector.c @@ -19,6 +19,10 @@ volatile vector signed long long sl, sl2; volatile vector unsigned long long ul, ul2; volatile vector bool long long bl, bl2; +volatile vector signed __int128 slll, slll2; +volatile vector unsigned __int128 ulll, ulll2; +volatile vector bool __int128 blll, blll2; + volatile vector double fd, fd2; volatile int cnt; @@ -42,8 +46,12 @@ volatile int cnt; // CHECK-NEXT: store volatile <2 x i64> [[TMP6]], ptr @sl, align 8 // CHECK-NEXT: [[TMP7:%.*]] = load volatile <2 x i64>, ptr @ul2, align 8 // CHECK-NEXT: store volatile <2 x i64> [[TMP7]], ptr @ul, align 8 -// CHECK-NEXT: [[TMP8:%.*]] = load volatile <2 x double>, ptr @fd2, align 8 -// CHECK-NEXT: store volatile <2 x double> [[TMP8]], ptr @fd, align 8 +// CHECK-NEXT: [[TMP8:%.*]] = load volatile <1 x i128>, ptr @slll2, align 8 +// CHECK-NEXT: store volatile <1 x i128> [[TMP8]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP9:%.*]] = load volatile <1 x i128>, ptr @ulll2, align 8 +// CHECK-NEXT: store volatile <1 x i128> [[TMP9]], ptr @ulll, align 8 +// CHECK-NEXT: [[TMP10:%.*]] = load volatile <2 x double>, ptr @fd2, align 8 +// CHECK-NEXT: store volatile <2 x double> [[TMP10]], ptr @fd, align 8 // CHECK-NEXT: ret void // void test_assign(void) { @@ -60,6 +68,9 @@ void test_assign(void) { sl = sl2; ul = ul2; + slll = slll2; + ulll = ulll2; + fd = fd2; } @@ -82,8 +93,12 @@ void test_assign(void) { // CHECK-NEXT: store volatile <2 x i64> [[TMP6]], ptr @sl, align 8 // CHECK-NEXT: [[TMP7:%.*]] = load volatile <2 x i64>, ptr @ul2, align 8 // CHECK-NEXT: store volatile <2 x i64> [[TMP7]], ptr @ul, align 8 -// CHECK-NEXT: [[TMP8:%.*]] = load volatile <2 x double>, ptr @fd2, align 8 -// CHECK-NEXT: store volatile <2 x double> [[TMP8]], ptr @fd, align 8 +// CHECK-NEXT: [[TMP8:%.*]] = load volatile <1 x i128>, ptr @slll2, align 8 +// CHECK-NEXT: store volatile <1 x i128> [[TMP8]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP9:%.*]] = load volatile <1 x i128>, ptr @ulll2, align 8 +// CHECK-NEXT: store volatile <1 x i128> [[TMP9]], ptr @ulll, align 8 +// CHECK-NEXT: [[TMP10:%.*]] = load volatile <2 x double>, ptr @fd2, align 8 +// CHECK-NEXT: store volatile <2 x double> [[TMP10]], ptr @fd, align 8 // CHECK-NEXT: ret void // void test_pos(void) { @@ -100,6 +115,9 @@ void test_pos(void) { sl = +sl2; ul = +ul2; + slll = +slll2; + ulll = +ulll2; + fd = +fd2; } @@ -118,8 +136,11 @@ void test_pos(void) { // CHECK-NEXT: [[TMP3:%.*]] = load volatile <2 x i64>, ptr @sl2, align 8 // CHECK-NEXT: [[SUB3:%.*]] = sub <2 x i64> zeroinitializer, [[TMP3]] // CHECK-NEXT: store volatile <2 x i64> [[SUB3]], ptr @sl, align 8 -// CHECK-NEXT: [[TMP4:%.*]] = load volatile <2 x double>, ptr @fd2, align 8 -// CHECK-NEXT: [[FNEG:%.*]] = fneg <2 x double> [[TMP4]] +// CHECK-NEXT: [[TMP4:%.*]] = load volatile <1 x i128>, ptr @slll2, align 8 +// CHECK-NEXT: [[SUB4:%.*]] = sub <1 x i128> zeroinitializer, [[TMP4]] +// CHECK-NEXT: store volatile <1 x i128> [[SUB4]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load volatile <2 x double>, ptr @fd2, align 8 +// CHECK-NEXT: [[FNEG:%.*]] = fneg <2 x double> [[TMP5]] // CHECK-NEXT: store volatile <2 x double> [[FNEG]], ptr @fd, align 8 // CHECK-NEXT: ret void // @@ -129,6 +150,7 @@ void test_neg(void) { ss = -ss2; si = -si2; sl = -sl2; + slll = -slll2; fd = -fd2; } @@ -159,9 +181,15 @@ void test_neg(void) { // CHECK-NEXT: [[TMP7:%.*]] = load volatile <2 x i64>, ptr @ul2, 
align 8 // CHECK-NEXT: [[INC7:%.*]] = add <2 x i64> [[TMP7]], splat (i64 1) // CHECK-NEXT: store volatile <2 x i64> [[INC7]], ptr @ul2, align 8 -// CHECK-NEXT: [[TMP8:%.*]] = load volatile <2 x double>, ptr @fd2, align 8 -// CHECK-NEXT: [[INC8:%.*]] = fadd <2 x double> [[TMP8]], splat (double 1.000000e+00) -// CHECK-NEXT: store volatile <2 x double> [[INC8]], ptr @fd2, align 8 +// CHECK-NEXT: [[TMP8:%.*]] = load volatile <1 x i128>, ptr @slll2, align 8 +// CHECK-NEXT: [[INC8:%.*]] = add <1 x i128> [[TMP8]], splat (i128 1) +// CHECK-NEXT: store volatile <1 x i128> [[INC8]], ptr @slll2, align 8 +// CHECK-NEXT: [[TMP9:%.*]] = load volatile <1 x i128>, ptr @ulll2, align 8 +// CHECK-NEXT: [[INC9:%.*]] = add <1 x i128> [[TMP9]], splat (i128 1) +// CHECK-NEXT: store volatile <1 x i128> [[INC9]], ptr @ulll2, align 8 +// CHECK-NEXT: [[TMP10:%.*]] = load volatile <2 x double>, ptr @fd2, align 8 +// CHECK-NEXT: [[INC10:%.*]] = fadd <2 x double> [[TMP10]], splat (double 1.000000e+00) +// CHECK-NEXT: store volatile <2 x double> [[INC10]], ptr @fd2, align 8 // CHECK-NEXT: ret void // void test_preinc(void) { @@ -178,6 +206,9 @@ void test_preinc(void) { ++sl2; ++ul2; + ++slll2; + ++ulll2; + ++fd2; } @@ -208,9 +239,15 @@ void test_preinc(void) { // CHECK-NEXT: [[TMP7:%.*]] = load volatile <2 x i64>, ptr @ul2, align 8 // CHECK-NEXT: [[INC7:%.*]] = add <2 x i64> [[TMP7]], splat (i64 1) // CHECK-NEXT: store volatile <2 x i64> [[INC7]], ptr @ul2, align 8 -// CHECK-NEXT: [[TMP8:%.*]] = load volatile <2 x double>, ptr @fd2, align 8 -// CHECK-NEXT: [[INC8:%.*]] = fadd <2 x double> [[TMP8]], splat (double 1.000000e+00) -// CHECK-NEXT: store volatile <2 x double> [[INC8]], ptr @fd2, align 8 +// CHECK-NEXT: [[TMP8:%.*]] = load volatile <1 x i128>, ptr @slll2, align 8 +// CHECK-NEXT: [[INC8:%.*]] = add <1 x i128> [[TMP8]], splat (i128 1) +// CHECK-NEXT: store volatile <1 x i128> [[INC8]], ptr @slll2, align 8 +// CHECK-NEXT: [[TMP9:%.*]] = load volatile <1 x i128>, ptr @ulll2, align 8 +// CHECK-NEXT: [[INC9:%.*]] = add <1 x i128> [[TMP9]], splat (i128 1) +// CHECK-NEXT: store volatile <1 x i128> [[INC9]], ptr @ulll2, align 8 +// CHECK-NEXT: [[TMP10:%.*]] = load volatile <2 x double>, ptr @fd2, align 8 +// CHECK-NEXT: [[INC10:%.*]] = fadd <2 x double> [[TMP10]], splat (double 1.000000e+00) +// CHECK-NEXT: store volatile <2 x double> [[INC10]], ptr @fd2, align 8 // CHECK-NEXT: ret void // void test_postinc(void) { @@ -227,6 +264,9 @@ void test_postinc(void) { sl2++; ul2++; + slll2++; + ulll2++; + fd2++; } @@ -257,9 +297,15 @@ void test_postinc(void) { // CHECK-NEXT: [[TMP7:%.*]] = load volatile <2 x i64>, ptr @ul2, align 8 // CHECK-NEXT: [[DEC7:%.*]] = add <2 x i64> [[TMP7]], splat (i64 -1) // CHECK-NEXT: store volatile <2 x i64> [[DEC7]], ptr @ul2, align 8 -// CHECK-NEXT: [[TMP8:%.*]] = load volatile <2 x double>, ptr @fd2, align 8 -// CHECK-NEXT: [[DEC8:%.*]] = fadd <2 x double> [[TMP8]], splat (double -1.000000e+00) -// CHECK-NEXT: store volatile <2 x double> [[DEC8]], ptr @fd2, align 8 +// CHECK-NEXT: [[TMP8:%.*]] = load volatile <1 x i128>, ptr @slll2, align 8 +// CHECK-NEXT: [[DEC8:%.*]] = add <1 x i128> [[TMP8]], splat (i128 18446744073709551615) +// CHECK-NEXT: store volatile <1 x i128> [[DEC8]], ptr @slll2, align 8 +// CHECK-NEXT: [[TMP9:%.*]] = load volatile <1 x i128>, ptr @ulll2, align 8 +// CHECK-NEXT: [[DEC9:%.*]] = add <1 x i128> [[TMP9]], splat (i128 18446744073709551615) +// CHECK-NEXT: store volatile <1 x i128> [[DEC9]], ptr @ulll2, align 8 +// CHECK-NEXT: [[TMP10:%.*]] = load volatile <2 x double>, 
ptr @fd2, align 8 +// CHECK-NEXT: [[DEC10:%.*]] = fadd <2 x double> [[TMP10]], splat (double -1.000000e+00) +// CHECK-NEXT: store volatile <2 x double> [[DEC10]], ptr @fd2, align 8 // CHECK-NEXT: ret void // void test_predec(void) { @@ -276,6 +322,9 @@ void test_predec(void) { --sl2; --ul2; + --slll2; + --ulll2; + --fd2; } @@ -306,9 +355,15 @@ void test_predec(void) { // CHECK-NEXT: [[TMP7:%.*]] = load volatile <2 x i64>, ptr @ul2, align 8 // CHECK-NEXT: [[DEC7:%.*]] = add <2 x i64> [[TMP7]], splat (i64 -1) // CHECK-NEXT: store volatile <2 x i64> [[DEC7]], ptr @ul2, align 8 -// CHECK-NEXT: [[TMP8:%.*]] = load volatile <2 x double>, ptr @fd2, align 8 -// CHECK-NEXT: [[DEC8:%.*]] = fadd <2 x double> [[TMP8]], splat (double -1.000000e+00) -// CHECK-NEXT: store volatile <2 x double> [[DEC8]], ptr @fd2, align 8 +// CHECK-NEXT: [[TMP8:%.*]] = load volatile <1 x i128>, ptr @slll2, align 8 +// CHECK-NEXT: [[DEC8:%.*]] = add <1 x i128> [[TMP8]], splat (i128 18446744073709551615) +// CHECK-NEXT: store volatile <1 x i128> [[DEC8]], ptr @slll2, align 8 +// CHECK-NEXT: [[TMP9:%.*]] = load volatile <1 x i128>, ptr @ulll2, align 8 +// CHECK-NEXT: [[DEC9:%.*]] = add <1 x i128> [[TMP9]], splat (i128 18446744073709551615) +// CHECK-NEXT: store volatile <1 x i128> [[DEC9]], ptr @ulll2, align 8 +// CHECK-NEXT: [[TMP10:%.*]] = load volatile <2 x double>, ptr @fd2, align 8 +// CHECK-NEXT: [[DEC10:%.*]] = fadd <2 x double> [[TMP10]], splat (double -1.000000e+00) +// CHECK-NEXT: store volatile <2 x double> [[DEC10]], ptr @fd2, align 8 // CHECK-NEXT: ret void // void test_postdec(void) { @@ -325,6 +380,9 @@ void test_postdec(void) { sl2--; ul2--; + slll2--; + ulll2--; + fd2--; } @@ -427,10 +485,34 @@ void test_postdec(void) { // CHECK-NEXT: [[TMP47:%.*]] = load volatile <2 x i64>, ptr @ul2, align 8 // CHECK-NEXT: [[ADD23:%.*]] = add <2 x i64> [[TMP46]], [[TMP47]] // CHECK-NEXT: store volatile <2 x i64> [[ADD23]], ptr @ul, align 8 -// CHECK-NEXT: [[TMP48:%.*]] = load volatile <2 x double>, ptr @fd, align 8 -// CHECK-NEXT: [[TMP49:%.*]] = load volatile <2 x double>, ptr @fd2, align 8 -// CHECK-NEXT: [[ADD24:%.*]] = fadd <2 x double> [[TMP48]], [[TMP49]] -// CHECK-NEXT: store volatile <2 x double> [[ADD24]], ptr @fd, align 8 +// CHECK-NEXT: [[TMP48:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[TMP49:%.*]] = load volatile <1 x i128>, ptr @slll2, align 8 +// CHECK-NEXT: [[ADD24:%.*]] = add <1 x i128> [[TMP48]], [[TMP49]] +// CHECK-NEXT: store volatile <1 x i128> [[ADD24]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP50:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[TMP51:%.*]] = load volatile <1 x i128>, ptr @blll2, align 8 +// CHECK-NEXT: [[ADD25:%.*]] = add <1 x i128> [[TMP50]], [[TMP51]] +// CHECK-NEXT: store volatile <1 x i128> [[ADD25]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP52:%.*]] = load volatile <1 x i128>, ptr @blll, align 8 +// CHECK-NEXT: [[TMP53:%.*]] = load volatile <1 x i128>, ptr @slll2, align 8 +// CHECK-NEXT: [[ADD26:%.*]] = add <1 x i128> [[TMP52]], [[TMP53]] +// CHECK-NEXT: store volatile <1 x i128> [[ADD26]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP54:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[TMP55:%.*]] = load volatile <1 x i128>, ptr @ulll2, align 8 +// CHECK-NEXT: [[ADD27:%.*]] = add <1 x i128> [[TMP54]], [[TMP55]] +// CHECK-NEXT: store volatile <1 x i128> [[ADD27]], ptr @ulll, align 8 +// CHECK-NEXT: [[TMP56:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[TMP57:%.*]] = load volatile <1 x i128>, 
ptr @blll2, align 8 +// CHECK-NEXT: [[ADD28:%.*]] = add <1 x i128> [[TMP56]], [[TMP57]] +// CHECK-NEXT: store volatile <1 x i128> [[ADD28]], ptr @ulll, align 8 +// CHECK-NEXT: [[TMP58:%.*]] = load volatile <1 x i128>, ptr @blll, align 8 +// CHECK-NEXT: [[TMP59:%.*]] = load volatile <1 x i128>, ptr @ulll2, align 8 +// CHECK-NEXT: [[ADD29:%.*]] = add <1 x i128> [[TMP58]], [[TMP59]] +// CHECK-NEXT: store volatile <1 x i128> [[ADD29]], ptr @ulll, align 8 +// CHECK-NEXT: [[TMP60:%.*]] = load volatile <2 x double>, ptr @fd, align 8 +// CHECK-NEXT: [[TMP61:%.*]] = load volatile <2 x double>, ptr @fd2, align 8 +// CHECK-NEXT: [[ADD30:%.*]] = fadd <2 x double> [[TMP60]], [[TMP61]] +// CHECK-NEXT: store volatile <2 x double> [[ADD30]], ptr @fd, align 8 // CHECK-NEXT: ret void // void test_add(void) { @@ -463,6 +545,13 @@ void test_add(void) { ul = ul + bl2; ul = bl + ul2; + slll = slll + slll2; + slll = slll + blll2; + slll = blll + slll2; + ulll = ulll + ulll2; + ulll = ulll + blll2; + ulll = blll + ulll2; + fd = fd + fd2; } @@ -533,10 +622,26 @@ void test_add(void) { // CHECK-NEXT: [[TMP31:%.*]] = load volatile <2 x i64>, ptr @ul, align 8 // CHECK-NEXT: [[ADD15:%.*]] = add <2 x i64> [[TMP31]], [[TMP30]] // CHECK-NEXT: store volatile <2 x i64> [[ADD15]], ptr @ul, align 8 -// CHECK-NEXT: [[TMP32:%.*]] = load volatile <2 x double>, ptr @fd2, align 8 -// CHECK-NEXT: [[TMP33:%.*]] = load volatile <2 x double>, ptr @fd, align 8 -// CHECK-NEXT: [[ADD16:%.*]] = fadd <2 x double> [[TMP33]], [[TMP32]] -// CHECK-NEXT: store volatile <2 x double> [[ADD16]], ptr @fd, align 8 +// CHECK-NEXT: [[TMP32:%.*]] = load volatile <1 x i128>, ptr @slll2, align 8 +// CHECK-NEXT: [[TMP33:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[ADD16:%.*]] = add <1 x i128> [[TMP33]], [[TMP32]] +// CHECK-NEXT: store volatile <1 x i128> [[ADD16]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP34:%.*]] = load volatile <1 x i128>, ptr @blll2, align 8 +// CHECK-NEXT: [[TMP35:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[ADD17:%.*]] = add <1 x i128> [[TMP35]], [[TMP34]] +// CHECK-NEXT: store volatile <1 x i128> [[ADD17]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP36:%.*]] = load volatile <1 x i128>, ptr @ulll2, align 8 +// CHECK-NEXT: [[TMP37:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[ADD18:%.*]] = add <1 x i128> [[TMP37]], [[TMP36]] +// CHECK-NEXT: store volatile <1 x i128> [[ADD18]], ptr @ulll, align 8 +// CHECK-NEXT: [[TMP38:%.*]] = load volatile <1 x i128>, ptr @blll2, align 8 +// CHECK-NEXT: [[TMP39:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[ADD19:%.*]] = add <1 x i128> [[TMP39]], [[TMP38]] +// CHECK-NEXT: store volatile <1 x i128> [[ADD19]], ptr @ulll, align 8 +// CHECK-NEXT: [[TMP40:%.*]] = load volatile <2 x double>, ptr @fd2, align 8 +// CHECK-NEXT: [[TMP41:%.*]] = load volatile <2 x double>, ptr @fd, align 8 +// CHECK-NEXT: [[ADD20:%.*]] = fadd <2 x double> [[TMP41]], [[TMP40]] +// CHECK-NEXT: store volatile <2 x double> [[ADD20]], ptr @fd, align 8 // CHECK-NEXT: ret void // void test_add_assign(void) { @@ -561,6 +666,11 @@ void test_add_assign(void) { ul += ul2; ul += bl2; + slll += slll2; + slll += blll2; + ulll += ulll2; + ulll += blll2; + fd += fd2; } @@ -663,10 +773,34 @@ void test_add_assign(void) { // CHECK-NEXT: [[TMP47:%.*]] = load volatile <2 x i64>, ptr @ul2, align 8 // CHECK-NEXT: [[SUB23:%.*]] = sub <2 x i64> [[TMP46]], [[TMP47]] // CHECK-NEXT: store volatile <2 x i64> [[SUB23]], ptr @ul, align 8 -// CHECK-NEXT: 
[[TMP48:%.*]] = load volatile <2 x double>, ptr @fd, align 8 -// CHECK-NEXT: [[TMP49:%.*]] = load volatile <2 x double>, ptr @fd2, align 8 -// CHECK-NEXT: [[SUB24:%.*]] = fsub <2 x double> [[TMP48]], [[TMP49]] -// CHECK-NEXT: store volatile <2 x double> [[SUB24]], ptr @fd, align 8 +// CHECK-NEXT: [[TMP48:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[TMP49:%.*]] = load volatile <1 x i128>, ptr @slll2, align 8 +// CHECK-NEXT: [[SUB24:%.*]] = sub <1 x i128> [[TMP48]], [[TMP49]] +// CHECK-NEXT: store volatile <1 x i128> [[SUB24]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP50:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[TMP51:%.*]] = load volatile <1 x i128>, ptr @blll2, align 8 +// CHECK-NEXT: [[SUB25:%.*]] = sub <1 x i128> [[TMP50]], [[TMP51]] +// CHECK-NEXT: store volatile <1 x i128> [[SUB25]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP52:%.*]] = load volatile <1 x i128>, ptr @blll, align 8 +// CHECK-NEXT: [[TMP53:%.*]] = load volatile <1 x i128>, ptr @slll2, align 8 +// CHECK-NEXT: [[SUB26:%.*]] = sub <1 x i128> [[TMP52]], [[TMP53]] +// CHECK-NEXT: store volatile <1 x i128> [[SUB26]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP54:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[TMP55:%.*]] = load volatile <1 x i128>, ptr @ulll2, align 8 +// CHECK-NEXT: [[SUB27:%.*]] = sub <1 x i128> [[TMP54]], [[TMP55]] +// CHECK-NEXT: store volatile <1 x i128> [[SUB27]], ptr @ulll, align 8 +// CHECK-NEXT: [[TMP56:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[TMP57:%.*]] = load volatile <1 x i128>, ptr @blll2, align 8 +// CHECK-NEXT: [[SUB28:%.*]] = sub <1 x i128> [[TMP56]], [[TMP57]] +// CHECK-NEXT: store volatile <1 x i128> [[SUB28]], ptr @ulll, align 8 +// CHECK-NEXT: [[TMP58:%.*]] = load volatile <1 x i128>, ptr @blll, align 8 +// CHECK-NEXT: [[TMP59:%.*]] = load volatile <1 x i128>, ptr @ulll2, align 8 +// CHECK-NEXT: [[SUB29:%.*]] = sub <1 x i128> [[TMP58]], [[TMP59]] +// CHECK-NEXT: store volatile <1 x i128> [[SUB29]], ptr @ulll, align 8 +// CHECK-NEXT: [[TMP60:%.*]] = load volatile <2 x double>, ptr @fd, align 8 +// CHECK-NEXT: [[TMP61:%.*]] = load volatile <2 x double>, ptr @fd2, align 8 +// CHECK-NEXT: [[SUB30:%.*]] = fsub <2 x double> [[TMP60]], [[TMP61]] +// CHECK-NEXT: store volatile <2 x double> [[SUB30]], ptr @fd, align 8 // CHECK-NEXT: ret void // void test_sub(void) { @@ -699,6 +833,13 @@ void test_sub(void) { ul = ul - bl2; ul = bl - ul2; + slll = slll - slll2; + slll = slll - blll2; + slll = blll - slll2; + ulll = ulll - ulll2; + ulll = ulll - blll2; + ulll = blll - ulll2; + fd = fd - fd2; } @@ -769,10 +910,26 @@ void test_sub(void) { // CHECK-NEXT: [[TMP31:%.*]] = load volatile <2 x i64>, ptr @ul, align 8 // CHECK-NEXT: [[SUB15:%.*]] = sub <2 x i64> [[TMP31]], [[TMP30]] // CHECK-NEXT: store volatile <2 x i64> [[SUB15]], ptr @ul, align 8 -// CHECK-NEXT: [[TMP32:%.*]] = load volatile <2 x double>, ptr @fd2, align 8 -// CHECK-NEXT: [[TMP33:%.*]] = load volatile <2 x double>, ptr @fd, align 8 -// CHECK-NEXT: [[SUB16:%.*]] = fsub <2 x double> [[TMP33]], [[TMP32]] -// CHECK-NEXT: store volatile <2 x double> [[SUB16]], ptr @fd, align 8 +// CHECK-NEXT: [[TMP32:%.*]] = load volatile <1 x i128>, ptr @slll2, align 8 +// CHECK-NEXT: [[TMP33:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[SUB16:%.*]] = sub <1 x i128> [[TMP33]], [[TMP32]] +// CHECK-NEXT: store volatile <1 x i128> [[SUB16]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP34:%.*]] = load volatile <1 x i128>, ptr @blll2, align 8 +// 
CHECK-NEXT: [[TMP35:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[SUB17:%.*]] = sub <1 x i128> [[TMP35]], [[TMP34]] +// CHECK-NEXT: store volatile <1 x i128> [[SUB17]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP36:%.*]] = load volatile <1 x i128>, ptr @ulll2, align 8 +// CHECK-NEXT: [[TMP37:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[SUB18:%.*]] = sub <1 x i128> [[TMP37]], [[TMP36]] +// CHECK-NEXT: store volatile <1 x i128> [[SUB18]], ptr @ulll, align 8 +// CHECK-NEXT: [[TMP38:%.*]] = load volatile <1 x i128>, ptr @blll2, align 8 +// CHECK-NEXT: [[TMP39:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[SUB19:%.*]] = sub <1 x i128> [[TMP39]], [[TMP38]] +// CHECK-NEXT: store volatile <1 x i128> [[SUB19]], ptr @ulll, align 8 +// CHECK-NEXT: [[TMP40:%.*]] = load volatile <2 x double>, ptr @fd2, align 8 +// CHECK-NEXT: [[TMP41:%.*]] = load volatile <2 x double>, ptr @fd, align 8 +// CHECK-NEXT: [[SUB20:%.*]] = fsub <2 x double> [[TMP41]], [[TMP40]] +// CHECK-NEXT: store volatile <2 x double> [[SUB20]], ptr @fd, align 8 // CHECK-NEXT: ret void // void test_sub_assign(void) { @@ -797,6 +954,11 @@ void test_sub_assign(void) { ul -= ul2; ul -= bl2; + slll -= slll2; + slll -= blll2; + ulll -= ulll2; + ulll -= blll2; + fd -= fd2; } @@ -835,10 +997,18 @@ void test_sub_assign(void) { // CHECK-NEXT: [[TMP15:%.*]] = load volatile <2 x i64>, ptr @ul2, align 8 // CHECK-NEXT: [[MUL7:%.*]] = mul <2 x i64> [[TMP14]], [[TMP15]] // CHECK-NEXT: store volatile <2 x i64> [[MUL7]], ptr @ul, align 8 -// CHECK-NEXT: [[TMP16:%.*]] = load volatile <2 x double>, ptr @fd, align 8 -// CHECK-NEXT: [[TMP17:%.*]] = load volatile <2 x double>, ptr @fd2, align 8 -// CHECK-NEXT: [[MUL8:%.*]] = fmul <2 x double> [[TMP16]], [[TMP17]] -// CHECK-NEXT: store volatile <2 x double> [[MUL8]], ptr @fd, align 8 +// CHECK-NEXT: [[TMP16:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[TMP17:%.*]] = load volatile <1 x i128>, ptr @slll2, align 8 +// CHECK-NEXT: [[MUL8:%.*]] = mul <1 x i128> [[TMP16]], [[TMP17]] +// CHECK-NEXT: store volatile <1 x i128> [[MUL8]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP18:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[TMP19:%.*]] = load volatile <1 x i128>, ptr @ulll2, align 8 +// CHECK-NEXT: [[MUL9:%.*]] = mul <1 x i128> [[TMP18]], [[TMP19]] +// CHECK-NEXT: store volatile <1 x i128> [[MUL9]], ptr @ulll, align 8 +// CHECK-NEXT: [[TMP20:%.*]] = load volatile <2 x double>, ptr @fd, align 8 +// CHECK-NEXT: [[TMP21:%.*]] = load volatile <2 x double>, ptr @fd2, align 8 +// CHECK-NEXT: [[MUL10:%.*]] = fmul <2 x double> [[TMP20]], [[TMP21]] +// CHECK-NEXT: store volatile <2 x double> [[MUL10]], ptr @fd, align 8 // CHECK-NEXT: ret void // void test_mul(void) { @@ -855,6 +1025,9 @@ void test_mul(void) { sl = sl * sl2; ul = ul * ul2; + slll = slll * slll2; + ulll = ulll * ulll2; + fd = fd * fd2; } @@ -893,10 +1066,18 @@ void test_mul(void) { // CHECK-NEXT: [[TMP15:%.*]] = load volatile <2 x i64>, ptr @ul, align 8 // CHECK-NEXT: [[MUL7:%.*]] = mul <2 x i64> [[TMP15]], [[TMP14]] // CHECK-NEXT: store volatile <2 x i64> [[MUL7]], ptr @ul, align 8 -// CHECK-NEXT: [[TMP16:%.*]] = load volatile <2 x double>, ptr @fd2, align 8 -// CHECK-NEXT: [[TMP17:%.*]] = load volatile <2 x double>, ptr @fd, align 8 -// CHECK-NEXT: [[MUL8:%.*]] = fmul <2 x double> [[TMP17]], [[TMP16]] -// CHECK-NEXT: store volatile <2 x double> [[MUL8]], ptr @fd, align 8 +// CHECK-NEXT: [[TMP16:%.*]] = load volatile <1 x i128>, ptr @slll2, 
align 8 +// CHECK-NEXT: [[TMP17:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[MUL8:%.*]] = mul <1 x i128> [[TMP17]], [[TMP16]] +// CHECK-NEXT: store volatile <1 x i128> [[MUL8]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP18:%.*]] = load volatile <1 x i128>, ptr @ulll2, align 8 +// CHECK-NEXT: [[TMP19:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[MUL9:%.*]] = mul <1 x i128> [[TMP19]], [[TMP18]] +// CHECK-NEXT: store volatile <1 x i128> [[MUL9]], ptr @ulll, align 8 +// CHECK-NEXT: [[TMP20:%.*]] = load volatile <2 x double>, ptr @fd2, align 8 +// CHECK-NEXT: [[TMP21:%.*]] = load volatile <2 x double>, ptr @fd, align 8 +// CHECK-NEXT: [[MUL10:%.*]] = fmul <2 x double> [[TMP21]], [[TMP20]] +// CHECK-NEXT: store volatile <2 x double> [[MUL10]], ptr @fd, align 8 // CHECK-NEXT: ret void // void test_mul_assign(void) { @@ -913,6 +1094,9 @@ void test_mul_assign(void) { sl *= sl2; ul *= ul2; + slll *= slll2; + ulll *= ulll2; + fd *= fd2; } @@ -951,10 +1135,18 @@ void test_mul_assign(void) { // CHECK-NEXT: [[TMP15:%.*]] = load volatile <2 x i64>, ptr @ul2, align 8 // CHECK-NEXT: [[DIV7:%.*]] = udiv <2 x i64> [[TMP14]], [[TMP15]] // CHECK-NEXT: store volatile <2 x i64> [[DIV7]], ptr @ul, align 8 -// CHECK-NEXT: [[TMP16:%.*]] = load volatile <2 x double>, ptr @fd, align 8 -// CHECK-NEXT: [[TMP17:%.*]] = load volatile <2 x double>, ptr @fd2, align 8 -// CHECK-NEXT: [[DIV8:%.*]] = fdiv <2 x double> [[TMP16]], [[TMP17]] -// CHECK-NEXT: store volatile <2 x double> [[DIV8]], ptr @fd, align 8 +// CHECK-NEXT: [[TMP16:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[TMP17:%.*]] = load volatile <1 x i128>, ptr @slll2, align 8 +// CHECK-NEXT: [[DIV8:%.*]] = sdiv <1 x i128> [[TMP16]], [[TMP17]] +// CHECK-NEXT: store volatile <1 x i128> [[DIV8]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP18:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[TMP19:%.*]] = load volatile <1 x i128>, ptr @ulll2, align 8 +// CHECK-NEXT: [[DIV9:%.*]] = udiv <1 x i128> [[TMP18]], [[TMP19]] +// CHECK-NEXT: store volatile <1 x i128> [[DIV9]], ptr @ulll, align 8 +// CHECK-NEXT: [[TMP20:%.*]] = load volatile <2 x double>, ptr @fd, align 8 +// CHECK-NEXT: [[TMP21:%.*]] = load volatile <2 x double>, ptr @fd2, align 8 +// CHECK-NEXT: [[DIV10:%.*]] = fdiv <2 x double> [[TMP20]], [[TMP21]] +// CHECK-NEXT: store volatile <2 x double> [[DIV10]], ptr @fd, align 8 // CHECK-NEXT: ret void // void test_div(void) { @@ -971,6 +1163,9 @@ void test_div(void) { sl = sl / sl2; ul = ul / ul2; + slll = slll / slll2; + ulll = ulll / ulll2; + fd = fd / fd2; } @@ -1009,10 +1204,18 @@ void test_div(void) { // CHECK-NEXT: [[TMP15:%.*]] = load volatile <2 x i64>, ptr @ul, align 8 // CHECK-NEXT: [[DIV7:%.*]] = udiv <2 x i64> [[TMP15]], [[TMP14]] // CHECK-NEXT: store volatile <2 x i64> [[DIV7]], ptr @ul, align 8 -// CHECK-NEXT: [[TMP16:%.*]] = load volatile <2 x double>, ptr @fd2, align 8 -// CHECK-NEXT: [[TMP17:%.*]] = load volatile <2 x double>, ptr @fd, align 8 -// CHECK-NEXT: [[DIV8:%.*]] = fdiv <2 x double> [[TMP17]], [[TMP16]] -// CHECK-NEXT: store volatile <2 x double> [[DIV8]], ptr @fd, align 8 +// CHECK-NEXT: [[TMP16:%.*]] = load volatile <1 x i128>, ptr @slll2, align 8 +// CHECK-NEXT: [[TMP17:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[DIV8:%.*]] = sdiv <1 x i128> [[TMP17]], [[TMP16]] +// CHECK-NEXT: store volatile <1 x i128> [[DIV8]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP18:%.*]] = load volatile <1 x i128>, ptr @ulll2, align 8 +// CHECK-NEXT: 
[[TMP19:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[DIV9:%.*]] = udiv <1 x i128> [[TMP19]], [[TMP18]] +// CHECK-NEXT: store volatile <1 x i128> [[DIV9]], ptr @ulll, align 8 +// CHECK-NEXT: [[TMP20:%.*]] = load volatile <2 x double>, ptr @fd2, align 8 +// CHECK-NEXT: [[TMP21:%.*]] = load volatile <2 x double>, ptr @fd, align 8 +// CHECK-NEXT: [[DIV10:%.*]] = fdiv <2 x double> [[TMP21]], [[TMP20]] +// CHECK-NEXT: store volatile <2 x double> [[DIV10]], ptr @fd, align 8 // CHECK-NEXT: ret void // void test_div_assign(void) { @@ -1029,6 +1232,9 @@ void test_div_assign(void) { sl /= sl2; ul /= ul2; + slll /= slll2; + ulll /= ulll2; + fd /= fd2; } @@ -1067,6 +1273,14 @@ void test_div_assign(void) { // CHECK-NEXT: [[TMP15:%.*]] = load volatile <2 x i64>, ptr @ul2, align 8 // CHECK-NEXT: [[REM7:%.*]] = urem <2 x i64> [[TMP14]], [[TMP15]] // CHECK-NEXT: store volatile <2 x i64> [[REM7]], ptr @ul, align 8 +// CHECK-NEXT: [[TMP16:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[TMP17:%.*]] = load volatile <1 x i128>, ptr @slll2, align 8 +// CHECK-NEXT: [[REM8:%.*]] = srem <1 x i128> [[TMP16]], [[TMP17]] +// CHECK-NEXT: store volatile <1 x i128> [[REM8]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP18:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[TMP19:%.*]] = load volatile <1 x i128>, ptr @ulll2, align 8 +// CHECK-NEXT: [[REM9:%.*]] = urem <1 x i128> [[TMP18]], [[TMP19]] +// CHECK-NEXT: store volatile <1 x i128> [[REM9]], ptr @ulll, align 8 // CHECK-NEXT: ret void // void test_rem(void) { @@ -1082,6 +1296,9 @@ void test_rem(void) { sl = sl % sl2; ul = ul % ul2; + + slll = slll % slll2; + ulll = ulll % ulll2; } // CHECK-LABEL: define dso_local void @test_rem_assign( @@ -1119,6 +1336,14 @@ void test_rem(void) { // CHECK-NEXT: [[TMP15:%.*]] = load volatile <2 x i64>, ptr @ul, align 8 // CHECK-NEXT: [[REM7:%.*]] = urem <2 x i64> [[TMP15]], [[TMP14]] // CHECK-NEXT: store volatile <2 x i64> [[REM7]], ptr @ul, align 8 +// CHECK-NEXT: [[TMP16:%.*]] = load volatile <1 x i128>, ptr @slll2, align 8 +// CHECK-NEXT: [[TMP17:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[REM8:%.*]] = srem <1 x i128> [[TMP17]], [[TMP16]] +// CHECK-NEXT: store volatile <1 x i128> [[REM8]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP18:%.*]] = load volatile <1 x i128>, ptr @ulll2, align 8 +// CHECK-NEXT: [[TMP19:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[REM9:%.*]] = urem <1 x i128> [[TMP19]], [[TMP18]] +// CHECK-NEXT: store volatile <1 x i128> [[REM9]], ptr @ulll, align 8 // CHECK-NEXT: ret void // void test_rem_assign(void) { @@ -1134,6 +1359,9 @@ void test_rem_assign(void) { sl %= sl2; ul %= ul2; + + slll %= slll2; + ulll %= ulll2; } // CHECK-LABEL: define dso_local void @test_not( @@ -1175,6 +1403,15 @@ void test_rem_assign(void) { // CHECK-NEXT: [[TMP11:%.*]] = load volatile <2 x i64>, ptr @bl2, align 8 // CHECK-NEXT: [[NOT11:%.*]] = xor <2 x i64> [[TMP11]], splat (i64 -1) // CHECK-NEXT: store volatile <2 x i64> [[NOT11]], ptr @bl, align 8 +// CHECK-NEXT: [[TMP12:%.*]] = load volatile <1 x i128>, ptr @slll2, align 8 +// CHECK-NEXT: [[NOT12:%.*]] = xor <1 x i128> [[TMP12]], splat (i128 -1) +// CHECK-NEXT: store volatile <1 x i128> [[NOT12]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP13:%.*]] = load volatile <1 x i128>, ptr @ulll2, align 8 +// CHECK-NEXT: [[NOT13:%.*]] = xor <1 x i128> [[TMP13]], splat (i128 -1) +// CHECK-NEXT: store volatile <1 x i128> [[NOT13]], ptr @ulll, align 8 +// CHECK-NEXT: [[TMP14:%.*]] = 
load volatile <1 x i128>, ptr @blll2, align 8 +// CHECK-NEXT: [[NOT14:%.*]] = xor <1 x i128> [[TMP14]], splat (i128 -1) +// CHECK-NEXT: store volatile <1 x i128> [[NOT14]], ptr @blll, align 8 // CHECK-NEXT: ret void // void test_not(void) { @@ -1194,6 +1431,10 @@ void test_not(void) { sl = ~sl2; ul = ~ul2; bl = ~bl2; + + slll = ~slll2; + ulll = ~ulll2; + blll = ~blll2; } // CHECK-LABEL: define dso_local void @test_and( @@ -1311,6 +1552,34 @@ void test_not(void) { // CHECK-NEXT: [[TMP55:%.*]] = load volatile <2 x i64>, ptr @bl2, align 8 // CHECK-NEXT: [[AND27:%.*]] = and <2 x i64> [[TMP54]], [[TMP55]] // CHECK-NEXT: store volatile <2 x i64> [[AND27]], ptr @bl, align 8 +// CHECK-NEXT: [[TMP56:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[TMP57:%.*]] = load volatile <1 x i128>, ptr @slll2, align 8 +// CHECK-NEXT: [[AND28:%.*]] = and <1 x i128> [[TMP56]], [[TMP57]] +// CHECK-NEXT: store volatile <1 x i128> [[AND28]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP58:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[TMP59:%.*]] = load volatile <1 x i128>, ptr @blll2, align 8 +// CHECK-NEXT: [[AND29:%.*]] = and <1 x i128> [[TMP58]], [[TMP59]] +// CHECK-NEXT: store volatile <1 x i128> [[AND29]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP60:%.*]] = load volatile <1 x i128>, ptr @blll, align 8 +// CHECK-NEXT: [[TMP61:%.*]] = load volatile <1 x i128>, ptr @slll2, align 8 +// CHECK-NEXT: [[AND30:%.*]] = and <1 x i128> [[TMP60]], [[TMP61]] +// CHECK-NEXT: store volatile <1 x i128> [[AND30]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP62:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[TMP63:%.*]] = load volatile <1 x i128>, ptr @ulll2, align 8 +// CHECK-NEXT: [[AND31:%.*]] = and <1 x i128> [[TMP62]], [[TMP63]] +// CHECK-NEXT: store volatile <1 x i128> [[AND31]], ptr @ulll, align 8 +// CHECK-NEXT: [[TMP64:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[TMP65:%.*]] = load volatile <1 x i128>, ptr @blll2, align 8 +// CHECK-NEXT: [[AND32:%.*]] = and <1 x i128> [[TMP64]], [[TMP65]] +// CHECK-NEXT: store volatile <1 x i128> [[AND32]], ptr @ulll, align 8 +// CHECK-NEXT: [[TMP66:%.*]] = load volatile <1 x i128>, ptr @blll, align 8 +// CHECK-NEXT: [[TMP67:%.*]] = load volatile <1 x i128>, ptr @ulll2, align 8 +// CHECK-NEXT: [[AND33:%.*]] = and <1 x i128> [[TMP66]], [[TMP67]] +// CHECK-NEXT: store volatile <1 x i128> [[AND33]], ptr @ulll, align 8 +// CHECK-NEXT: [[TMP68:%.*]] = load volatile <1 x i128>, ptr @blll, align 8 +// CHECK-NEXT: [[TMP69:%.*]] = load volatile <1 x i128>, ptr @blll2, align 8 +// CHECK-NEXT: [[AND34:%.*]] = and <1 x i128> [[TMP68]], [[TMP69]] +// CHECK-NEXT: store volatile <1 x i128> [[AND34]], ptr @blll, align 8 // CHECK-NEXT: ret void // void test_and(void) { @@ -1346,6 +1615,14 @@ void test_and(void) { ul = ul & bl2; ul = bl & ul2; bl = bl & bl2; + + slll = slll & slll2; + slll = slll & blll2; + slll = blll & slll2; + ulll = ulll & ulll2; + ulll = ulll & blll2; + ulll = blll & ulll2; + blll = blll & blll2; } // CHECK-LABEL: define dso_local void @test_and_assign( @@ -1431,6 +1708,26 @@ void test_and(void) { // CHECK-NEXT: [[TMP39:%.*]] = load volatile <2 x i64>, ptr @bl, align 8 // CHECK-NEXT: [[AND19:%.*]] = and <2 x i64> [[TMP39]], [[TMP38]] // CHECK-NEXT: store volatile <2 x i64> [[AND19]], ptr @bl, align 8 +// CHECK-NEXT: [[TMP40:%.*]] = load volatile <1 x i128>, ptr @slll2, align 8 +// CHECK-NEXT: [[TMP41:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[AND20:%.*]] = and <1 x i128> 
[[TMP41]], [[TMP40]] +// CHECK-NEXT: store volatile <1 x i128> [[AND20]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP42:%.*]] = load volatile <1 x i128>, ptr @blll2, align 8 +// CHECK-NEXT: [[TMP43:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[AND21:%.*]] = and <1 x i128> [[TMP43]], [[TMP42]] +// CHECK-NEXT: store volatile <1 x i128> [[AND21]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP44:%.*]] = load volatile <1 x i128>, ptr @ulll2, align 8 +// CHECK-NEXT: [[TMP45:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[AND22:%.*]] = and <1 x i128> [[TMP45]], [[TMP44]] +// CHECK-NEXT: store volatile <1 x i128> [[AND22]], ptr @ulll, align 8 +// CHECK-NEXT: [[TMP46:%.*]] = load volatile <1 x i128>, ptr @blll2, align 8 +// CHECK-NEXT: [[TMP47:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[AND23:%.*]] = and <1 x i128> [[TMP47]], [[TMP46]] +// CHECK-NEXT: store volatile <1 x i128> [[AND23]], ptr @ulll, align 8 +// CHECK-NEXT: [[TMP48:%.*]] = load volatile <1 x i128>, ptr @blll2, align 8 +// CHECK-NEXT: [[TMP49:%.*]] = load volatile <1 x i128>, ptr @blll, align 8 +// CHECK-NEXT: [[AND24:%.*]] = and <1 x i128> [[TMP49]], [[TMP48]] +// CHECK-NEXT: store volatile <1 x i128> [[AND24]], ptr @blll, align 8 // CHECK-NEXT: ret void // void test_and_assign(void) { @@ -1458,6 +1755,12 @@ void test_and_assign(void) { ul &= ul2; ul &= bl2; bl &= bl2; + + slll &= slll2; + slll &= blll2; + ulll &= ulll2; + ulll &= blll2; + blll &= blll2; } // CHECK-LABEL: define dso_local void @test_or( @@ -1575,6 +1878,34 @@ void test_and_assign(void) { // CHECK-NEXT: [[TMP55:%.*]] = load volatile <2 x i64>, ptr @bl2, align 8 // CHECK-NEXT: [[OR27:%.*]] = or <2 x i64> [[TMP54]], [[TMP55]] // CHECK-NEXT: store volatile <2 x i64> [[OR27]], ptr @bl, align 8 +// CHECK-NEXT: [[TMP56:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[TMP57:%.*]] = load volatile <1 x i128>, ptr @slll2, align 8 +// CHECK-NEXT: [[OR28:%.*]] = or <1 x i128> [[TMP56]], [[TMP57]] +// CHECK-NEXT: store volatile <1 x i128> [[OR28]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP58:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[TMP59:%.*]] = load volatile <1 x i128>, ptr @blll2, align 8 +// CHECK-NEXT: [[OR29:%.*]] = or <1 x i128> [[TMP58]], [[TMP59]] +// CHECK-NEXT: store volatile <1 x i128> [[OR29]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP60:%.*]] = load volatile <1 x i128>, ptr @blll, align 8 +// CHECK-NEXT: [[TMP61:%.*]] = load volatile <1 x i128>, ptr @slll2, align 8 +// CHECK-NEXT: [[OR30:%.*]] = or <1 x i128> [[TMP60]], [[TMP61]] +// CHECK-NEXT: store volatile <1 x i128> [[OR30]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP62:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[TMP63:%.*]] = load volatile <1 x i128>, ptr @ulll2, align 8 +// CHECK-NEXT: [[OR31:%.*]] = or <1 x i128> [[TMP62]], [[TMP63]] +// CHECK-NEXT: store volatile <1 x i128> [[OR31]], ptr @ulll, align 8 +// CHECK-NEXT: [[TMP64:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[TMP65:%.*]] = load volatile <1 x i128>, ptr @blll2, align 8 +// CHECK-NEXT: [[OR32:%.*]] = or <1 x i128> [[TMP64]], [[TMP65]] +// CHECK-NEXT: store volatile <1 x i128> [[OR32]], ptr @ulll, align 8 +// CHECK-NEXT: [[TMP66:%.*]] = load volatile <1 x i128>, ptr @blll, align 8 +// CHECK-NEXT: [[TMP67:%.*]] = load volatile <1 x i128>, ptr @ulll2, align 8 +// CHECK-NEXT: [[OR33:%.*]] = or <1 x i128> [[TMP66]], [[TMP67]] +// CHECK-NEXT: store volatile <1 x i128> [[OR33]], ptr @ulll, 
align 8 +// CHECK-NEXT: [[TMP68:%.*]] = load volatile <1 x i128>, ptr @blll, align 8 +// CHECK-NEXT: [[TMP69:%.*]] = load volatile <1 x i128>, ptr @blll2, align 8 +// CHECK-NEXT: [[OR34:%.*]] = or <1 x i128> [[TMP68]], [[TMP69]] +// CHECK-NEXT: store volatile <1 x i128> [[OR34]], ptr @blll, align 8 // CHECK-NEXT: ret void // void test_or(void) { @@ -1610,6 +1941,14 @@ void test_or(void) { ul = ul | bl2; ul = bl | ul2; bl = bl | bl2; + + slll = slll | slll2; + slll = slll | blll2; + slll = blll | slll2; + ulll = ulll | ulll2; + ulll = ulll | blll2; + ulll = blll | ulll2; + blll = blll | blll2; } // CHECK-LABEL: define dso_local void @test_or_assign( @@ -1695,6 +2034,26 @@ void test_or(void) { // CHECK-NEXT: [[TMP39:%.*]] = load volatile <2 x i64>, ptr @bl, align 8 // CHECK-NEXT: [[OR19:%.*]] = or <2 x i64> [[TMP39]], [[TMP38]] // CHECK-NEXT: store volatile <2 x i64> [[OR19]], ptr @bl, align 8 +// CHECK-NEXT: [[TMP40:%.*]] = load volatile <1 x i128>, ptr @slll2, align 8 +// CHECK-NEXT: [[TMP41:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[OR20:%.*]] = or <1 x i128> [[TMP41]], [[TMP40]] +// CHECK-NEXT: store volatile <1 x i128> [[OR20]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP42:%.*]] = load volatile <1 x i128>, ptr @blll2, align 8 +// CHECK-NEXT: [[TMP43:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[OR21:%.*]] = or <1 x i128> [[TMP43]], [[TMP42]] +// CHECK-NEXT: store volatile <1 x i128> [[OR21]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP44:%.*]] = load volatile <1 x i128>, ptr @ulll2, align 8 +// CHECK-NEXT: [[TMP45:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[OR22:%.*]] = or <1 x i128> [[TMP45]], [[TMP44]] +// CHECK-NEXT: store volatile <1 x i128> [[OR22]], ptr @ulll, align 8 +// CHECK-NEXT: [[TMP46:%.*]] = load volatile <1 x i128>, ptr @blll2, align 8 +// CHECK-NEXT: [[TMP47:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[OR23:%.*]] = or <1 x i128> [[TMP47]], [[TMP46]] +// CHECK-NEXT: store volatile <1 x i128> [[OR23]], ptr @ulll, align 8 +// CHECK-NEXT: [[TMP48:%.*]] = load volatile <1 x i128>, ptr @blll2, align 8 +// CHECK-NEXT: [[TMP49:%.*]] = load volatile <1 x i128>, ptr @blll, align 8 +// CHECK-NEXT: [[OR24:%.*]] = or <1 x i128> [[TMP49]], [[TMP48]] +// CHECK-NEXT: store volatile <1 x i128> [[OR24]], ptr @blll, align 8 // CHECK-NEXT: ret void // void test_or_assign(void) { @@ -1722,6 +2081,12 @@ void test_or_assign(void) { ul |= ul2; ul |= bl2; bl |= bl2; + + slll |= slll2; + slll |= blll2; + ulll |= ulll2; + ulll |= blll2; + blll |= blll2; } // CHECK-LABEL: define dso_local void @test_xor( @@ -1839,6 +2204,34 @@ void test_or_assign(void) { // CHECK-NEXT: [[TMP55:%.*]] = load volatile <2 x i64>, ptr @bl2, align 8 // CHECK-NEXT: [[XOR27:%.*]] = xor <2 x i64> [[TMP54]], [[TMP55]] // CHECK-NEXT: store volatile <2 x i64> [[XOR27]], ptr @bl, align 8 +// CHECK-NEXT: [[TMP56:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[TMP57:%.*]] = load volatile <1 x i128>, ptr @slll2, align 8 +// CHECK-NEXT: [[XOR28:%.*]] = xor <1 x i128> [[TMP56]], [[TMP57]] +// CHECK-NEXT: store volatile <1 x i128> [[XOR28]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP58:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[TMP59:%.*]] = load volatile <1 x i128>, ptr @blll2, align 8 +// CHECK-NEXT: [[XOR29:%.*]] = xor <1 x i128> [[TMP58]], [[TMP59]] +// CHECK-NEXT: store volatile <1 x i128> [[XOR29]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP60:%.*]] = load volatile <1 x i128>, ptr 
@blll, align 8 +// CHECK-NEXT: [[TMP61:%.*]] = load volatile <1 x i128>, ptr @slll2, align 8 +// CHECK-NEXT: [[XOR30:%.*]] = xor <1 x i128> [[TMP60]], [[TMP61]] +// CHECK-NEXT: store volatile <1 x i128> [[XOR30]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP62:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[TMP63:%.*]] = load volatile <1 x i128>, ptr @ulll2, align 8 +// CHECK-NEXT: [[XOR31:%.*]] = xor <1 x i128> [[TMP62]], [[TMP63]] +// CHECK-NEXT: store volatile <1 x i128> [[XOR31]], ptr @ulll, align 8 +// CHECK-NEXT: [[TMP64:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[TMP65:%.*]] = load volatile <1 x i128>, ptr @blll2, align 8 +// CHECK-NEXT: [[XOR32:%.*]] = xor <1 x i128> [[TMP64]], [[TMP65]] +// CHECK-NEXT: store volatile <1 x i128> [[XOR32]], ptr @ulll, align 8 +// CHECK-NEXT: [[TMP66:%.*]] = load volatile <1 x i128>, ptr @blll, align 8 +// CHECK-NEXT: [[TMP67:%.*]] = load volatile <1 x i128>, ptr @ulll2, align 8 +// CHECK-NEXT: [[XOR33:%.*]] = xor <1 x i128> [[TMP66]], [[TMP67]] +// CHECK-NEXT: store volatile <1 x i128> [[XOR33]], ptr @ulll, align 8 +// CHECK-NEXT: [[TMP68:%.*]] = load volatile <1 x i128>, ptr @blll, align 8 +// CHECK-NEXT: [[TMP69:%.*]] = load volatile <1 x i128>, ptr @blll2, align 8 +// CHECK-NEXT: [[XOR34:%.*]] = xor <1 x i128> [[TMP68]], [[TMP69]] +// CHECK-NEXT: store volatile <1 x i128> [[XOR34]], ptr @blll, align 8 // CHECK-NEXT: ret void // void test_xor(void) { @@ -1874,6 +2267,14 @@ void test_xor(void) { ul = ul ^ bl2; ul = bl ^ ul2; bl = bl ^ bl2; + + slll = slll ^ slll2; + slll = slll ^ blll2; + slll = blll ^ slll2; + ulll = ulll ^ ulll2; + ulll = ulll ^ blll2; + ulll = blll ^ ulll2; + blll = blll ^ blll2; } // CHECK-LABEL: define dso_local void @test_xor_assign( @@ -1959,6 +2360,26 @@ void test_xor(void) { // CHECK-NEXT: [[TMP39:%.*]] = load volatile <2 x i64>, ptr @bl, align 8 // CHECK-NEXT: [[XOR19:%.*]] = xor <2 x i64> [[TMP39]], [[TMP38]] // CHECK-NEXT: store volatile <2 x i64> [[XOR19]], ptr @bl, align 8 +// CHECK-NEXT: [[TMP40:%.*]] = load volatile <1 x i128>, ptr @slll2, align 8 +// CHECK-NEXT: [[TMP41:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[XOR20:%.*]] = xor <1 x i128> [[TMP41]], [[TMP40]] +// CHECK-NEXT: store volatile <1 x i128> [[XOR20]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP42:%.*]] = load volatile <1 x i128>, ptr @blll2, align 8 +// CHECK-NEXT: [[TMP43:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[XOR21:%.*]] = xor <1 x i128> [[TMP43]], [[TMP42]] +// CHECK-NEXT: store volatile <1 x i128> [[XOR21]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP44:%.*]] = load volatile <1 x i128>, ptr @ulll2, align 8 +// CHECK-NEXT: [[TMP45:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[XOR22:%.*]] = xor <1 x i128> [[TMP45]], [[TMP44]] +// CHECK-NEXT: store volatile <1 x i128> [[XOR22]], ptr @ulll, align 8 +// CHECK-NEXT: [[TMP46:%.*]] = load volatile <1 x i128>, ptr @blll2, align 8 +// CHECK-NEXT: [[TMP47:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[XOR23:%.*]] = xor <1 x i128> [[TMP47]], [[TMP46]] +// CHECK-NEXT: store volatile <1 x i128> [[XOR23]], ptr @ulll, align 8 +// CHECK-NEXT: [[TMP48:%.*]] = load volatile <1 x i128>, ptr @blll2, align 8 +// CHECK-NEXT: [[TMP49:%.*]] = load volatile <1 x i128>, ptr @blll, align 8 +// CHECK-NEXT: [[XOR24:%.*]] = xor <1 x i128> [[TMP49]], [[TMP48]] +// CHECK-NEXT: store volatile <1 x i128> [[XOR24]], ptr @blll, align 8 // CHECK-NEXT: ret void // void test_xor_assign(void) 
{ @@ -1986,6 +2407,12 @@ void test_xor_assign(void) { ul ^= ul2; ul ^= bl2; bl ^= bl2; + + slll ^= slll2; + slll ^= blll2; + ulll ^= ulll2; + ulll ^= blll2; + blll ^= blll2; } // CHECK-LABEL: define dso_local void @test_sl( @@ -2133,6 +2560,42 @@ void test_xor_assign(void) { // CHECK-NEXT: [[TMP55:%.*]] = load volatile <2 x i64>, ptr @ul, align 8 // CHECK-NEXT: [[SHL50:%.*]] = shl <2 x i64> [[TMP55]], splat (i64 5) // CHECK-NEXT: store volatile <2 x i64> [[SHL50]], ptr @ul, align 8 +// CHECK-NEXT: [[TMP56:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[TMP57:%.*]] = load volatile <1 x i128>, ptr @slll2, align 8 +// CHECK-NEXT: [[SHL51:%.*]] = shl <1 x i128> [[TMP56]], [[TMP57]] +// CHECK-NEXT: store volatile <1 x i128> [[SHL51]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP58:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[TMP59:%.*]] = load volatile <1 x i128>, ptr @ulll2, align 8 +// CHECK-NEXT: [[SHL52:%.*]] = shl <1 x i128> [[TMP58]], [[TMP59]] +// CHECK-NEXT: store volatile <1 x i128> [[SHL52]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP60:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[TMP61:%.*]] = load volatile i32, ptr @cnt, align 4 +// CHECK-NEXT: [[SPLAT_SPLATINSERT53:%.*]] = insertelement <1 x i32> poison, i32 [[TMP61]], i64 0 +// CHECK-NEXT: [[SPLAT_SPLAT54:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT53]], <1 x i32> poison, <1 x i32> zeroinitializer +// CHECK-NEXT: [[SH_PROM55:%.*]] = zext <1 x i32> [[SPLAT_SPLAT54]] to <1 x i128> +// CHECK-NEXT: [[SHL56:%.*]] = shl <1 x i128> [[TMP60]], [[SH_PROM55]] +// CHECK-NEXT: store volatile <1 x i128> [[SHL56]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP62:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[SHL57:%.*]] = shl <1 x i128> [[TMP62]], splat (i128 5) +// CHECK-NEXT: store volatile <1 x i128> [[SHL57]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP63:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[TMP64:%.*]] = load volatile <1 x i128>, ptr @slll2, align 8 +// CHECK-NEXT: [[SHL58:%.*]] = shl <1 x i128> [[TMP63]], [[TMP64]] +// CHECK-NEXT: store volatile <1 x i128> [[SHL58]], ptr @ulll, align 8 +// CHECK-NEXT: [[TMP65:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[TMP66:%.*]] = load volatile <1 x i128>, ptr @ulll2, align 8 +// CHECK-NEXT: [[SHL59:%.*]] = shl <1 x i128> [[TMP65]], [[TMP66]] +// CHECK-NEXT: store volatile <1 x i128> [[SHL59]], ptr @ulll, align 8 +// CHECK-NEXT: [[TMP67:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[TMP68:%.*]] = load volatile i32, ptr @cnt, align 4 +// CHECK-NEXT: [[SPLAT_SPLATINSERT60:%.*]] = insertelement <1 x i32> poison, i32 [[TMP68]], i64 0 +// CHECK-NEXT: [[SPLAT_SPLAT61:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT60]], <1 x i32> poison, <1 x i32> zeroinitializer +// CHECK-NEXT: [[SH_PROM62:%.*]] = zext <1 x i32> [[SPLAT_SPLAT61]] to <1 x i128> +// CHECK-NEXT: [[SHL63:%.*]] = shl <1 x i128> [[TMP67]], [[SH_PROM62]] +// CHECK-NEXT: store volatile <1 x i128> [[SHL63]], ptr @ulll, align 8 +// CHECK-NEXT: [[TMP69:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[SHL64:%.*]] = shl <1 x i128> [[TMP69]], splat (i128 5) +// CHECK-NEXT: store volatile <1 x i128> [[SHL64]], ptr @ulll, align 8 // CHECK-NEXT: ret void // void test_sl(void) { @@ -2172,6 +2635,15 @@ void test_sl(void) { ul = ul << ul2; ul = ul << cnt; ul = ul << 5; + + slll = slll << slll2; + slll = slll << ulll2; + slll = slll << cnt; + slll = slll << 5; + 
ulll = ulll << slll2; + ulll = ulll << ulll2; + ulll = ulll << cnt; + ulll = ulll << 5; } // CHECK-LABEL: define dso_local void @test_sl_assign( @@ -2319,6 +2791,42 @@ void test_sl(void) { // CHECK-NEXT: [[TMP55:%.*]] = load volatile <2 x i64>, ptr @ul, align 8 // CHECK-NEXT: [[SHL50:%.*]] = shl <2 x i64> [[TMP55]], splat (i64 5) // CHECK-NEXT: store volatile <2 x i64> [[SHL50]], ptr @ul, align 8 +// CHECK-NEXT: [[TMP56:%.*]] = load volatile <1 x i128>, ptr @slll2, align 8 +// CHECK-NEXT: [[TMP57:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[SHL51:%.*]] = shl <1 x i128> [[TMP57]], [[TMP56]] +// CHECK-NEXT: store volatile <1 x i128> [[SHL51]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP58:%.*]] = load volatile <1 x i128>, ptr @ulll2, align 8 +// CHECK-NEXT: [[TMP59:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[SHL52:%.*]] = shl <1 x i128> [[TMP59]], [[TMP58]] +// CHECK-NEXT: store volatile <1 x i128> [[SHL52]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP60:%.*]] = load volatile i32, ptr @cnt, align 4 +// CHECK-NEXT: [[SPLAT_SPLATINSERT53:%.*]] = insertelement <1 x i32> poison, i32 [[TMP60]], i64 0 +// CHECK-NEXT: [[SPLAT_SPLAT54:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT53]], <1 x i32> poison, <1 x i32> zeroinitializer +// CHECK-NEXT: [[TMP61:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[SH_PROM55:%.*]] = zext <1 x i32> [[SPLAT_SPLAT54]] to <1 x i128> +// CHECK-NEXT: [[SHL56:%.*]] = shl <1 x i128> [[TMP61]], [[SH_PROM55]] +// CHECK-NEXT: store volatile <1 x i128> [[SHL56]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP62:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[SHL57:%.*]] = shl <1 x i128> [[TMP62]], splat (i128 5) +// CHECK-NEXT: store volatile <1 x i128> [[SHL57]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP63:%.*]] = load volatile <1 x i128>, ptr @slll2, align 8 +// CHECK-NEXT: [[TMP64:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[SHL58:%.*]] = shl <1 x i128> [[TMP64]], [[TMP63]] +// CHECK-NEXT: store volatile <1 x i128> [[SHL58]], ptr @ulll, align 8 +// CHECK-NEXT: [[TMP65:%.*]] = load volatile <1 x i128>, ptr @ulll2, align 8 +// CHECK-NEXT: [[TMP66:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[SHL59:%.*]] = shl <1 x i128> [[TMP66]], [[TMP65]] +// CHECK-NEXT: store volatile <1 x i128> [[SHL59]], ptr @ulll, align 8 +// CHECK-NEXT: [[TMP67:%.*]] = load volatile i32, ptr @cnt, align 4 +// CHECK-NEXT: [[SPLAT_SPLATINSERT60:%.*]] = insertelement <1 x i32> poison, i32 [[TMP67]], i64 0 +// CHECK-NEXT: [[SPLAT_SPLAT61:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT60]], <1 x i32> poison, <1 x i32> zeroinitializer +// CHECK-NEXT: [[TMP68:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[SH_PROM62:%.*]] = zext <1 x i32> [[SPLAT_SPLAT61]] to <1 x i128> +// CHECK-NEXT: [[SHL63:%.*]] = shl <1 x i128> [[TMP68]], [[SH_PROM62]] +// CHECK-NEXT: store volatile <1 x i128> [[SHL63]], ptr @ulll, align 8 +// CHECK-NEXT: [[TMP69:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[SHL64:%.*]] = shl <1 x i128> [[TMP69]], splat (i128 5) +// CHECK-NEXT: store volatile <1 x i128> [[SHL64]], ptr @ulll, align 8 // CHECK-NEXT: ret void // void test_sl_assign(void) { @@ -2358,6 +2866,15 @@ void test_sl_assign(void) { ul <<= ul2; ul <<= cnt; ul <<= 5; + + slll <<= slll2; + slll <<= ulll2; + slll <<= cnt; + slll <<= 5; + ulll <<= slll2; + ulll <<= ulll2; + ulll <<= cnt; + ulll <<= 5; } // CHECK-LABEL: define dso_local void 
@test_sr( @@ -2505,6 +3022,42 @@ void test_sl_assign(void) { // CHECK-NEXT: [[TMP55:%.*]] = load volatile <2 x i64>, ptr @ul, align 8 // CHECK-NEXT: [[SHR50:%.*]] = lshr <2 x i64> [[TMP55]], splat (i64 5) // CHECK-NEXT: store volatile <2 x i64> [[SHR50]], ptr @ul, align 8 +// CHECK-NEXT: [[TMP56:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[TMP57:%.*]] = load volatile <1 x i128>, ptr @slll2, align 8 +// CHECK-NEXT: [[SHR51:%.*]] = ashr <1 x i128> [[TMP56]], [[TMP57]] +// CHECK-NEXT: store volatile <1 x i128> [[SHR51]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP58:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[TMP59:%.*]] = load volatile <1 x i128>, ptr @ulll2, align 8 +// CHECK-NEXT: [[SHR52:%.*]] = ashr <1 x i128> [[TMP58]], [[TMP59]] +// CHECK-NEXT: store volatile <1 x i128> [[SHR52]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP60:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[TMP61:%.*]] = load volatile i32, ptr @cnt, align 4 +// CHECK-NEXT: [[SPLAT_SPLATINSERT53:%.*]] = insertelement <1 x i32> poison, i32 [[TMP61]], i64 0 +// CHECK-NEXT: [[SPLAT_SPLAT54:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT53]], <1 x i32> poison, <1 x i32> zeroinitializer +// CHECK-NEXT: [[SH_PROM55:%.*]] = zext <1 x i32> [[SPLAT_SPLAT54]] to <1 x i128> +// CHECK-NEXT: [[SHR56:%.*]] = ashr <1 x i128> [[TMP60]], [[SH_PROM55]] +// CHECK-NEXT: store volatile <1 x i128> [[SHR56]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP62:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[SHR57:%.*]] = ashr <1 x i128> [[TMP62]], splat (i128 5) +// CHECK-NEXT: store volatile <1 x i128> [[SHR57]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP63:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[TMP64:%.*]] = load volatile <1 x i128>, ptr @slll2, align 8 +// CHECK-NEXT: [[SHR58:%.*]] = lshr <1 x i128> [[TMP63]], [[TMP64]] +// CHECK-NEXT: store volatile <1 x i128> [[SHR58]], ptr @ulll, align 8 +// CHECK-NEXT: [[TMP65:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[TMP66:%.*]] = load volatile <1 x i128>, ptr @ulll2, align 8 +// CHECK-NEXT: [[SHR59:%.*]] = lshr <1 x i128> [[TMP65]], [[TMP66]] +// CHECK-NEXT: store volatile <1 x i128> [[SHR59]], ptr @ulll, align 8 +// CHECK-NEXT: [[TMP67:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[TMP68:%.*]] = load volatile i32, ptr @cnt, align 4 +// CHECK-NEXT: [[SPLAT_SPLATINSERT60:%.*]] = insertelement <1 x i32> poison, i32 [[TMP68]], i64 0 +// CHECK-NEXT: [[SPLAT_SPLAT61:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT60]], <1 x i32> poison, <1 x i32> zeroinitializer +// CHECK-NEXT: [[SH_PROM62:%.*]] = zext <1 x i32> [[SPLAT_SPLAT61]] to <1 x i128> +// CHECK-NEXT: [[SHR63:%.*]] = lshr <1 x i128> [[TMP67]], [[SH_PROM62]] +// CHECK-NEXT: store volatile <1 x i128> [[SHR63]], ptr @ulll, align 8 +// CHECK-NEXT: [[TMP69:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[SHR64:%.*]] = lshr <1 x i128> [[TMP69]], splat (i128 5) +// CHECK-NEXT: store volatile <1 x i128> [[SHR64]], ptr @ulll, align 8 // CHECK-NEXT: ret void // void test_sr(void) { @@ -2544,6 +3097,15 @@ void test_sr(void) { ul = ul >> ul2; ul = ul >> cnt; ul = ul >> 5; + + slll = slll >> slll2; + slll = slll >> ulll2; + slll = slll >> cnt; + slll = slll >> 5; + ulll = ulll >> slll2; + ulll = ulll >> ulll2; + ulll = ulll >> cnt; + ulll = ulll >> 5; } // CHECK-LABEL: define dso_local void @test_sr_assign( @@ -2691,6 +3253,42 @@ void test_sr(void) { // CHECK-NEXT: 
[[TMP55:%.*]] = load volatile <2 x i64>, ptr @ul, align 8 // CHECK-NEXT: [[SHR50:%.*]] = lshr <2 x i64> [[TMP55]], splat (i64 5) // CHECK-NEXT: store volatile <2 x i64> [[SHR50]], ptr @ul, align 8 +// CHECK-NEXT: [[TMP56:%.*]] = load volatile <1 x i128>, ptr @slll2, align 8 +// CHECK-NEXT: [[TMP57:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[SHR51:%.*]] = ashr <1 x i128> [[TMP57]], [[TMP56]] +// CHECK-NEXT: store volatile <1 x i128> [[SHR51]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP58:%.*]] = load volatile <1 x i128>, ptr @ulll2, align 8 +// CHECK-NEXT: [[TMP59:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[SHR52:%.*]] = ashr <1 x i128> [[TMP59]], [[TMP58]] +// CHECK-NEXT: store volatile <1 x i128> [[SHR52]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP60:%.*]] = load volatile i32, ptr @cnt, align 4 +// CHECK-NEXT: [[SPLAT_SPLATINSERT53:%.*]] = insertelement <1 x i32> poison, i32 [[TMP60]], i64 0 +// CHECK-NEXT: [[SPLAT_SPLAT54:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT53]], <1 x i32> poison, <1 x i32> zeroinitializer +// CHECK-NEXT: [[TMP61:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[SH_PROM55:%.*]] = zext <1 x i32> [[SPLAT_SPLAT54]] to <1 x i128> +// CHECK-NEXT: [[SHR56:%.*]] = ashr <1 x i128> [[TMP61]], [[SH_PROM55]] +// CHECK-NEXT: store volatile <1 x i128> [[SHR56]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP62:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[SHR57:%.*]] = ashr <1 x i128> [[TMP62]], splat (i128 5) +// CHECK-NEXT: store volatile <1 x i128> [[SHR57]], ptr @slll, align 8 +// CHECK-NEXT: [[TMP63:%.*]] = load volatile <1 x i128>, ptr @slll2, align 8 +// CHECK-NEXT: [[TMP64:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[SHR58:%.*]] = lshr <1 x i128> [[TMP64]], [[TMP63]] +// CHECK-NEXT: store volatile <1 x i128> [[SHR58]], ptr @ulll, align 8 +// CHECK-NEXT: [[TMP65:%.*]] = load volatile <1 x i128>, ptr @ulll2, align 8 +// CHECK-NEXT: [[TMP66:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[SHR59:%.*]] = lshr <1 x i128> [[TMP66]], [[TMP65]] +// CHECK-NEXT: store volatile <1 x i128> [[SHR59]], ptr @ulll, align 8 +// CHECK-NEXT: [[TMP67:%.*]] = load volatile i32, ptr @cnt, align 4 +// CHECK-NEXT: [[SPLAT_SPLATINSERT60:%.*]] = insertelement <1 x i32> poison, i32 [[TMP67]], i64 0 +// CHECK-NEXT: [[SPLAT_SPLAT61:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT60]], <1 x i32> poison, <1 x i32> zeroinitializer +// CHECK-NEXT: [[TMP68:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[SH_PROM62:%.*]] = zext <1 x i32> [[SPLAT_SPLAT61]] to <1 x i128> +// CHECK-NEXT: [[SHR63:%.*]] = lshr <1 x i128> [[TMP68]], [[SH_PROM62]] +// CHECK-NEXT: store volatile <1 x i128> [[SHR63]], ptr @ulll, align 8 +// CHECK-NEXT: [[TMP69:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[SHR64:%.*]] = lshr <1 x i128> [[TMP69]], splat (i128 5) +// CHECK-NEXT: store volatile <1 x i128> [[SHR64]], ptr @ulll, align 8 // CHECK-NEXT: ret void // void test_sr_assign(void) { @@ -2730,6 +3328,15 @@ void test_sr_assign(void) { ul >>= ul2; ul >>= cnt; ul >>= 5; + + slll >>= slll2; + slll >>= ulll2; + slll >>= cnt; + slll >>= 5; + ulll >>= slll2; + ulll >>= ulll2; + ulll >>= cnt; + ulll >>= 5; } @@ -2876,11 +3483,46 @@ void test_sr_assign(void) { // CHECK-NEXT: [[CMP53:%.*]] = icmp eq <2 x i64> [[TMP54]], [[TMP55]] // CHECK-NEXT: [[SEXT54:%.*]] = sext <2 x i1> [[CMP53]] to <2 x i64> // CHECK-NEXT: store volatile <2 x i64> 
[[SEXT54]], ptr @bl, align 8 -// CHECK-NEXT: [[TMP56:%.*]] = load volatile <2 x double>, ptr @fd, align 8 -// CHECK-NEXT: [[TMP57:%.*]] = load volatile <2 x double>, ptr @fd2, align 8 -// CHECK-NEXT: [[CMP55:%.*]] = fcmp oeq <2 x double> [[TMP56]], [[TMP57]] -// CHECK-NEXT: [[SEXT56:%.*]] = sext <2 x i1> [[CMP55]] to <2 x i64> -// CHECK-NEXT: store volatile <2 x i64> [[SEXT56]], ptr @bl, align 8 +// CHECK-NEXT: [[TMP56:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[TMP57:%.*]] = load volatile <1 x i128>, ptr @slll2, align 8 +// CHECK-NEXT: [[CMP55:%.*]] = icmp eq <1 x i128> [[TMP56]], [[TMP57]] +// CHECK-NEXT: [[SEXT56:%.*]] = sext <1 x i1> [[CMP55]] to <1 x i128> +// CHECK-NEXT: store volatile <1 x i128> [[SEXT56]], ptr @blll, align 8 +// CHECK-NEXT: [[TMP58:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[TMP59:%.*]] = load volatile <1 x i128>, ptr @blll2, align 8 +// CHECK-NEXT: [[CMP57:%.*]] = icmp eq <1 x i128> [[TMP58]], [[TMP59]] +// CHECK-NEXT: [[SEXT58:%.*]] = sext <1 x i1> [[CMP57]] to <1 x i128> +// CHECK-NEXT: store volatile <1 x i128> [[SEXT58]], ptr @blll, align 8 +// CHECK-NEXT: [[TMP60:%.*]] = load volatile <1 x i128>, ptr @blll, align 8 +// CHECK-NEXT: [[TMP61:%.*]] = load volatile <1 x i128>, ptr @slll2, align 8 +// CHECK-NEXT: [[CMP59:%.*]] = icmp eq <1 x i128> [[TMP60]], [[TMP61]] +// CHECK-NEXT: [[SEXT60:%.*]] = sext <1 x i1> [[CMP59]] to <1 x i128> +// CHECK-NEXT: store volatile <1 x i128> [[SEXT60]], ptr @blll, align 8 +// CHECK-NEXT: [[TMP62:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[TMP63:%.*]] = load volatile <1 x i128>, ptr @ulll2, align 8 +// CHECK-NEXT: [[CMP61:%.*]] = icmp eq <1 x i128> [[TMP62]], [[TMP63]] +// CHECK-NEXT: [[SEXT62:%.*]] = sext <1 x i1> [[CMP61]] to <1 x i128> +// CHECK-NEXT: store volatile <1 x i128> [[SEXT62]], ptr @blll, align 8 +// CHECK-NEXT: [[TMP64:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[TMP65:%.*]] = load volatile <1 x i128>, ptr @blll2, align 8 +// CHECK-NEXT: [[CMP63:%.*]] = icmp eq <1 x i128> [[TMP64]], [[TMP65]] +// CHECK-NEXT: [[SEXT64:%.*]] = sext <1 x i1> [[CMP63]] to <1 x i128> +// CHECK-NEXT: store volatile <1 x i128> [[SEXT64]], ptr @blll, align 8 +// CHECK-NEXT: [[TMP66:%.*]] = load volatile <1 x i128>, ptr @blll, align 8 +// CHECK-NEXT: [[TMP67:%.*]] = load volatile <1 x i128>, ptr @ulll2, align 8 +// CHECK-NEXT: [[CMP65:%.*]] = icmp eq <1 x i128> [[TMP66]], [[TMP67]] +// CHECK-NEXT: [[SEXT66:%.*]] = sext <1 x i1> [[CMP65]] to <1 x i128> +// CHECK-NEXT: store volatile <1 x i128> [[SEXT66]], ptr @blll, align 8 +// CHECK-NEXT: [[TMP68:%.*]] = load volatile <1 x i128>, ptr @blll, align 8 +// CHECK-NEXT: [[TMP69:%.*]] = load volatile <1 x i128>, ptr @blll2, align 8 +// CHECK-NEXT: [[CMP67:%.*]] = icmp eq <1 x i128> [[TMP68]], [[TMP69]] +// CHECK-NEXT: [[SEXT68:%.*]] = sext <1 x i1> [[CMP67]] to <1 x i128> +// CHECK-NEXT: store volatile <1 x i128> [[SEXT68]], ptr @blll, align 8 +// CHECK-NEXT: [[TMP70:%.*]] = load volatile <2 x double>, ptr @fd, align 8 +// CHECK-NEXT: [[TMP71:%.*]] = load volatile <2 x double>, ptr @fd2, align 8 +// CHECK-NEXT: [[CMP69:%.*]] = fcmp oeq <2 x double> [[TMP70]], [[TMP71]] +// CHECK-NEXT: [[SEXT70:%.*]] = sext <2 x i1> [[CMP69]] to <2 x i64> +// CHECK-NEXT: store volatile <2 x i64> [[SEXT70]], ptr @bl, align 8 // CHECK-NEXT: ret void // void test_cmpeq(void) { @@ -2917,6 +3559,14 @@ void test_cmpeq(void) { bl = bl == ul2; bl = bl == bl2; + blll = slll == slll2; + blll = slll == blll2; + blll = 
blll == slll2; + blll = ulll == ulll2; + blll = ulll == blll2; + blll = blll == ulll2; + blll = blll == blll2; + bl = fd == fd2; } @@ -3063,11 +3713,46 @@ void test_cmpeq(void) { // CHECK-NEXT: [[CMP53:%.*]] = icmp ne <2 x i64> [[TMP54]], [[TMP55]] // CHECK-NEXT: [[SEXT54:%.*]] = sext <2 x i1> [[CMP53]] to <2 x i64> // CHECK-NEXT: store volatile <2 x i64> [[SEXT54]], ptr @bl, align 8 -// CHECK-NEXT: [[TMP56:%.*]] = load volatile <2 x double>, ptr @fd, align 8 -// CHECK-NEXT: [[TMP57:%.*]] = load volatile <2 x double>, ptr @fd2, align 8 -// CHECK-NEXT: [[CMP55:%.*]] = fcmp une <2 x double> [[TMP56]], [[TMP57]] -// CHECK-NEXT: [[SEXT56:%.*]] = sext <2 x i1> [[CMP55]] to <2 x i64> -// CHECK-NEXT: store volatile <2 x i64> [[SEXT56]], ptr @bl, align 8 +// CHECK-NEXT: [[TMP56:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[TMP57:%.*]] = load volatile <1 x i128>, ptr @slll2, align 8 +// CHECK-NEXT: [[CMP55:%.*]] = icmp ne <1 x i128> [[TMP56]], [[TMP57]] +// CHECK-NEXT: [[SEXT56:%.*]] = sext <1 x i1> [[CMP55]] to <1 x i128> +// CHECK-NEXT: store volatile <1 x i128> [[SEXT56]], ptr @blll, align 8 +// CHECK-NEXT: [[TMP58:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[TMP59:%.*]] = load volatile <1 x i128>, ptr @blll2, align 8 +// CHECK-NEXT: [[CMP57:%.*]] = icmp ne <1 x i128> [[TMP58]], [[TMP59]] +// CHECK-NEXT: [[SEXT58:%.*]] = sext <1 x i1> [[CMP57]] to <1 x i128> +// CHECK-NEXT: store volatile <1 x i128> [[SEXT58]], ptr @blll, align 8 +// CHECK-NEXT: [[TMP60:%.*]] = load volatile <1 x i128>, ptr @blll, align 8 +// CHECK-NEXT: [[TMP61:%.*]] = load volatile <1 x i128>, ptr @slll2, align 8 +// CHECK-NEXT: [[CMP59:%.*]] = icmp ne <1 x i128> [[TMP60]], [[TMP61]] +// CHECK-NEXT: [[SEXT60:%.*]] = sext <1 x i1> [[CMP59]] to <1 x i128> +// CHECK-NEXT: store volatile <1 x i128> [[SEXT60]], ptr @blll, align 8 +// CHECK-NEXT: [[TMP62:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[TMP63:%.*]] = load volatile <1 x i128>, ptr @ulll2, align 8 +// CHECK-NEXT: [[CMP61:%.*]] = icmp ne <1 x i128> [[TMP62]], [[TMP63]] +// CHECK-NEXT: [[SEXT62:%.*]] = sext <1 x i1> [[CMP61]] to <1 x i128> +// CHECK-NEXT: store volatile <1 x i128> [[SEXT62]], ptr @blll, align 8 +// CHECK-NEXT: [[TMP64:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[TMP65:%.*]] = load volatile <1 x i128>, ptr @blll2, align 8 +// CHECK-NEXT: [[CMP63:%.*]] = icmp ne <1 x i128> [[TMP64]], [[TMP65]] +// CHECK-NEXT: [[SEXT64:%.*]] = sext <1 x i1> [[CMP63]] to <1 x i128> +// CHECK-NEXT: store volatile <1 x i128> [[SEXT64]], ptr @blll, align 8 +// CHECK-NEXT: [[TMP66:%.*]] = load volatile <1 x i128>, ptr @blll, align 8 +// CHECK-NEXT: [[TMP67:%.*]] = load volatile <1 x i128>, ptr @ulll2, align 8 +// CHECK-NEXT: [[CMP65:%.*]] = icmp ne <1 x i128> [[TMP66]], [[TMP67]] +// CHECK-NEXT: [[SEXT66:%.*]] = sext <1 x i1> [[CMP65]] to <1 x i128> +// CHECK-NEXT: store volatile <1 x i128> [[SEXT66]], ptr @blll, align 8 +// CHECK-NEXT: [[TMP68:%.*]] = load volatile <1 x i128>, ptr @blll, align 8 +// CHECK-NEXT: [[TMP69:%.*]] = load volatile <1 x i128>, ptr @blll2, align 8 +// CHECK-NEXT: [[CMP67:%.*]] = icmp ne <1 x i128> [[TMP68]], [[TMP69]] +// CHECK-NEXT: [[SEXT68:%.*]] = sext <1 x i1> [[CMP67]] to <1 x i128> +// CHECK-NEXT: store volatile <1 x i128> [[SEXT68]], ptr @blll, align 8 +// CHECK-NEXT: [[TMP70:%.*]] = load volatile <2 x double>, ptr @fd, align 8 +// CHECK-NEXT: [[TMP71:%.*]] = load volatile <2 x double>, ptr @fd2, align 8 +// CHECK-NEXT: [[CMP69:%.*]] = fcmp une <2 x 
double> [[TMP70]], [[TMP71]] +// CHECK-NEXT: [[SEXT70:%.*]] = sext <2 x i1> [[CMP69]] to <2 x i64> +// CHECK-NEXT: store volatile <2 x i64> [[SEXT70]], ptr @bl, align 8 // CHECK-NEXT: ret void // void test_cmpne(void) { @@ -3104,6 +3789,14 @@ void test_cmpne(void) { bl = bl != ul2; bl = bl != bl2; + blll = slll != slll2; + blll = slll != blll2; + blll = blll != slll2; + blll = ulll != ulll2; + blll = ulll != blll2; + blll = blll != ulll2; + blll = blll != blll2; + bl = fd != fd2; } @@ -3170,11 +3863,26 @@ void test_cmpne(void) { // CHECK-NEXT: [[CMP21:%.*]] = icmp uge <2 x i64> [[TMP22]], [[TMP23]] // CHECK-NEXT: [[SEXT22:%.*]] = sext <2 x i1> [[CMP21]] to <2 x i64> // CHECK-NEXT: store volatile <2 x i64> [[SEXT22]], ptr @bl, align 8 -// CHECK-NEXT: [[TMP24:%.*]] = load volatile <2 x double>, ptr @fd, align 8 -// CHECK-NEXT: [[TMP25:%.*]] = load volatile <2 x double>, ptr @fd2, align 8 -// CHECK-NEXT: [[CMP23:%.*]] = fcmp oge <2 x double> [[TMP24]], [[TMP25]] -// CHECK-NEXT: [[SEXT24:%.*]] = sext <2 x i1> [[CMP23]] to <2 x i64> -// CHECK-NEXT: store volatile <2 x i64> [[SEXT24]], ptr @bl, align 8 +// CHECK-NEXT: [[TMP24:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[TMP25:%.*]] = load volatile <1 x i128>, ptr @slll2, align 8 +// CHECK-NEXT: [[CMP23:%.*]] = icmp sge <1 x i128> [[TMP24]], [[TMP25]] +// CHECK-NEXT: [[SEXT24:%.*]] = sext <1 x i1> [[CMP23]] to <1 x i128> +// CHECK-NEXT: store volatile <1 x i128> [[SEXT24]], ptr @blll, align 8 +// CHECK-NEXT: [[TMP26:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[TMP27:%.*]] = load volatile <1 x i128>, ptr @ulll2, align 8 +// CHECK-NEXT: [[CMP25:%.*]] = icmp uge <1 x i128> [[TMP26]], [[TMP27]] +// CHECK-NEXT: [[SEXT26:%.*]] = sext <1 x i1> [[CMP25]] to <1 x i128> +// CHECK-NEXT: store volatile <1 x i128> [[SEXT26]], ptr @blll, align 8 +// CHECK-NEXT: [[TMP28:%.*]] = load volatile <1 x i128>, ptr @blll, align 8 +// CHECK-NEXT: [[TMP29:%.*]] = load volatile <1 x i128>, ptr @blll2, align 8 +// CHECK-NEXT: [[CMP27:%.*]] = icmp uge <1 x i128> [[TMP28]], [[TMP29]] +// CHECK-NEXT: [[SEXT28:%.*]] = sext <1 x i1> [[CMP27]] to <1 x i128> +// CHECK-NEXT: store volatile <1 x i128> [[SEXT28]], ptr @blll, align 8 +// CHECK-NEXT: [[TMP30:%.*]] = load volatile <2 x double>, ptr @fd, align 8 +// CHECK-NEXT: [[TMP31:%.*]] = load volatile <2 x double>, ptr @fd2, align 8 +// CHECK-NEXT: [[CMP29:%.*]] = fcmp oge <2 x double> [[TMP30]], [[TMP31]] +// CHECK-NEXT: [[SEXT30:%.*]] = sext <2 x i1> [[CMP29]] to <2 x i64> +// CHECK-NEXT: store volatile <2 x i64> [[SEXT30]], ptr @bl, align 8 // CHECK-NEXT: ret void // void test_cmpge(void) { @@ -3195,6 +3903,10 @@ void test_cmpge(void) { bl = ul >= ul2; bl = bl >= bl2; + blll = slll >= slll2; + blll = ulll >= ulll2; + blll = blll >= blll2; + bl = fd >= fd2; } @@ -3261,11 +3973,26 @@ void test_cmpge(void) { // CHECK-NEXT: [[CMP21:%.*]] = icmp ugt <2 x i64> [[TMP22]], [[TMP23]] // CHECK-NEXT: [[SEXT22:%.*]] = sext <2 x i1> [[CMP21]] to <2 x i64> // CHECK-NEXT: store volatile <2 x i64> [[SEXT22]], ptr @bl, align 8 -// CHECK-NEXT: [[TMP24:%.*]] = load volatile <2 x double>, ptr @fd, align 8 -// CHECK-NEXT: [[TMP25:%.*]] = load volatile <2 x double>, ptr @fd2, align 8 -// CHECK-NEXT: [[CMP23:%.*]] = fcmp ogt <2 x double> [[TMP24]], [[TMP25]] -// CHECK-NEXT: [[SEXT24:%.*]] = sext <2 x i1> [[CMP23]] to <2 x i64> -// CHECK-NEXT: store volatile <2 x i64> [[SEXT24]], ptr @bl, align 8 +// CHECK-NEXT: [[TMP24:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: 
[[TMP25:%.*]] = load volatile <1 x i128>, ptr @slll2, align 8 +// CHECK-NEXT: [[CMP23:%.*]] = icmp sgt <1 x i128> [[TMP24]], [[TMP25]] +// CHECK-NEXT: [[SEXT24:%.*]] = sext <1 x i1> [[CMP23]] to <1 x i128> +// CHECK-NEXT: store volatile <1 x i128> [[SEXT24]], ptr @blll, align 8 +// CHECK-NEXT: [[TMP26:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[TMP27:%.*]] = load volatile <1 x i128>, ptr @ulll2, align 8 +// CHECK-NEXT: [[CMP25:%.*]] = icmp ugt <1 x i128> [[TMP26]], [[TMP27]] +// CHECK-NEXT: [[SEXT26:%.*]] = sext <1 x i1> [[CMP25]] to <1 x i128> +// CHECK-NEXT: store volatile <1 x i128> [[SEXT26]], ptr @blll, align 8 +// CHECK-NEXT: [[TMP28:%.*]] = load volatile <1 x i128>, ptr @blll, align 8 +// CHECK-NEXT: [[TMP29:%.*]] = load volatile <1 x i128>, ptr @blll2, align 8 +// CHECK-NEXT: [[CMP27:%.*]] = icmp ugt <1 x i128> [[TMP28]], [[TMP29]] +// CHECK-NEXT: [[SEXT28:%.*]] = sext <1 x i1> [[CMP27]] to <1 x i128> +// CHECK-NEXT: store volatile <1 x i128> [[SEXT28]], ptr @blll, align 8 +// CHECK-NEXT: [[TMP30:%.*]] = load volatile <2 x double>, ptr @fd, align 8 +// CHECK-NEXT: [[TMP31:%.*]] = load volatile <2 x double>, ptr @fd2, align 8 +// CHECK-NEXT: [[CMP29:%.*]] = fcmp ogt <2 x double> [[TMP30]], [[TMP31]] +// CHECK-NEXT: [[SEXT30:%.*]] = sext <2 x i1> [[CMP29]] to <2 x i64> +// CHECK-NEXT: store volatile <2 x i64> [[SEXT30]], ptr @bl, align 8 // CHECK-NEXT: ret void // void test_cmpgt(void) { @@ -3286,6 +4013,10 @@ void test_cmpgt(void) { bl = ul > ul2; bl = bl > bl2; + blll = slll > slll2; + blll = ulll > ulll2; + blll = blll > blll2; + bl = fd > fd2; } @@ -3352,11 +4083,26 @@ void test_cmpgt(void) { // CHECK-NEXT: [[CMP21:%.*]] = icmp ule <2 x i64> [[TMP22]], [[TMP23]] // CHECK-NEXT: [[SEXT22:%.*]] = sext <2 x i1> [[CMP21]] to <2 x i64> // CHECK-NEXT: store volatile <2 x i64> [[SEXT22]], ptr @bl, align 8 -// CHECK-NEXT: [[TMP24:%.*]] = load volatile <2 x double>, ptr @fd, align 8 -// CHECK-NEXT: [[TMP25:%.*]] = load volatile <2 x double>, ptr @fd2, align 8 -// CHECK-NEXT: [[CMP23:%.*]] = fcmp ole <2 x double> [[TMP24]], [[TMP25]] -// CHECK-NEXT: [[SEXT24:%.*]] = sext <2 x i1> [[CMP23]] to <2 x i64> -// CHECK-NEXT: store volatile <2 x i64> [[SEXT24]], ptr @bl, align 8 +// CHECK-NEXT: [[TMP24:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[TMP25:%.*]] = load volatile <1 x i128>, ptr @slll2, align 8 +// CHECK-NEXT: [[CMP23:%.*]] = icmp sle <1 x i128> [[TMP24]], [[TMP25]] +// CHECK-NEXT: [[SEXT24:%.*]] = sext <1 x i1> [[CMP23]] to <1 x i128> +// CHECK-NEXT: store volatile <1 x i128> [[SEXT24]], ptr @blll, align 8 +// CHECK-NEXT: [[TMP26:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[TMP27:%.*]] = load volatile <1 x i128>, ptr @ulll2, align 8 +// CHECK-NEXT: [[CMP25:%.*]] = icmp ule <1 x i128> [[TMP26]], [[TMP27]] +// CHECK-NEXT: [[SEXT26:%.*]] = sext <1 x i1> [[CMP25]] to <1 x i128> +// CHECK-NEXT: store volatile <1 x i128> [[SEXT26]], ptr @blll, align 8 +// CHECK-NEXT: [[TMP28:%.*]] = load volatile <1 x i128>, ptr @blll, align 8 +// CHECK-NEXT: [[TMP29:%.*]] = load volatile <1 x i128>, ptr @blll2, align 8 +// CHECK-NEXT: [[CMP27:%.*]] = icmp ule <1 x i128> [[TMP28]], [[TMP29]] +// CHECK-NEXT: [[SEXT28:%.*]] = sext <1 x i1> [[CMP27]] to <1 x i128> +// CHECK-NEXT: store volatile <1 x i128> [[SEXT28]], ptr @blll, align 8 +// CHECK-NEXT: [[TMP30:%.*]] = load volatile <2 x double>, ptr @fd, align 8 +// CHECK-NEXT: [[TMP31:%.*]] = load volatile <2 x double>, ptr @fd2, align 8 +// CHECK-NEXT: [[CMP29:%.*]] = fcmp ole <2 x 
double> [[TMP30]], [[TMP31]] +// CHECK-NEXT: [[SEXT30:%.*]] = sext <2 x i1> [[CMP29]] to <2 x i64> +// CHECK-NEXT: store volatile <2 x i64> [[SEXT30]], ptr @bl, align 8 // CHECK-NEXT: ret void // void test_cmple(void) { @@ -3377,6 +4123,10 @@ void test_cmple(void) { bl = ul <= ul2; bl = bl <= bl2; + blll = slll <= slll2; + blll = ulll <= ulll2; + blll = blll <= blll2; + bl = fd <= fd2; } @@ -3443,11 +4193,26 @@ void test_cmple(void) { // CHECK-NEXT: [[CMP21:%.*]] = icmp ult <2 x i64> [[TMP22]], [[TMP23]] // CHECK-NEXT: [[SEXT22:%.*]] = sext <2 x i1> [[CMP21]] to <2 x i64> // CHECK-NEXT: store volatile <2 x i64> [[SEXT22]], ptr @bl, align 8 -// CHECK-NEXT: [[TMP24:%.*]] = load volatile <2 x double>, ptr @fd, align 8 -// CHECK-NEXT: [[TMP25:%.*]] = load volatile <2 x double>, ptr @fd2, align 8 -// CHECK-NEXT: [[CMP23:%.*]] = fcmp olt <2 x double> [[TMP24]], [[TMP25]] -// CHECK-NEXT: [[SEXT24:%.*]] = sext <2 x i1> [[CMP23]] to <2 x i64> -// CHECK-NEXT: store volatile <2 x i64> [[SEXT24]], ptr @bl, align 8 +// CHECK-NEXT: [[TMP24:%.*]] = load volatile <1 x i128>, ptr @slll, align 8 +// CHECK-NEXT: [[TMP25:%.*]] = load volatile <1 x i128>, ptr @slll2, align 8 +// CHECK-NEXT: [[CMP23:%.*]] = icmp slt <1 x i128> [[TMP24]], [[TMP25]] +// CHECK-NEXT: [[SEXT24:%.*]] = sext <1 x i1> [[CMP23]] to <1 x i128> +// CHECK-NEXT: store volatile <1 x i128> [[SEXT24]], ptr @blll, align 8 +// CHECK-NEXT: [[TMP26:%.*]] = load volatile <1 x i128>, ptr @ulll, align 8 +// CHECK-NEXT: [[TMP27:%.*]] = load volatile <1 x i128>, ptr @ulll2, align 8 +// CHECK-NEXT: [[CMP25:%.*]] = icmp ult <1 x i128> [[TMP26]], [[TMP27]] +// CHECK-NEXT: [[SEXT26:%.*]] = sext <1 x i1> [[CMP25]] to <1 x i128> +// CHECK-NEXT: store volatile <1 x i128> [[SEXT26]], ptr @blll, align 8 +// CHECK-NEXT: [[TMP28:%.*]] = load volatile <1 x i128>, ptr @blll, align 8 +// CHECK-NEXT: [[TMP29:%.*]] = load volatile <1 x i128>, ptr @blll2, align 8 +// CHECK-NEXT: [[CMP27:%.*]] = icmp ult <1 x i128> [[TMP28]], [[TMP29]] +// CHECK-NEXT: [[SEXT28:%.*]] = sext <1 x i1> [[CMP27]] to <1 x i128> +// CHECK-NEXT: store volatile <1 x i128> [[SEXT28]], ptr @blll, align 8 +// CHECK-NEXT: [[TMP30:%.*]] = load volatile <2 x double>, ptr @fd, align 8 +// CHECK-NEXT: [[TMP31:%.*]] = load volatile <2 x double>, ptr @fd2, align 8 +// CHECK-NEXT: [[CMP29:%.*]] = fcmp olt <2 x double> [[TMP30]], [[TMP31]] +// CHECK-NEXT: [[SEXT30:%.*]] = sext <2 x i1> [[CMP29]] to <2 x i64> +// CHECK-NEXT: store volatile <2 x i64> [[SEXT30]], ptr @bl, align 8 // CHECK-NEXT: ret void // void test_cmplt(void) { @@ -3468,6 +4233,10 @@ void test_cmplt(void) { bl = ul < ul2; bl = bl < bl2; + blll = slll < slll2; + blll = ulll < ulll2; + blll = blll < blll2; + bl = fd < fd2; } diff --git clang/test/CodeGen/X86/avx10_2_512minmax-builtins.c clang/test/CodeGen/X86/avx10_2_512minmax-builtins.c index 4e80d8b36e19..4e467b36b234 100644 --- clang/test/CodeGen/X86/avx10_2_512minmax-builtins.c +++ clang/test/CodeGen/X86/avx10_2_512minmax-builtins.c @@ -5,25 +5,25 @@ #include <immintrin.h> -__m512bh test_mm512_minmaxne_pbh(__m512bh __A, __m512bh __B) { - // CHECK-LABEL: @test_mm512_minmaxne_pbh( - // CHECK: call <32 x bfloat> @llvm.x86.avx10.vminmaxnepbf16512( - return _mm512_minmaxne_pbh(__A, __B, 127); +__m512bh test_mm512_minmax_pbh(__m512bh __A, __m512bh __B) { + // CHECK-LABEL: @test_mm512_minmax_pbh( + // CHECK: call <32 x bfloat> @llvm.x86.avx10.vminmaxbf16512( + return _mm512_minmax_pbh(__A, __B, 127); } -__m512bh test_mm512_mask_minmaxne_pbh(__m512bh __A, __mmask32 __B, __m512bh __C, __m512bh __D) 
{ - // CHECK-LABEL: @test_mm512_mask_minmaxne_pbh( - // CHECK: call <32 x bfloat> @llvm.x86.avx10.vminmaxnepbf16512( +__m512bh test_mm512_mask_minmax_pbh(__m512bh __A, __mmask32 __B, __m512bh __C, __m512bh __D) { + // CHECK-LABEL: @test_mm512_mask_minmax_pbh( + // CHECK: call <32 x bfloat> @llvm.x86.avx10.vminmaxbf16512( // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} - return _mm512_mask_minmaxne_pbh(__A, __B, __C, __D, 127); + return _mm512_mask_minmax_pbh(__A, __B, __C, __D, 127); } -__m512bh test_mm512_maskz_minmaxne_pbh(__mmask32 __A, __m512bh __B, __m512bh __C) { - // CHECK-LABEL: @test_mm512_maskz_minmaxne_pbh( - // CHECK: call <32 x bfloat> @llvm.x86.avx10.vminmaxnepbf16512( +__m512bh test_mm512_maskz_minmax_pbh(__mmask32 __A, __m512bh __B, __m512bh __C) { + // CHECK-LABEL: @test_mm512_maskz_minmax_pbh( + // CHECK: call <32 x bfloat> @llvm.x86.avx10.vminmaxbf16512( // CHECK: zeroinitializer // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} - return _mm512_maskz_minmaxne_pbh(__A, __B, __C, 127); + return _mm512_maskz_minmax_pbh(__A, __B, __C, 127); } __m512d test_mm512_minmax_pd(__m512d __A, __m512d __B) { diff --git clang/test/CodeGen/X86/avx10_2_512minmax-error.c clang/test/CodeGen/X86/avx10_2_512minmax-error.c index e487c3fad49d..6db7801eb004 100644 --- clang/test/CodeGen/X86/avx10_2_512minmax-error.c +++ clang/test/CodeGen/X86/avx10_2_512minmax-error.c @@ -5,20 +5,20 @@ #include <immintrin.h> -__m128bh test_mm_minmaxne_pbh(__m128bh __A, __m128bh __B) { - return _mm_minmaxne_pbh(__A, __B, 256); // expected-error {{argument value 256 is outside the valid range [0, 255]}} +__m128bh test_mm_minmax_pbh(__m128bh __A, __m128bh __B) { + return _mm_minmax_pbh(__A, __B, 256); // expected-error {{argument value 256 is outside the valid range [0, 255]}} } -__m128bh test_mm_mask_minmaxne_pbh(__m128bh __A, __mmask8 __B, __m128bh __C, __m128bh __D) { - return _mm_mask_minmaxne_pbh(__A, __B, __C, __D, 256); // expected-error {{argument value 256 is outside the valid range [0, 255]}} +__m128bh test_mm_mask_minmax_pbh(__m128bh __A, __mmask8 __B, __m128bh __C, __m128bh __D) { + return _mm_mask_minmax_pbh(__A, __B, __C, __D, 256); // expected-error {{argument value 256 is outside the valid range [0, 255]}} } -__m256bh test_mm256_minmaxne_pbh(__m256bh __A, __m256bh __B) { - return _mm256_minmaxne_pbh(__A, __B, 256); // expected-error {{argument value 256 is outside the valid range [0, 255]}} +__m256bh test_mm256_minmax_pbh(__m256bh __A, __m256bh __B) { + return _mm256_minmax_pbh(__A, __B, 256); // expected-error {{argument value 256 is outside the valid range [0, 255]}} } -__m256bh test_mm256_mask_minmaxne_pbh(__m256bh __A, __mmask16 __B, __m256bh __C, __m256bh __D) { - return _mm256_mask_minmaxne_pbh(__A, __B, __C, __D, 256); // expected-error {{argument value 256 is outside the valid range [0, 255]}} +__m256bh test_mm256_mask_minmax_pbh(__m256bh __A, __mmask16 __B, __m256bh __C, __m256bh __D) { + return _mm256_mask_minmax_pbh(__A, __B, __C, __D, 256); // expected-error {{argument value 256 is outside the valid range [0, 255]}} } __m128d test_mm_minmax_pd(__m128d __A, __m128d __B) { @@ -69,12 +69,12 @@ __m256 test_mm256_mask_minmax_ps(__m256 __A, __mmask8 __B, __m256 __C, __m256 __ return _mm256_mask_minmax_ps(__A, __B, __C, __D, 256); // expected-error {{argument value 256 is outside the valid range [0, 255]}} } -__m512bh test_mm512_minmaxne_pbh(__m512bh __A, __m512bh __B) { - return _mm512_minmaxne_pbh(__A, __B, 256); // 
expected-error {{argument value 256 is outside the valid range [0, 255]}} +__m512bh test_mm512_minmax_pbh(__m512bh __A, __m512bh __B) { + return _mm512_minmax_pbh(__A, __B, 256); // expected-error {{argument value 256 is outside the valid range [0, 255]}} } -__m512bh test_mm512_mask_minmaxne_pbh(__m512bh __A, __mmask32 __B, __m512bh __C, __m512bh __D) { - return _mm512_mask_minmaxne_pbh(__A, __B, __C, __D, 256); // expected-error {{argument value 256 is outside the valid range [0, 255]}} +__m512bh test_mm512_mask_minmax_pbh(__m512bh __A, __mmask32 __B, __m512bh __C, __m512bh __D) { + return _mm512_mask_minmax_pbh(__A, __B, __C, __D, 256); // expected-error {{argument value 256 is outside the valid range [0, 255]}} } __m512d test_mm512_minmax_pd(__m512d __A, __m512d __B) { diff --git clang/test/CodeGen/X86/avx10_2minmax-builtins.c clang/test/CodeGen/X86/avx10_2minmax-builtins.c index 1efafe24ab12..7e21858c7183 100644 --- clang/test/CodeGen/X86/avx10_2minmax-builtins.c +++ clang/test/CodeGen/X86/avx10_2minmax-builtins.c @@ -5,46 +5,46 @@ #include <immintrin.h> -__m128bh test_mm_minmaxne_pbh(__m128bh __A, __m128bh __B) { - // CHECK-LABEL: @test_mm_minmaxne_pbh( - // CHECK: call <8 x bfloat> @llvm.x86.avx10.vminmaxnepbf16128( - return _mm_minmaxne_pbh(__A, __B, 127); +__m128bh test_mm_minmax_pbh(__m128bh __A, __m128bh __B) { + // CHECK-LABEL: @test_mm_minmax_pbh( + // CHECK: call <8 x bfloat> @llvm.x86.avx10.vminmaxbf16128( + return _mm_minmax_pbh(__A, __B, 127); } -__m128bh test_mm_mask_minmaxne_pbh(__m128bh __A, __mmask8 __B, __m128bh __C, __m128bh __D) { - // CHECK-LABEL: @test_mm_mask_minmaxne_pbh( - // CHECK: call <8 x bfloat> @llvm.x86.avx10.vminmaxnepbf16128( +__m128bh test_mm_mask_minmax_pbh(__m128bh __A, __mmask8 __B, __m128bh __C, __m128bh __D) { + // CHECK-LABEL: @test_mm_mask_minmax_pbh( + // CHECK: call <8 x bfloat> @llvm.x86.avx10.vminmaxbf16128( // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} - return _mm_mask_minmaxne_pbh(__A, __B, __C, __D, 127); + return _mm_mask_minmax_pbh(__A, __B, __C, __D, 127); } -__m128bh test_mm_maskz_minmaxne_pbh(__mmask8 __A, __m128bh __B, __m128bh __C) { - // CHECK-LABEL: @test_mm_maskz_minmaxne_pbh( - // CHECK: call <8 x bfloat> @llvm.x86.avx10.vminmaxnepbf16128( +__m128bh test_mm_maskz_minmax_pbh(__mmask8 __A, __m128bh __B, __m128bh __C) { + // CHECK-LABEL: @test_mm_maskz_minmax_pbh( + // CHECK: call <8 x bfloat> @llvm.x86.avx10.vminmaxbf16128( // CHECK: zeroinitializer // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} - return _mm_maskz_minmaxne_pbh(__A, __B, __C, 127); + return _mm_maskz_minmax_pbh(__A, __B, __C, 127); } -__m256bh test_mm256_minmaxne_pbh(__m256bh __A, __m256bh __B) { - // CHECK-LABEL: @test_mm256_minmaxne_pbh( - // CHECK: call <16 x bfloat> @llvm.x86.avx10.vminmaxnepbf16256( - return _mm256_minmaxne_pbh(__A, __B, 127); +__m256bh test_mm256_minmax_pbh(__m256bh __A, __m256bh __B) { + // CHECK-LABEL: @test_mm256_minmax_pbh( + // CHECK: call <16 x bfloat> @llvm.x86.avx10.vminmaxbf16256( + return _mm256_minmax_pbh(__A, __B, 127); } -__m256bh test_mm256_mask_minmaxne_pbh(__m256bh __A, __mmask16 __B, __m256bh __C, __m256bh __D) { - // CHECK-LABEL: @test_mm256_mask_minmaxne_pbh( - // CHECK: call <16 x bfloat> @llvm.x86.avx10.vminmaxnepbf16256( +__m256bh test_mm256_mask_minmax_pbh(__m256bh __A, __mmask16 __B, __m256bh __C, __m256bh __D) { + // CHECK-LABEL: @test_mm256_mask_minmax_pbh( + // CHECK: call <16 x bfloat> @llvm.x86.avx10.vminmaxbf16256( // CHECK: select <16 x i1> 
%{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} - return _mm256_mask_minmaxne_pbh(__A, __B, __C, __D, 127); + return _mm256_mask_minmax_pbh(__A, __B, __C, __D, 127); } -__m256bh test_mm256_maskz_minmaxne_pbh(__mmask16 __A, __m256bh __B, __m256bh __C) { - // CHECK-LABEL: @test_mm256_maskz_minmaxne_pbh( - // CHECK: call <16 x bfloat> @llvm.x86.avx10.vminmaxnepbf16256( +__m256bh test_mm256_maskz_minmax_pbh(__mmask16 __A, __m256bh __B, __m256bh __C) { + // CHECK-LABEL: @test_mm256_maskz_minmax_pbh( + // CHECK: call <16 x bfloat> @llvm.x86.avx10.vminmaxbf16256( // CHECK: zeroinitializer // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} - return _mm256_maskz_minmaxne_pbh(__A, __B, __C, 127); + return _mm256_maskz_minmax_pbh(__A, __B, __C, 127); } __m128d test_mm_minmax_pd(__m128d __A, __m128d __B) { diff --git clang/test/CodeGen/X86/avx512fp16-abi.c clang/test/CodeGen/X86/avx512fp16-abi.c index 3d1b713a610a..1856f1e22715 100644 --- clang/test/CodeGen/X86/avx512fp16-abi.c +++ clang/test/CodeGen/X86/avx512fp16-abi.c @@ -206,6 +206,8 @@ struct fsd { struct fsd pr52011(void) { // CHECK: define{{.*}} { float, double } @ + struct fsd x; + return x; } struct hsd { @@ -216,6 +218,8 @@ struct hsd { struct hsd pr52011_2(void) { // CHECK: define{{.*}} { half, double } @ + struct hsd x; + return x; } struct hsf { @@ -226,6 +230,8 @@ struct hsf { struct hsf pr52011_3(void) { // CHECK: define{{.*}} <4 x half> @ + struct hsf x; + return x; } struct fds { @@ -237,4 +243,6 @@ struct fds { struct fds pr52011_4(void) { // CHECK-C: define{{.*}} { float, double } @pr52011_4 // CHECK-CPP: define{{.*}} void @_Z9pr52011_4v({{.*}} sret + struct fds x; + return x; } diff --git clang/test/CodeGen/X86/ms-x86-intrinsics.c clang/test/CodeGen/X86/ms-x86-intrinsics.c index 94a1b372974b..34cf690e6d5d 100644 --- clang/test/CodeGen/X86/ms-x86-intrinsics.c +++ clang/test/CodeGen/X86/ms-x86-intrinsics.c @@ -171,7 +171,7 @@ __int64 test_mul128(__int64 Multiplier, __int64 *HighProduct) { return _mul128(Multiplier, Multiplicand, HighProduct); } -// CHECK-X64-LABEL: define dso_local i64 @test_mul128(i64 noundef %Multiplier, i64 noundef %Multiplicand, ptr{{[a-z_ ]*}} initializes((0, 8)) %HighProduct) +// CHECK-X64-LABEL: define dso_local i64 @test_mul128(i64 noundef %Multiplier, i64 noundef %Multiplicand, ptr{{.*}} initializes((0, 8)) %HighProduct) // CHECK-X64: = sext i64 %Multiplier to i128 // CHECK-X64: = sext i64 %Multiplicand to i128 // CHECK-X64: = mul nsw i128 % @@ -183,7 +183,7 @@ unsigned __int64 test_umul128(unsigned __int64 Multiplier, unsigned __int64 *HighProduct) { return _umul128(Multiplier, Multiplicand, HighProduct); } -// CHECK-X64-LABEL: define dso_local i64 @test_umul128(i64 noundef %Multiplier, i64 noundef %Multiplicand, ptr{{[a-z_ ]*}} initializes((0, 8)) %HighProduct) +// CHECK-X64-LABEL: define dso_local i64 @test_umul128(i64 noundef %Multiplier, i64 noundef %Multiplicand, ptr{{.*}} initializes((0, 8)) %HighProduct) // CHECK-X64: = zext i64 %Multiplier to i128 // CHECK-X64: = zext i64 %Multiplicand to i128 // CHECK-X64: = mul nuw i128 % diff --git clang/test/CodeGen/X86/x86_64-atomic-128.c clang/test/CodeGen/X86/x86_64-atomic-128.c index f682ffc75f82..bf050788fa87 100644 --- clang/test/CodeGen/X86/x86_64-atomic-128.c +++ clang/test/CodeGen/X86/x86_64-atomic-128.c @@ -26,4 +26,5 @@ __int128 test_expression(_Atomic __int128 *addr) { // CHECK-LABEL: @test_expression // CHECK: atomicrmw and ptr {{.*}} seq_cst, align 16 *addr &= 1; + return 0; } diff --git 
clang/test/CodeGen/align-local.c clang/test/CodeGen/align-local.c index 9e8a56066e1a..a88dd1637a74 100644 --- clang/test/CodeGen/align-local.c +++ clang/test/CodeGen/align-local.c @@ -4,7 +4,7 @@ typedef struct __attribute((aligned(16))) {int x[4];} ff; // CHECK: alloca %struct.ff, align 16 // CHECK: alloca %struct.anon, align 16 -int a(void) { +void a(void) { ff a; struct {int x[4];} b __attribute((aligned(16))); } diff --git clang/test/CodeGen/arm-bf16-convert-intrinsics.c clang/test/CodeGen/arm-bf16-convert-intrinsics.c index 51aa5aa758f0..93f54c70c340 100644 --- clang/test/CodeGen/arm-bf16-convert-intrinsics.c +++ clang/test/CodeGen/arm-bf16-convert-intrinsics.c @@ -223,10 +223,8 @@ float32x4_t test_vcvtq_high_f32_bf16(bfloat16x8_t a) { // CHECK-A64-LABEL: @test_vcvt_bf16_f32( // CHECK-A64-NEXT: entry: // CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F321_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> [[A]]) -// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F322_I:%.*]] = bitcast <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]] to <16 x i8> -// CHECK-A64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]], <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> -// CHECK-A64-NEXT: ret <4 x bfloat> [[SHUFFLE_I]] +// CHECK-A64-NEXT: [[TMP1:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat> +// CHECK-A64-NEXT: ret <4 x bfloat> [[TMP1]] // // CHECK-A32-HARDFP-LABEL: @test_vcvt_bf16_f32( // CHECK-A32-HARDFP-NEXT: entry: @@ -263,9 +261,9 @@ bfloat16x4_t test_vcvt_bf16_f32(float32x4_t a) { // CHECK-A64-LABEL: @test_vcvtq_low_bf16_f32( // CHECK-A64-NEXT: entry: // CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F321_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> [[A]]) -// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F322_I:%.*]] = bitcast <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]] to <16 x i8> -// CHECK-A64-NEXT: ret <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]] +// CHECK-A64-NEXT: [[TMP1:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat> +// CHECK-A64-NEXT: [[TMP2:%.*]] = shufflevector <4 x bfloat> [[TMP1]], <4 x bfloat> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +// CHECK-A64-NEXT: ret <8 x bfloat> [[TMP2]] // // CHECK-A32-HARDFP-LABEL: @test_vcvtq_low_bf16_f32( // CHECK-A32-HARDFP-NEXT: entry: @@ -323,9 +321,10 @@ bfloat16x8_t test_vcvtq_low_bf16_f32(float32x4_t a) { // CHECK-A64-NEXT: entry: // CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[INACTIVE:%.*]] to <16 x i8> // CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-A64-NEXT: [[VCVTQ_HIGH_BF16_F322_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn2(<8 x bfloat> [[INACTIVE]], <4 x float> [[A]]) -// CHECK-A64-NEXT: [[VCVTQ_HIGH_BF16_F323_I:%.*]] = bitcast <8 x bfloat> [[VCVTQ_HIGH_BF16_F322_I]] to <16 x i8> -// CHECK-A64-NEXT: ret <8 x bfloat> [[VCVTQ_HIGH_BF16_F322_I]] +// CHECK-A64-NEXT: [[TMP2:%.*]] = shufflevector <8 x bfloat> [[INACTIVE]], <8 x bfloat> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +// CHECK-A64-NEXT: [[TMP3:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat> +// CHECK-A64-NEXT: [[TMP4:%.*]] = shufflevector <4 x bfloat> [[TMP2]], <4 x bfloat> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +// CHECK-A64-NEXT: ret <8 x bfloat> [[TMP4]] // // CHECK-A32-HARDFP-LABEL: 
@test_vcvtq_high_bf16_f32( // CHECK-A32-HARDFP-NEXT: entry: @@ -404,8 +403,8 @@ bfloat16x8_t test_vcvtq_high_bf16_f32(bfloat16x8_t inactive, float32x4_t a) { // CHECK-A64-LABEL: @test_vcvth_bf16_f32( // CHECK-A64-NEXT: entry: -// CHECK-A64-NEXT: [[VCVTH_BF16_F32_I:%.*]] = call bfloat @llvm.aarch64.neon.bfcvt(float [[A:%.*]]) -// CHECK-A64-NEXT: ret bfloat [[VCVTH_BF16_F32_I]] +// CHECK-A64-NEXT: [[TMP0:%.*]] = fptrunc float [[A:%.*]] to bfloat +// CHECK-A64-NEXT: ret bfloat [[TMP0]] // // CHECK-A32-HARDFP-LABEL: @test_vcvth_bf16_f32( // CHECK-A32-HARDFP-NEXT: entry: diff --git clang/test/CodeGen/arm-cmse-nonsecure.c clang/test/CodeGen/arm-cmse-nonsecure.c index 3310f0cbd0e9..42221ac775cd 100644 --- clang/test/CodeGen/arm-cmse-nonsecure.c +++ clang/test/CodeGen/arm-cmse-nonsecure.c @@ -41,8 +41,8 @@ typedef struct { int x, y, z; } Point; -void *test_pointed_object(void *p) { -// CHECK: define {{.*}} ptr @test_pointed_object +void test_pointed_object(void *p) { +// CHECK: define {{.*}} void @test_pointed_object Point *pt = (Point *)p; cmse_check_pointed_object(pt, CMSE_MPU_READ); // CHECK: call i32 @llvm.arm.cmse.tt diff --git clang/test/CodeGen/arm-cmse-secure.c clang/test/CodeGen/arm-cmse-secure.c index 132172b6c870..305d25ad7388 100644 --- clang/test/CodeGen/arm-cmse-secure.c +++ clang/test/CodeGen/arm-cmse-secure.c @@ -53,8 +53,8 @@ typedef struct { int x, y, z; } Point; -void *test_pointed_object(void *p) { -// CHECK: define {{.*}} ptr @test_pointed_object +void test_pointed_object(void *p) { +// CHECK: define {{.*}} void @test_pointed_object Point *pt = (Point *)p; cmse_check_pointed_object(pt, CMSE_NONSECURE | CMSE_MPU_READ diff --git clang/test/CodeGen/attr-counted-by-pr110385.c clang/test/CodeGen/attr-counted-by-pr110385.c index c2ff032334fe..f211610c3173 100644 --- clang/test/CodeGen/attr-counted-by-pr110385.c +++ clang/test/CodeGen/attr-counted-by-pr110385.c @@ -31,11 +31,11 @@ void init(void * __attribute__((pass_dynamic_object_size(0)))); // CHECK-NEXT: [[GROWABLE:%.*]] = getelementptr inbounds nuw i8, ptr [[FOO]], i64 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[GROWABLE]], align 8, !tbaa [[TBAA2:![0-9]+]] // CHECK-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 12 -// CHECK-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 8 -// CHECK-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4 -// CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[DOT_COUNTED_BY_LOAD]] to i64 +// CHECK-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 8 +// CHECK-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64 // CHECK-NEXT: [[TMP2:%.*]] = shl nsw i64 [[TMP1]], 1 -// CHECK-NEXT: [[TMP3:%.*]] = icmp sgt i32 [[DOT_COUNTED_BY_LOAD]], -1 +// CHECK-NEXT: [[TMP3:%.*]] = icmp sgt i32 [[COUNTED_BY_LOAD]], -1 // CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 [[TMP2]], i64 0 // CHECK-NEXT: tail call void @init(ptr noundef nonnull [[ARRAY]], i64 noundef [[TMP4]]) #[[ATTR2:[0-9]+]] // CHECK-NEXT: ret void @@ -48,11 +48,11 @@ void test1(struct bucket *foo) { // CHECK-SAME: ptr noundef [[FOO:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[FOO]], i64 16 -// CHECK-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[FOO]], i64 12 -// CHECK-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 
4 -// CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[DOT_COUNTED_BY_LOAD]] to i64 +// CHECK-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[FOO]], i64 12 +// CHECK-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64 // CHECK-NEXT: [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 1 -// CHECK-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[DOT_COUNTED_BY_LOAD]], -1 +// CHECK-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[COUNTED_BY_LOAD]], -1 // CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 [[TMP1]], i64 0 // CHECK-NEXT: tail call void @init(ptr noundef nonnull [[ARRAY]], i64 noundef [[TMP3]]) #[[ATTR2]] // CHECK-NEXT: ret void @@ -60,3 +60,12 @@ void test1(struct bucket *foo) { void test2(struct bucket2 *foo) { init(foo->growable.array); } +//. +// CHECK: [[TBAA2]] = !{[[META3:![0-9]+]], [[META7:![0-9]+]], i64 8} +// CHECK: [[META3]] = !{!"bucket", [[META4:![0-9]+]], i64 0, [[META7]], i64 8, [[META4]], i64 16} +// CHECK: [[META4]] = !{!"int", [[META5:![0-9]+]], i64 0} +// CHECK: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} +// CHECK: [[META6]] = !{!"Simple C/C++ TBAA"} +// CHECK: [[META7]] = !{!"p1 _ZTS8variable", [[META8:![0-9]+]], i64 0} +// CHECK: [[META8]] = !{!"any pointer", [[META5]], i64 0} +//. diff --git clang/test/CodeGen/attr-counted-by.c clang/test/CodeGen/attr-counted-by.c index 6b3cad570883..1066e2e74160 100644 --- clang/test/CodeGen/attr-counted-by.c +++ clang/test/CodeGen/attr-counted-by.c @@ -108,9 +108,9 @@ void test1(struct annotated *p, int index, int val) { // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test2( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITH-ATTR-NEXT: entry: -// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 -// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4 -// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOT_COUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 +// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[COUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[INDEX]], [[TMP0]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: @@ -119,7 +119,7 @@ void test1(struct annotated *p, int index, int val) { // SANITIZE-WITH-ATTR: cont3: // SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] -// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = tail call i32 @llvm.smax.i32(i32 [[DOT_COUNTED_BY_LOAD]], i32 0) +// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = tail call i32 @llvm.smax.i32(i32 [[COUNTED_BY_LOAD]], i32 0) // SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = shl i32 [[TMP2]], 2 // SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA4]] // SANITIZE-WITH-ATTR-NEXT: ret void @@ -127,9 +127,9 @@ void test1(struct annotated *p, int index, int val) { // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void 
@test2( // NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.smax.i32(i32 [[DOT_COUNTED_BY_LOAD]], i32 0) +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.smax.i32(i32 [[COUNTED_BY_LOAD]], i32 0) // NO-SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = shl i32 [[TMP0]], 2 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] @@ -159,22 +159,22 @@ void test2(struct annotated *p, size_t index) { // SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -8589934592, 8589934589) i64 @test2_bdos( // SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { // SANITIZE-WITH-ATTR-NEXT: entry: -// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 -// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4 -// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = sext i32 [[DOT_COUNTED_BY_LOAD]] to i64 +// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 +// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 2 -// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[DOT_COUNTED_BY_LOAD]], -1 +// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[COUNTED_BY_LOAD]], -1 // SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 [[TMP1]], i64 0 // SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP3]] // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -8589934592, 8589934589) i64 @test2_bdos( // NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = sext i32 [[DOT_COUNTED_BY_LOAD]] to i64 +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 2 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[DOT_COUNTED_BY_LOAD]], -1 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[COUNTED_BY_LOAD]], -1 // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 [[TMP1]], i64 0 // 
NO-SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP3]] // @@ -266,19 +266,19 @@ size_t test3_bdos(struct annotated *p) { // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test4( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]], i32 noundef [[FAM_IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITH-ATTR-NEXT: entry: -// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 -// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4 +// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 -// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOT_COUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[COUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT4:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB5:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont4: -// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[DOT_COUNTED_BY_LOAD]], 2 -// SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = shl i32 [[DOT_COUNTED_BY_LOAD]], 2 +// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[COUNTED_BY_LOAD]], 2 +// SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = shl i32 [[COUNTED_BY_LOAD]], 2 // SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 244 // SANITIZE-WITH-ATTR-NEXT: [[TMP5:%.*]] = and i32 [[TMP4]], 252 // SANITIZE-WITH-ATTR-NEXT: [[CONV1:%.*]] = select i1 [[TMP2]], i32 [[TMP5]], i32 0 @@ -292,16 +292,16 @@ size_t test3_bdos(struct annotated *p) { // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB6:[0-9]+]], i64 [[IDXPROM12]]) #[[ATTR8]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont19: -// SANITIZE-WITH-ATTR-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[DOT_COUNTED_BY_LOAD]], 3 +// SANITIZE-WITH-ATTR-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[COUNTED_BY_LOAD]], 3 // SANITIZE-WITH-ATTR-NEXT: [[TMP8:%.*]] = add i32 [[TMP3]], 240 // SANITIZE-WITH-ATTR-NEXT: [[TMP9:%.*]] = and i32 [[TMP8]], 252 // SANITIZE-WITH-ATTR-NEXT: [[CONV8:%.*]] = select i1 [[TMP7]], i32 [[TMP9]], i32 0 // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds nuw [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM12]] // SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV8]], ptr [[ARRAYIDX17]], align 4, !tbaa [[TBAA4]] -// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD21:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4 +// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD21:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[ADD27:%.*]] = add nsw i32 [[INDEX]], 2 // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM28:%.*]] = sext i32 [[ADD27]] to i64 -// SANITIZE-WITH-ATTR-NEXT: [[TMP10:%.*]] = zext i32 [[DOT_COUNTED_BY_LOAD21]] to 
i64, !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP10:%.*]] = zext i32 [[COUNTED_BY_LOAD21]] to i64, !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: [[TMP11:%.*]] = icmp ult i64 [[IDXPROM28]], [[TMP10]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP11]], label [[CONT35:%.*]], label [[HANDLER_OUT_OF_BOUNDS31:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds31: @@ -310,7 +310,7 @@ size_t test3_bdos(struct annotated *p) { // SANITIZE-WITH-ATTR: cont35: // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds nuw [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM28]] // SANITIZE-WITH-ATTR-NEXT: [[TMP12:%.*]] = icmp sgt i32 [[FAM_IDX]], -1 -// SANITIZE-WITH-ATTR-NEXT: [[TMP13:%.*]] = sext i32 [[DOT_COUNTED_BY_LOAD21]] to i64 +// SANITIZE-WITH-ATTR-NEXT: [[TMP13:%.*]] = sext i32 [[COUNTED_BY_LOAD21]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[TMP14:%.*]] = sext i32 [[FAM_IDX]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[TMP15:%.*]] = sub nsw i64 [[TMP13]], [[TMP14]] // SANITIZE-WITH-ATTR-NEXT: [[TMP16:%.*]] = icmp sgt i64 [[TMP15]], -1 @@ -325,29 +325,29 @@ size_t test3_bdos(struct annotated *p) { // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test4( // NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef [[P:%.*]], i32 noundef [[INDEX:%.*]], i32 noundef [[FAM_IDX:%.*]]) local_unnamed_addr #[[ATTR1]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = shl i32 [[DOT_COUNTED_BY_LOAD]], 2 +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = shl i32 [[COUNTED_BY_LOAD]], 2 // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 244 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[DOT_COUNTED_BY_LOAD]], 2 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[COUNTED_BY_LOAD]], 2 // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = and i32 [[TMP1]], 252 // NO-SANITIZE-WITH-ATTR-NEXT: [[CONV1:%.*]] = select i1 [[TMP2]], i32 [[TMP3]], i32 0 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM]] // NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV1]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD3:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = shl i32 [[DOT_COUNTED_BY_LOAD3]], 2 +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD3:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = shl i32 [[COUNTED_BY_LOAD3]], 2 // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], 240 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP6:%.*]] = icmp sgt i32 [[DOT_COUNTED_BY_LOAD3]], 3 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP6:%.*]] = icmp sgt i32 [[COUNTED_BY_LOAD3]], 3 // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP7:%.*]] = and i32 [[TMP5]], 252 // NO-SANITIZE-WITH-ATTR-NEXT: [[CONV5:%.*]] = select i1 [[TMP6]], i32 [[TMP7]], 
i32 0 // NO-SANITIZE-WITH-ATTR-NEXT: [[ADD:%.*]] = add nsw i32 [[INDEX]], 1 // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM7:%.*]] = sext i32 [[ADD]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM7]] // NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV5]], ptr [[ARRAYIDX8]], align 4, !tbaa [[TBAA2]] -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD10:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP8:%.*]] = sext i32 [[DOT_COUNTED_BY_LOAD10]] to i64 +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD10:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP8:%.*]] = sext i32 [[COUNTED_BY_LOAD10]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP9:%.*]] = sext i32 [[FAM_IDX]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP10:%.*]] = sub nsw i64 [[TMP8]], [[TMP9]] // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP11:%.*]] = icmp sgt i64 [[TMP10]], -1 @@ -407,9 +407,9 @@ void test4(struct annotated *p, int index, int fam_idx) { // SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -17179869180, 17179869181) i64 @test4_bdos( // SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR2]] { // SANITIZE-WITH-ATTR-NEXT: entry: -// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 -// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4 -// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = sext i32 [[DOT_COUNTED_BY_LOAD]] to i64 +// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 +// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = sext i32 [[INDEX]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = sub nsw i64 [[TMP0]], [[TMP1]] // SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = shl nsw i64 [[TMP2]], 2 @@ -422,9 +422,9 @@ void test4(struct annotated *p, int index, int fam_idx) { // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -17179869180, 17179869181) i64 @test4_bdos( // NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR2]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = sext i32 [[DOT_COUNTED_BY_LOAD]] to i64 +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = sext i32 [[INDEX]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = sub nsw i64 [[TMP0]], [[TMP1]] // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = shl nsw i64 [[TMP2]], 2 @@ -523,10 +523,10 @@ size_t test5_bdos(struct anon_struct *p) { // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test6( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITH-ATTR-NEXT: entry: -// 
SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 -// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i64, ptr [[DOT_COUNTED_BY_GEP]], align 4 +// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i64, ptr [[COUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 -// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp ugt i64 [[DOT_COUNTED_BY_LOAD]], [[IDXPROM]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp ugt i64 [[COUNTED_BY_LOAD]], [[IDXPROM]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB9:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META2]] @@ -534,7 +534,7 @@ size_t test5_bdos(struct anon_struct *p) { // SANITIZE-WITH-ATTR: cont3: // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[TMP1]], i64 0, i64 [[IDXPROM]] -// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.smax.i64(i64 [[DOT_COUNTED_BY_LOAD]], i64 0) +// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.smax.i64(i64 [[COUNTED_BY_LOAD]], i64 0) // SANITIZE-WITH-ATTR-NEXT: [[DOTTR:%.*]] = trunc i64 [[TMP2]] to i32 // SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = shl i32 [[DOTTR]], 2 // SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA4]] @@ -543,9 +543,9 @@ size_t test5_bdos(struct anon_struct *p) { // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test6( // NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i64, ptr [[DOT_COUNTED_BY_GEP]], align 4 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.smax.i64(i64 [[DOT_COUNTED_BY_LOAD]], i64 0) +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i64, ptr [[COUNTED_BY_GEP]], align 4 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.smax.i64(i64 [[COUNTED_BY_LOAD]], i64 0) // NO-SANITIZE-WITH-ATTR-NEXT: [[DOTTR:%.*]] = trunc i64 [[TMP0]] to i32 // NO-SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = shl i32 [[DOTTR]], 2 // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 @@ -579,18 +579,18 @@ void test6(struct anon_struct *p, int index) { // SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, -3) i64 @test6_bdos( // SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { // SANITIZE-WITH-ATTR-NEXT: entry: -// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 -// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i64, ptr [[DOT_COUNTED_BY_GEP]], align 4 -// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.smax.i64(i64 
[[DOT_COUNTED_BY_LOAD]], i64 0)
+// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
+// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i64, ptr [[COUNTED_BY_GEP]], align 4
+// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.smax.i64(i64 [[COUNTED_BY_LOAD]], i64 0)
// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2
// SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP1]]
//
// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, -3) i64 @test6_bdos(
// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2]] {
// NO-SANITIZE-WITH-ATTR-NEXT: entry:
-// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
-// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i64, ptr [[DOT_COUNTED_BY_GEP]], align 4
-// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.smax.i64(i64 [[DOT_COUNTED_BY_LOAD]], i64 0)
+// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
+// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i64, ptr [[COUNTED_BY_GEP]], align 4
+// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.smax.i64(i64 [[COUNTED_BY_LOAD]], i64 0)
// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2
// NO-SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP1]]
//
@@ -684,10 +684,10 @@ size_t test7_bdos(struct union_of_fams *p) {
// SANITIZE-WITH-ATTR-LABEL: define dso_local void @test8(
// SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] {
// SANITIZE-WITH-ATTR-NEXT: entry:
-// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
-// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i8, ptr [[DOT_COUNTED_BY_GEP]], align 4
+// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
+// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i8, ptr [[COUNTED_BY_GEP]], align 4
// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64
-// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i8 [[DOT_COUNTED_BY_LOAD]] to i64, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i8 [[COUNTED_BY_LOAD]] to i64, !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT7:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR: handler.out_of_bounds:
@@ -696,18 +696,18 @@ size_t test7_bdos(struct union_of_fams *p) {
// SANITIZE-WITH-ATTR: cont7:
// SANITIZE-WITH-ATTR-NEXT: [[INTS:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 9
// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [0 x i8], ptr [[INTS]], i64 0, i64 [[IDXPROM]]
-// SANITIZE-WITH-ATTR-NEXT: store i8 [[DOT_COUNTED_BY_LOAD]], ptr [[ARRAYIDX]], align 1, !tbaa [[TBAA8]]
+// SANITIZE-WITH-ATTR-NEXT: store i8 [[COUNTED_BY_LOAD]], ptr [[ARRAYIDX]], align 1, !tbaa [[TBAA8]]
// SANITIZE-WITH-ATTR-NEXT: ret void
//
// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test8(
// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] {
// NO-SANITIZE-WITH-ATTR-NEXT: entry:
-// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
-// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i8, ptr [[DOT_COUNTED_BY_GEP]], align 4
+// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
+// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i8, ptr [[COUNTED_BY_GEP]], align 4
// NO-SANITIZE-WITH-ATTR-NEXT: [[INTS:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 9
// NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64
// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i8], ptr [[INTS]], i64 0, i64 [[IDXPROM]]
-// NO-SANITIZE-WITH-ATTR-NEXT: store i8 [[DOT_COUNTED_BY_LOAD]], ptr [[ARRAYIDX]], align 1, !tbaa [[TBAA6]]
+// NO-SANITIZE-WITH-ATTR-NEXT: store i8 [[COUNTED_BY_LOAD]], ptr [[ARRAYIDX]], align 1, !tbaa [[TBAA6]]
// NO-SANITIZE-WITH-ATTR-NEXT: ret void
//
// SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test8(
@@ -735,17 +735,17 @@ void test8(struct union_of_fams *p, int index) {
// SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, 256) i64 @test8_bdos(
// SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2]] {
// SANITIZE-WITH-ATTR-NEXT: entry:
-// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
-// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i8, ptr [[DOT_COUNTED_BY_GEP]], align 4
-// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i8 [[DOT_COUNTED_BY_LOAD]] to i64
+// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
+// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i8, ptr [[COUNTED_BY_GEP]], align 4
+// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i8 [[COUNTED_BY_LOAD]] to i64
// SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP0]]
//
// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, 256) i64 @test8_bdos(
// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2]] {
// NO-SANITIZE-WITH-ATTR-NEXT: entry:
-// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
-// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i8, ptr [[DOT_COUNTED_BY_GEP]], align 4
-// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i8 [[DOT_COUNTED_BY_LOAD]] to i64
+// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
+// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i8, ptr [[COUNTED_BY_GEP]], align 4
+// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i8 [[COUNTED_BY_LOAD]] to i64
// NO-SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP0]]
//
// SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test8_bdos(
@@ -838,10 +838,10 @@ size_t test9_bdos(struct union_of_fams *p) {
// SANITIZE-WITH-ATTR-LABEL: define dso_local void @test10(
// SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] {
// SANITIZE-WITH-ATTR-NEXT: entry:
-// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
-// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4
+// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
+// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4
// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64
-// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOT_COUNTED_BY_LOAD]] to i64, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[COUNTED_BY_LOAD]] to i64, !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT7:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR: handler.out_of_bounds:
@@ -850,7 +850,7 @@ size_t test9_bdos(struct union_of_fams *p) {
// SANITIZE-WITH-ATTR: cont7:
// SANITIZE-WITH-ATTR-NEXT: [[BYTES:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12
// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [0 x i8], ptr [[BYTES]], i64 0, i64 [[IDXPROM]]
-// SANITIZE-WITH-ATTR-NEXT: [[NARROW:%.*]] = tail call i32 @llvm.smax.i32(i32 [[DOT_COUNTED_BY_LOAD]], i32 0)
+// SANITIZE-WITH-ATTR-NEXT: [[NARROW:%.*]] = tail call i32 @llvm.smax.i32(i32 [[COUNTED_BY_LOAD]], i32 0)
// SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = trunc i32 [[NARROW]] to i8
// SANITIZE-WITH-ATTR-NEXT: store i8 [[CONV]], ptr [[ARRAYIDX]], align 1, !tbaa [[TBAA8]]
// SANITIZE-WITH-ATTR-NEXT: ret void
//
// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test10(
// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] {
// NO-SANITIZE-WITH-ATTR-NEXT: entry:
-// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
-// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4
-// NO-SANITIZE-WITH-ATTR-NEXT: [[NARROW:%.*]] = tail call i32 @llvm.smax.i32(i32 [[DOT_COUNTED_BY_LOAD]], i32 0)
+// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
+// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4
+// NO-SANITIZE-WITH-ATTR-NEXT: [[NARROW:%.*]] = tail call i32 @llvm.smax.i32(i32 [[COUNTED_BY_LOAD]], i32 0)
// NO-SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = trunc i32 [[NARROW]] to i8
// NO-SANITIZE-WITH-ATTR-NEXT: [[BYTES:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12
// NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64
@@ -893,18 +893,18 @@ void test10(struct union_of_fams *p, int index) {
// SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, 2147483648) i64 @test10_bdos(
// SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2]] {
// SANITIZE-WITH-ATTR-NEXT: entry:
-// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
-// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4
-// SANITIZE-WITH-ATTR-NEXT: [[NARROW:%.*]] = tail call i32 @llvm.smax.i32(i32 [[DOT_COUNTED_BY_LOAD]], i32 0)
+// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
+// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4
+// SANITIZE-WITH-ATTR-NEXT: [[NARROW:%.*]] = tail call i32 @llvm.smax.i32(i32 [[COUNTED_BY_LOAD]], i32 0)
// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext nneg i32 [[NARROW]] to i64
// SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP0]]
//
// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, 2147483648) i64 @test10_bdos(
// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2]] {
// NO-SANITIZE-WITH-ATTR-NEXT: entry:
-// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
-// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4
-// NO-SANITIZE-WITH-ATTR-NEXT: [[NARROW:%.*]] = tail call i32 @llvm.smax.i32(i32 [[DOT_COUNTED_BY_LOAD]], i32 0)
+// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
+// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4
+// NO-SANITIZE-WITH-ATTR-NEXT: [[NARROW:%.*]] = tail call i32 @llvm.smax.i32(i32 [[COUNTED_BY_LOAD]], i32 0)
// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext nneg i32 [[NARROW]] to i64
// NO-SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP0]]
//
@@ -1715,10 +1715,10 @@ struct annotated_struct_array {
// SANITIZE-WITH-ATTR: cont3:
// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [10 x ptr], ptr [[ANN]], i64 0, i64 [[TMP1]]
// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA23:![0-9]+]]
-// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 8
-// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4
+// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 8
+// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4
// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM15:%.*]] = sext i32 [[IDX2]] to i64
-// SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = zext i32 [[DOT_COUNTED_BY_LOAD]] to i64, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = zext i32 [[COUNTED_BY_LOAD]] to i64, !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = icmp ult i64 [[IDXPROM15]], [[TMP3]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP4]], label [[CONT20:%.*]], label [[HANDLER_OUT_OF_BOUNDS16:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR: handler.out_of_bounds16:
@@ -1727,7 +1727,7 @@ struct annotated_struct_array {
// SANITIZE-WITH-ATTR: cont20:
// SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 12
// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds nuw [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM15]]
-// SANITIZE-WITH-ATTR-NEXT: [[TMP5:%.*]] = tail call i32 @llvm.smax.i32(i32 [[DOT_COUNTED_BY_LOAD]], i32 0)
+// SANITIZE-WITH-ATTR-NEXT: [[TMP5:%.*]] = tail call i32 @llvm.smax.i32(i32 [[COUNTED_BY_LOAD]], i32 0)
// SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = shl i32 [[TMP5]], 2
// SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX18]], align 4, !tbaa [[TBAA4]]
// SANITIZE-WITH-ATTR-NEXT: ret void
@@ -1738,9 +1738,9 @@ struct annotated_struct_array {
// NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[IDX1]] to i64
// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x ptr], ptr [[ANN]], i64 0, i64 [[IDXPROM]]
// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA20:![0-9]+]]
-// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 8
-// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4
-// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.smax.i32(i32 [[DOT_COUNTED_BY_LOAD]], i32 0)
+// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 8
+// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4
+// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.smax.i32(i32 [[COUNTED_BY_LOAD]], i32 0)
// NO-SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = shl i32 [[TMP1]], 2
// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 12
// NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM4:%.*]] = sext i32 [[IDX2]] to i64
diff --git clang/test/CodeGen/attr-noinline.cpp clang/test/CodeGen/attr-noinline.cpp
index c1fb9941b525..69916be31953 100644
--- clang/test/CodeGen/attr-noinline.cpp
+++ clang/test/CodeGen/attr-noinline.cpp
@@ -8,7 +8,7 @@ static int baz(int x) {
return x * 10;
}
-[[clang::noinline]] bool noi() { }
+[[clang::noinline]] bool noi() { return true; }
[[msvc::noinline]] bool ms_noi() { return true; }
void foo(int i) {
diff --git clang/test/CodeGen/attr-noreturn.c clang/test/CodeGen/attr-noreturn.c
index 93816b7570e8..c3f41d8424be 100644
--- clang/test/CodeGen/attr-noreturn.c
+++ clang/test/CodeGen/attr-noreturn.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -emit-llvm -std=c2x %s -o - | FileCheck %s
-// RUN: %clang_cc1 -triple %itanium_abi_triple -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK-CXX
+// RUN: %clang_cc1 -Wno-error=return-type -emit-llvm -std=c2x %s -o - | FileCheck %s
+// RUN: %clang_cc1 -Wno-error=return-type -triple %itanium_abi_triple -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK-CXX
typedef void (*fptrs_t[4])(void);
fptrs_t p __attribute__((noreturn));
diff --git clang/test/CodeGen/block-copy.c clang/test/CodeGen/block-copy.c
index 4679fe19b78f..69881264c932 100644
--- clang/test/CodeGen/block-copy.c
+++ clang/test/CodeGen/block-copy.c
@@ -6,7 +6,7 @@
void foo(float *);
-float bar(void) {
+void bar(void) {
float lookupTable[] = {-1,-1,-1,0, -1,-1,0,-1, -1,-1,0,1, -1,-1,1,0, -1,0,-1,-1, -1,0,-1,1, -1,0,1,-1, -1,0,1,1, -1,1,-1,0, -1,1,0,-1, -1,1,0,1, -1,1,1,0,
diff --git clang/test/CodeGen/builtin-memfns.c clang/test/CodeGen/builtin-memfns.c
index 581eb85eb28e..40e287c06517 100644
--- clang/test/CodeGen/builtin-memfns.c
+++ clang/test/CodeGen/builtin-memfns.c
@@ -56,7 +56,7 @@ int test6(char *X) {
// CHECK: @test7
// PR12094
-int test7(int *p) {
+void test7(int *p) {
struct snd_pcm_hw_params_t* hwparams; // incomplete type.
// CHECK: call void @llvm.memset{{.*}} align 4 {{.*}}256, i1 false)
diff --git clang/test/CodeGen/catch-undef-behavior.c clang/test/CodeGen/catch-undef-behavior.c
index 7580290b0b03..14cb7705c378 100644
--- clang/test/CodeGen/catch-undef-behavior.c
+++ clang/test/CodeGen/catch-undef-behavior.c
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -fsanitize=alignment,null,object-size,shift-base,shift-exponent,return,signed-integer-overflow,vla-bound,float-cast-overflow,integer-divide-by-zero,bool,returns-nonnull-attribute,nonnull-attribute -fsanitize-recover=alignment,null,object-size,shift-base,shift-exponent,signed-integer-overflow,vla-bound,float-cast-overflow,integer-divide-by-zero,bool,returns-nonnull-attribute,nonnull-attribute -emit-llvm %s -o - -triple x86_64-linux-gnu | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-UBSAN
-// RUN: %clang_cc1 -fsanitize-trap=alignment,null,object-size,shift-base,shift-exponent,return,signed-integer-overflow,vla-bound,float-cast-overflow,integer-divide-by-zero,bool,returns-nonnull-attribute,nonnull-attribute -fsanitize-recover=alignment,null,object-size,shift-base,shift-exponent,signed-integer-overflow,vla-bound,float-cast-overflow,integer-divide-by-zero,bool,returns-nonnull-attribute,nonnull-attribute -fsanitize=alignment,null,object-size,shift-base,shift-exponent,return,signed-integer-overflow,vla-bound,float-cast-overflow,integer-divide-by-zero,bool,returns-nonnull-attribute,nonnull-attribute -fsanitize-recover=alignment,null,object-size,shift-base,shift-exponent,signed-integer-overflow,vla-bound,float-cast-overflow,integer-divide-by-zero,bool,returns-nonnull-attribute,nonnull-attribute -emit-llvm %s -o - -triple x86_64-linux-gnu | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-TRAP
-// RUN: %clang_cc1 -fsanitize=signed-integer-overflow -emit-llvm %s -o - -triple x86_64-linux-gnu | FileCheck %s --check-prefix=CHECK-OVERFLOW
+// RUN: %clang_cc1 -Wno-error=return-type -fsanitize=alignment,null,object-size,shift-base,shift-exponent,return,signed-integer-overflow,vla-bound,float-cast-overflow,integer-divide-by-zero,bool,returns-nonnull-attribute,nonnull-attribute -fsanitize-recover=alignment,null,object-size,shift-base,shift-exponent,signed-integer-overflow,vla-bound,float-cast-overflow,integer-divide-by-zero,bool,returns-nonnull-attribute,nonnull-attribute -emit-llvm %s -o - -triple x86_64-linux-gnu | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-UBSAN
+// RUN: %clang_cc1 -Wno-error=return-type -fsanitize-trap=alignment,null,object-size,shift-base,shift-exponent,return,signed-integer-overflow,vla-bound,float-cast-overflow,integer-divide-by-zero,bool,returns-nonnull-attribute,nonnull-attribute -fsanitize-recover=alignment,null,object-size,shift-base,shift-exponent,signed-integer-overflow,vla-bound,float-cast-overflow,integer-divide-by-zero,bool,returns-nonnull-attribute,nonnull-attribute -fsanitize=alignment,null,object-size,shift-base,shift-exponent,return,signed-integer-overflow,vla-bound,float-cast-overflow,integer-divide-by-zero,bool,returns-nonnull-attribute,nonnull-attribute -fsanitize-recover=alignment,null,object-size,shift-base,shift-exponent,signed-integer-overflow,vla-bound,float-cast-overflow,integer-divide-by-zero,bool,returns-nonnull-attribute,nonnull-attribute -emit-llvm %s -o - -triple x86_64-linux-gnu | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-TRAP
+// RUN: %clang_cc1 -Wno-error=return-type -fsanitize=signed-integer-overflow -emit-llvm %s -o - -triple x86_64-linux-gnu | FileCheck %s --check-prefix=CHECK-OVERFLOW
// CHECK-UBSAN: @[[INT:.*]] = private unnamed_addr constant { i16, i16, [6 x i8] } { i16 0, i16 11, [6 x i8] c"'int'\00" }
diff --git clang/test/CodeGen/cfi-unrelated-cast.cpp clang/test/CodeGen/cfi-unrelated-cast.cpp
index abd67901f002..4095d67450d3 100644
--- clang/test/CodeGen/cfi-unrelated-cast.cpp
+++ clang/test/CodeGen/cfi-unrelated-cast.cpp
@@ -29,7 +29,7 @@ class C1 {
virtual void f() {}
};
-C1 *f1() {
+void f1() {
myalloc<C1> allocator;
(void)allocator.allocate(16);
(void)allocator.allocate(16, 0);
diff --git clang/test/CodeGen/const-label-addr.c clang/test/CodeGen/const-label-addr.c
index edfff0f8f639..8030f96cb8ae 100644
--- clang/test/CodeGen/const-label-addr.c
+++ clang/test/CodeGen/const-label-addr.c
@@ -2,7 +2,7 @@
// REQUIRES: asserts
// CHECK: @a.a = internal global ptr blockaddress(@a, %A)
-int a(void) {
+void a(void) {
A:;static void* a = &&A;
}
diff --git clang/test/CodeGen/debug-info-crash.c clang/test/CodeGen/debug-info-crash.c
index b53dbc50676d..e3a8f81d3508 100644
--- clang/test/CodeGen/debug-info-crash.c
+++ clang/test/CodeGen/debug-info-crash.c
@@ -6,7 +6,7 @@ __attribute__((visibility("default")))
extern struct dispatch_queue_s _dispatch_main_q;
typedef struct dispatch_item_s *dispatch_item_t;
typedef void (^dispatch_legacy_block_t)(dispatch_item_t);
-dispatch_item_t LEGACY_dispatch_call(dispatch_queue_t dq,
+void LEGACY_dispatch_call(dispatch_queue_t dq,
dispatch_legacy_block_t dispatch_block,
dispatch_legacy_block_t callback_block) {
dispatch_queue_t lq = _dispatch_queue_get_current() ?: (&_dispatch_main_q);
diff --git clang/test/CodeGen/debug-info.c clang/test/CodeGen/debug-info.c
index 4c444515757b..024e9579843d 100644
--- clang/test/CodeGen/debug-info.c
+++ clang/test/CodeGen/debug-info.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -triple x86_64-unk-unk -o - -emit-llvm -debug-info-kind=limited %s | FileCheck %s
-// RUN: %clang_cc1 -triple powerpc64-ibm-aix-xcoff -o - -emit-llvm -debug-info-kind=limited %s | FileCheck %s
+// RUN: %clang_cc1 -Wno-error=return-type -triple x86_64-unk-unk -o - -emit-llvm -debug-info-kind=limited %s | FileCheck %s
+// RUN: %clang_cc1 -Wno-error=return-type -triple powerpc64-ibm-aix-xcoff -o - -emit-llvm -debug-info-kind=limited %s | FileCheck %s
// PR3023
void convert(void) {
diff --git clang/test/CodeGen/empty-union-init.c clang/test/CodeGen/empty-union-init.c
index f1cdfb12f624..386aeb260fae 100644
--- clang/test/CodeGen/empty-union-init.c
+++ clang/test/CodeGen/empty-union-init.c
@@ -8,6 +8,7 @@ struct Mem {
struct Mem *columnMem(void){
static const struct Mem nullMem = { {} };
+  return 0;
}
diff --git clang/test/CodeGen/exceptions-seh.c clang/test/CodeGen/exceptions-seh.c
index a102afb1d271..25d622419b09 100644
--- clang/test/CodeGen/exceptions-seh.c
+++ clang/test/CodeGen/exceptions-seh.c
@@ -276,6 +276,7 @@ int exception_code_in_except(void) {
} __except(1) {
return _exception_code();
}
+  return 0;
}
// CHECK-LABEL: define dso_local i32 @exception_code_in_except()
diff --git clang/test/CodeGen/exprs.c clang/test/CodeGen/exprs.c
index 13a64f027106..5cca9722dcb3 100644
--- clang/test/CodeGen/exprs.c
+++ clang/test/CodeGen/exprs.c
@@ -7,7 +7,7 @@ int x=sizeof(zxcv);
int y=__alignof__(zxcv);
-void *test(int *i) {
+void test(int *i) {
short a = 1;
i += a;
i + a;
@@ -18,7 +18,7 @@ _Bool test2b;
int test2(void) { if (test2b); return 0; }
// PR1921
-int test3(void) {
+void test3(void) {
const unsigned char *bp;
bp -= (short)1;
}
diff --git clang/test/CodeGen/ext-int-cc.c clang/test/CodeGen/ext-int-cc.c
index 05b2bf1bec81..14efd54e24ff 100644
--- clang/test/CodeGen/ext-int-cc.c
+++ clang/test/CodeGen/ext-int-cc.c
@@ -162,7 +162,7 @@ void ParamPassing4(_BitInt(129) a) {}
// LA32-NOT: define{{.*}} void @ParamPassing4(ptr %{{.+}})
#endif
-_BitInt(63) ReturnPassing(void){}
+_BitInt(63) ReturnPassing(void) { return 0; }
// LIN64: define{{.*}} i64 @ReturnPassing(
// WIN64: define dso_local i63 @ReturnPassing(
// LIN32: define{{.*}} i63 @ReturnPassing(
@@ -193,7 +193,7 @@ _BitInt(63) ReturnPassing(void){}
// LA64: define{{.*}} signext i63 @ReturnPassing(
// LA32: define{{.*}} i63 @ReturnPassing(
-_BitInt(64) ReturnPassing2(void){}
+_BitInt(64) ReturnPassing2(void) { return 0; }
// LIN64: define{{.*}} i64 @ReturnPassing2(
// WIN64: define dso_local i64 @ReturnPassing2(
// LIN32: define{{.*}} i64 @ReturnPassing2(
@@ -224,7 +224,7 @@ _BitInt(64) ReturnPassing2(void){}
// LA64: define{{.*}} i64 @ReturnPassing2(
// LA32: define{{.*}} i64 @ReturnPassing2(
-_BitInt(127) ReturnPassing3(void){}
+_BitInt(127) ReturnPassing3(void) { return 0; }
// LIN64: define{{.*}} { i64, i64 } @ReturnPassing3(
// WIN64: define dso_local void @ReturnPassing3(ptr dead_on_unwind noalias writable sret
// LIN32: define{{.*}} void @ReturnPassing3(ptr dead_on_unwind noalias writable sret
@@ -257,7 +257,7 @@ _BitInt(127) ReturnPassing3(void){}
// LA64: define{{.*}} i127 @ReturnPassing3(
// LA32: define{{.*}} void @ReturnPassing3(ptr dead_on_unwind noalias writable sret
-_BitInt(128) ReturnPassing4(void){}
+_BitInt(128) ReturnPassing4(void) { return 0; }
// LIN64: define{{.*}} { i64, i64 } @ReturnPassing4(
// WIN64: define dso_local void @ReturnPassing4(ptr dead_on_unwind noalias writable sret
// LIN32: define{{.*}} void @ReturnPassing4(ptr dead_on_unwind noalias writable sret
@@ -289,7 +289,7 @@ _BitInt(128) ReturnPassing4(void){}
// LA32: define{{.*}} void @ReturnPassing4(ptr dead_on_unwind noalias writable sret
#if __BITINT_MAXWIDTH__ > 128
-_BitInt(129) ReturnPassing5(void){}
+_BitInt(129) ReturnPassing5(void) { return 0; }
// LIN64: define{{.*}} void @ReturnPassing5(ptr dead_on_unwind noalias writable sret
// WIN64: define dso_local void @ReturnPassing5(ptr dead_on_unwind noalias writable sret
// LIN32: define{{.*}} void @ReturnPassing5(ptr dead_on_unwind noalias writable sret
@@ -322,8 +322,8 @@ _BitInt(129) ReturnPassing5(void){}
// SparcV9 is odd in that it has a return-size limit of 256, not 128 or 64
// like other platforms, so test to make sure this behavior will still work.
-_BitInt(256) ReturnPassing6(void) {}
+_BitInt(256) ReturnPassing6(void) { return 0; }
// SPARCV9-NOT: define{{.*}} i256 @ReturnPassing6(
-_BitInt(257) ReturnPassing7(void) {}
+_BitInt(257) ReturnPassing7(void) { return 0; }
// SPARCV9-NOT: define{{.*}} void @ReturnPassing7(ptr dead_on_unwind noalias writable sret
#endif
diff --git clang/test/CodeGen/implicit-arg.c clang/test/CodeGen/implicit-arg.c
index 9959419f9b7c..3630fbbbdb1a 100644
--- clang/test/CodeGen/implicit-arg.c
+++ clang/test/CodeGen/implicit-arg.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 %s -emit-llvm -o -
-// RUN: %clang_cc1 %s -emit-llvm -O1 -o -
+// RUN: %clang_cc1 %s -Wno-error=return-type -emit-llvm -o -
+// RUN: %clang_cc1 %s -Wno-error=return-type -emit-llvm -O1 -o -
static int bar();
void foo() {
diff --git clang/test/CodeGen/mips-vector-return.c clang/test/CodeGen/mips-vector-return.c
index c2a9bbfe9db0..bd9d9391a5da 100644
--- clang/test/CodeGen/mips-vector-return.c
+++ clang/test/CodeGen/mips-vector-return.c
@@ -8,14 +8,14 @@ typedef float v4sf __attribute__ ((__vector_size__ (16)));
typedef double v4df __attribute__ ((__vector_size__ (32)));
typedef int v4i32 __attribute__ ((__vector_size__ (16)));
-// O32-LABEL: define{{.*}} void @test_v4sf(ptr dead_on_unwind noalias nocapture writable writeonly sret
+// O32-LABEL: define{{.*}} void @test_v4sf(ptr {{.*}} sret
// N64: define{{.*}} inreg { i64, i64 } @test_v4sf
v4sf test_v4sf(float a) {
return (v4sf){0.0f, a, 0.0f, 0.0f};
}
-// O32-LABEL: define{{.*}} void @test_v4df(ptr dead_on_unwind noalias nocapture writable writeonly sret
-// N64-LABEL: define{{.*}} void @test_v4df(ptr dead_on_unwind noalias nocapture writable writeonly sret
+// O32-LABEL: define{{.*}} void @test_v4df(ptr {{.*}} sret
+// N64-LABEL: define{{.*}} void @test_v4df(ptr {{.*}} sret
v4df test_v4df(double a) {
return (v4df){0.0, a, 0.0, 0.0};
}
diff --git clang/test/CodeGen/mips64-nontrivial-return.cpp clang/test/CodeGen/mips64-nontrivial-return.cpp
index a8fbf4622f80..a038574140bb 100644
--- clang/test/CodeGen/mips64-nontrivial-return.cpp
+++ clang/test/CodeGen/mips64-nontrivial-return.cpp
@@ -10,7 +10,7 @@ class D : public B {
extern D gd0;
-// CHECK: _Z4foo1v(ptr dead_on_unwind noalias nocapture writable writeonly sret
+// CHECK: _Z4foo1v(ptr {{.*}} sret
D foo1(void) {
return gd0;
diff --git clang/test/CodeGen/ms-intrinsics-other.c clang/test/CodeGen/ms-intrinsics-other.c
index fa8422e5bf19..013277cbf6a2 100644
--- clang/test/CodeGen/ms-intrinsics-other.c
+++ clang/test/CodeGen/ms-intrinsics-other.c
@@ -49,7 +49,7 @@ extern "C" {
unsigned char test_BitScanForward(unsigned LONG *Index, unsigned LONG Mask) {
return _BitScanForward(Index, Mask);
}
-// CHECK: define{{.*}}i8 @test_BitScanForward(ptr {{[a-z_ ]*}}%Index, i32 {{[a-z_ ]*}}%Mask){{.*}}{
+// CHECK: define{{.*}}i8 @test_BitScanForward(ptr {{.*}}%Index, i32 {{[a-z_ ]*}}%Mask){{.*}}{
// CHECK: [[ISNOTZERO:%[a-z0-9._]+]] = icmp eq i32 %Mask, 0
// CHECK: br i1 [[ISNOTZERO]], label %[[END_LABEL:[a-z0-9._]+]], label %[[ISNOTZERO_LABEL:[a-z0-9._]+]]
// CHECK: [[END_LABEL]]:
@@ -63,7 +63,7 @@ unsigned char test_BitScanForward(unsigned LONG *Index, unsigned LONG Mask) {
unsigned char test_BitScanReverse(unsigned LONG *Index, unsigned LONG Mask) {
return _BitScanReverse(Index, Mask);
}
-// CHECK: define{{.*}}i8 @test_BitScanReverse(ptr {{[a-z_ ]*}}%Index, i32 {{[a-z_ ]*}}%Mask){{.*}}{
+// CHECK: define{{.*}}i8 @test_BitScanReverse(ptr {{.*}}%Index, i32 {{[a-z_ ]*}}%Mask){{.*}}{
// CHECK: [[ISNOTZERO:%[0-9]+]] = icmp eq i32 %Mask, 0
// CHECK: br i1 [[ISNOTZERO]], label %[[END_LABEL:[a-z0-9._]+]], label %[[ISNOTZERO_LABEL:[a-z0-9._]+]]
// CHECK: [[END_LABEL]]:
@@ -79,7 +79,7 @@ unsigned char test_BitScanReverse(unsigned LONG *Index, unsigned LONG Mask) {
unsigned char test_BitScanForward64(unsigned LONG *Index, unsigned __int64 Mask) {
return _BitScanForward64(Index, Mask);
}
-// CHECK: define{{.*}}i8 @test_BitScanForward64(ptr {{[a-z_ ]*}}%Index, i64 {{[a-z_ ]*}}%Mask){{.*}}{
+// CHECK: define{{.*}}i8 @test_BitScanForward64(ptr {{.*}}%Index, i64 {{[a-z_ ]*}}%Mask){{.*}}{
// CHECK: [[ISNOTZERO:%[a-z0-9._]+]] = icmp eq i64 %Mask, 0
// CHECK: br i1 [[ISNOTZERO]], label %[[END_LABEL:[a-z0-9._]+]], label %[[ISNOTZERO_LABEL:[a-z0-9._]+]]
// CHECK: [[END_LABEL]]:
@@ -94,7 +94,7 @@ unsigned char test_BitScanForward64(unsigned LONG *Index, unsigned __int64 Mask)
unsigned char test_BitScanReverse64(unsigned LONG *Index, unsigned __int64 Mask) {
return _BitScanReverse64(Index, Mask);
}
-// CHECK: define{{.*}}i8 @test_BitScanReverse64(ptr {{[a-z_ ]*}}%Index, i64 {{[a-z_ ]*}}%Mask){{.*}}{
+// CHECK: define{{.*}}i8 @test_BitScanReverse64(ptr {{.*}}%Index, i64 {{[a-z_ ]*}}%Mask){{.*}}{
// CHECK: [[ISNOTZERO:%[0-9]+]] = icmp eq i64 %Mask, 0
// CHECK: br i1 [[ISNOTZERO]], label %[[END_LABEL:[a-z0-9._]+]], label %[[ISNOTZERO_LABEL:[a-z0-9._]+]]
// CHECK: [[END_LABEL]]:
@@ -111,7 +111,7 @@ unsigned char test_BitScanReverse64(unsigned LONG *Index, unsigned __int64 Mask)
LONG test_InterlockedExchange(LONG volatile *value, LONG mask) {
return _InterlockedExchange(value, mask);
}
-// CHECK: define{{.*}}i32 @test_InterlockedExchange(ptr{{[a-z_ ]*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK: define{{.*}}i32 @test_InterlockedExchange(ptr{{.*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
// CHECK: [[RESULT:%[0-9]+]] = atomicrmw xchg ptr %value, i32 %mask seq_cst, align 4
// CHECK: ret i32 [[RESULT:%[0-9]+]]
// CHECK: }
@@ -119,7 +119,7 @@ LONG test_InterlockedExchange(LONG volatile *value, LONG mask) {
LONG test_InterlockedExchangeAdd(LONG volatile *value, LONG mask) {
return _InterlockedExchangeAdd(value, mask);
}
-// CHECK: define{{.*}}i32 @test_InterlockedExchangeAdd(ptr{{[a-z_ ]*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK: define{{.*}}i32 @test_InterlockedExchangeAdd(ptr{{.*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
// CHECK: [[RESULT:%[0-9]+]] = atomicrmw add ptr %value, i32 %mask seq_cst, align 4
// CHECK: ret i32 [[RESULT:%[0-9]+]]
// CHECK: }
@@ -127,7 +127,7 @@ LONG test_InterlockedExchangeAdd(LONG volatile *value, LONG mask) {
LONG test_InterlockedExchangeSub(LONG volatile *value, LONG mask) {
return _InterlockedExchangeSub(value, mask);
}
-// CHECK: define{{.*}}i32 @test_InterlockedExchangeSub(ptr{{[a-z_ ]*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK: define{{.*}}i32 @test_InterlockedExchangeSub(ptr{{.*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
// CHECK: [[RESULT:%[0-9]+]] = atomicrmw sub ptr %value, i32 %mask seq_cst, align 4
// CHECK: ret i32 [[RESULT:%[0-9]+]]
// CHECK: }
@@ -135,7 +135,7 @@ LONG test_InterlockedExchangeSub(LONG volatile *value, LONG mask) {
LONG test_InterlockedOr(LONG volatile *value, LONG mask) {
return _InterlockedOr(value, mask);
}
-// CHECK: define{{.*}}i32 @test_InterlockedOr(ptr{{[a-z_ ]*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK: define{{.*}}i32 @test_InterlockedOr(ptr{{.*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
// CHECK: [[RESULT:%[0-9]+]] = atomicrmw or ptr %value, i32 %mask seq_cst, align 4
// CHECK: ret i32 [[RESULT:%[0-9]+]]
// CHECK: }
@@ -143,7 +143,7 @@ LONG test_InterlockedOr(LONG volatile *value, LONG mask) {
LONG test_InterlockedXor(LONG volatile *value, LONG mask) {
return _InterlockedXor(value, mask);
}
-// CHECK: define{{.*}}i32 @test_InterlockedXor(ptr{{[a-z_ ]*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK: define{{.*}}i32 @test_InterlockedXor(ptr{{.*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
// CHECK: [[RESULT:%[0-9]+]] = atomicrmw xor ptr %value, i32 %mask seq_cst, align 4
// CHECK: ret i32 [[RESULT:%[0-9]+]]
// CHECK: }
@@ -151,7 +151,7 @@ LONG test_InterlockedXor(LONG volatile *value, LONG mask) {
LONG test_InterlockedAnd(LONG volatile *value, LONG mask) {
return _InterlockedAnd(value, mask);
}
-// CHECK: define{{.*}}i32 @test_InterlockedAnd(ptr{{[a-z_ ]*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK: define{{.*}}i32 @test_InterlockedAnd(ptr{{.*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
// CHECK: [[RESULT:%[0-9]+]] = atomicrmw and ptr %value, i32 %mask seq_cst, align 4
// CHECK: ret i32 [[RESULT:%[0-9]+]]
// CHECK: }
@@ -168,7 +168,7 @@ LONG test_InterlockedCompareExchange(LONG volatile *Destination, LONG Exchange,
LONG test_InterlockedIncrement(LONG volatile *Addend) {
return _InterlockedIncrement(Addend);
}
-// CHECK: define{{.*}}i32 @test_InterlockedIncrement(ptr{{[a-z_ ]*}}%Addend){{.*}}{
+// CHECK: define{{.*}}i32 @test_InterlockedIncrement(ptr{{.*}}%Addend){{.*}}{
// CHECK: [[TMP:%[0-9]+]] = atomicrmw add ptr %Addend, i32 1 seq_cst, align 4
// CHECK: [[RESULT:%[0-9]+]] = add i32 [[TMP]], 1
// CHECK: ret i32 [[RESULT]]
@@ -177,7 +177,7 @@ LONG test_InterlockedIncrement(LONG volatile *Addend) {
LONG test_InterlockedDecrement(LONG volatile *Addend) {
return _InterlockedDecrement(Addend);
}
-// CHECK: define{{.*}}i32 @test_InterlockedDecrement(ptr{{[a-z_ ]*}}%Addend){{.*}}{
+// CHECK: define{{.*}}i32 @test_InterlockedDecrement(ptr{{.*}}%Addend){{.*}}{
// CHECK: [[TMP:%[0-9]+]] = atomicrmw sub ptr %Addend, i32 1 seq_cst, align 4
// CHECK: [[RESULT:%[0-9]+]] = add i32 [[TMP]], -1
// CHECK: ret i32 [[RESULT]]
@@ -236,7 +236,7 @@ LONG test_InterlockedAdd(LONG volatile *Addend, LONG Value) {
return _InterlockedAdd(Addend, Value);
}
-// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedAdd(ptr{{[a-z_ ]*}}%Addend, i32 noundef %Value) {{.*}} {
+// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedAdd(ptr{{.*}}%Addend, i32 noundef %Value) {{.*}} {
// CHECK-ARM-ARM64: %[[OLDVAL:[0-9]+]] = atomicrmw add ptr %Addend, i32 %Value seq_cst, align 4
// CHECK-ARM-ARM64: %[[NEWVAL:[0-9]+]] = add i32 %[[OLDVAL:[0-9]+]], %Value
// CHECK-ARM-ARM64: ret i32 %[[NEWVAL:[0-9]+]]
@@ -245,7 +245,7 @@ __int64 test_InterlockedAdd64(__int64 volatile *Addend, __int64 Value) {
return _InterlockedAdd64(Addend, Value);
}
-// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedAdd64(ptr{{[a-z_ ]*}}%Addend, i64 noundef %Value) {{.*}} {
+// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedAdd64(ptr{{.*}}%Addend, i64 noundef %Value) {{.*}} {
// CHECK-ARM-ARM64: %[[OLDVAL:[0-9]+]] = atomicrmw add ptr %Addend, i64 %Value seq_cst, align 8
// CHECK-ARM-ARM64: %[[NEWVAL:[0-9]+]] = add i64 %[[OLDVAL:[0-9]+]], %Value
// CHECK-ARM-ARM64: ret i64 %[[NEWVAL:[0-9]+]]
@@ -255,21 +255,21 @@ __int64 test_InterlockedAdd64(__int64 volatile *Addend, __int64 Value) {
LONG test_InterlockedExchangeAdd_acq(LONG volatile *value, LONG mask) {
return _InterlockedExchangeAdd_acq(value, mask);
}
-// CHECK-ARM: define{{.*}}i32 @test_InterlockedExchangeAdd_acq(ptr{{[a-z_ ]*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK-ARM: define{{.*}}i32 @test_InterlockedExchangeAdd_acq(ptr{{.*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
// CHECK-ARM: [[RESULT:%[0-9]+]] = atomicrmw add ptr %value, i32 %mask acquire, align 4
// CHECK-ARM: ret i32 [[RESULT:%[0-9]+]]
// CHECK-ARM: }
LONG test_InterlockedExchangeAdd_rel(LONG volatile *value, LONG mask) {
return _InterlockedExchangeAdd_rel(value, mask);
}
-// CHECK-ARM: define{{.*}}i32 @test_InterlockedExchangeAdd_rel(ptr{{[a-z_ ]*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK-ARM: define{{.*}}i32 @test_InterlockedExchangeAdd_rel(ptr{{.*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
// CHECK-ARM: [[RESULT:%[0-9]+]] = atomicrmw add ptr %value, i32 %mask release, align 4
// CHECK-ARM: ret i32 [[RESULT:%[0-9]+]]
// CHECK-ARM: }
LONG test_InterlockedExchangeAdd_nf(LONG volatile *value, LONG mask) {
return _InterlockedExchangeAdd_nf(value, mask);
}
-// CHECK-ARM: define{{.*}}i32 @test_InterlockedExchangeAdd_nf(ptr{{[a-z_ ]*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK-ARM: define{{.*}}i32 @test_InterlockedExchangeAdd_nf(ptr{{.*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
// CHECK-ARM: [[RESULT:%[0-9]+]] = atomicrmw add ptr %value, i32 %mask monotonic, align 4
// CHECK-ARM: ret i32 [[RESULT:%[0-9]+]]
// CHECK-ARM: }
@@ -277,21 +277,21 @@ LONG test_InterlockedExchangeAdd_nf(LONG volatile *value, LONG mask) {
LONG test_InterlockedExchange_acq(LONG volatile *value, LONG mask) {
return _InterlockedExchange_acq(value, mask);
}
-// CHECK-ARM: define{{.*}}i32 @test_InterlockedExchange_acq(ptr{{[a-z_ ]*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK-ARM: define{{.*}}i32 @test_InterlockedExchange_acq(ptr{{.*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
// CHECK-ARM: [[RESULT:%[0-9]+]] = atomicrmw xchg ptr %value, i32 %mask acquire, align 4
// CHECK-ARM: ret i32 [[RESULT:%[0-9]+]]
// CHECK-ARM: }
LONG test_InterlockedExchange_rel(LONG volatile *value, LONG mask) {
return _InterlockedExchange_rel(value, mask);
}
-// CHECK-ARM: define{{.*}}i32 @test_InterlockedExchange_rel(ptr{{[a-z_ ]*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK-ARM: define{{.*}}i32 @test_InterlockedExchange_rel(ptr{{.*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
// CHECK-ARM: [[RESULT:%[0-9]+]] = atomicrmw xchg ptr %value, i32 %mask release, align 4
// CHECK-ARM: ret i32 [[RESULT:%[0-9]+]]
// CHECK-ARM: }
LONG test_InterlockedExchange_nf(LONG volatile *value, LONG mask) {
return _InterlockedExchange_nf(value, mask);
}
-// CHECK-ARM: define{{.*}}i32 @test_InterlockedExchange_nf(ptr{{[a-z_ ]*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK-ARM: define{{.*}}i32 @test_InterlockedExchange_nf(ptr{{.*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
// CHECK-ARM: [[RESULT:%[0-9]+]] = atomicrmw xchg ptr %value, i32 %mask monotonic, align 4
// CHECK-ARM: ret i32 [[RESULT:%[0-9]+]]
// CHECK-ARM: }
@@ -299,7 +299,7 @@ LONG test_InterlockedExchange_nf(LONG volatile *value, LONG mask) {
LONG test_InterlockedCompareExchange_acq(LONG volatile *Destination, LONG Exchange, LONG Comperand) {
return _InterlockedCompareExchange_acq(Destination, Exchange, Comperand);
}
-// CHECK-ARM: define{{.*}}i32 @test_InterlockedCompareExchange_acq(ptr{{[a-z_ ]*}}%Destination, i32{{[a-z_ ]*}}%Exchange, i32{{[a-z_ ]*}}%Comperand){{.*}}{
+// CHECK-ARM: define{{.*}}i32 @test_InterlockedCompareExchange_acq(ptr{{.*}}%Destination, i32{{[a-z_ ]*}}%Exchange, i32{{[a-z_ ]*}}%Comperand){{.*}}{
// CHECK-ARM: [[TMP:%[0-9]+]] = cmpxchg volatile ptr %Destination, i32 %Comperand, i32 %Exchange acquire acquire, align 4
// CHECK-ARM: [[RESULT:%[0-9]+]] = extractvalue { i32, i1 } [[TMP]], 0
// CHECK-ARM: ret i32 [[RESULT]]
@@ -308,7 +308,7 @@ LONG test_InterlockedCompareExchange_acq(LONG volatile *Destination, LONG Exchan
LONG test_InterlockedCompareExchange_rel(LONG volatile *Destination, LONG Exchange, LONG Comperand) {
return _InterlockedCompareExchange_rel(Destination, Exchange, Comperand);
}
-// CHECK-ARM: define{{.*}}i32 @test_InterlockedCompareExchange_rel(ptr{{[a-z_ ]*}}%Destination, i32{{[a-z_ ]*}}%Exchange, i32{{[a-z_ ]*}}%Comperand){{.*}}{
+// CHECK-ARM: define{{.*}}i32 @test_InterlockedCompareExchange_rel(ptr{{.*}}%Destination, i32{{[a-z_ ]*}}%Exchange, i32{{[a-z_ ]*}}%Comperand){{.*}}{
// CHECK-ARM: [[TMP:%[0-9]+]] = cmpxchg volatile ptr %Destination, i32 %Comperand, i32 %Exchange release monotonic, align 4
// CHECK-ARM: [[RESULT:%[0-9]+]] = extractvalue { i32, i1 } [[TMP]], 0
// CHECK-ARM: ret i32 [[RESULT]]
@@ -317,7 +317,7 @@ LONG test_InterlockedCompareExchange_rel(LONG volatile *Destination, LONG Exchan
LONG test_InterlockedCompareExchange_nf(LONG volatile *Destination, LONG Exchange, LONG Comperand) {
return _InterlockedCompareExchange_nf(Destination, Exchange, Comperand);
}
-// CHECK-ARM: define{{.*}}i32 @test_InterlockedCompareExchange_nf(ptr{{[a-z_ ]*}}%Destination, i32{{[a-z_ ]*}}%Exchange, i32{{[a-z_ ]*}}%Comperand){{.*}}{
+// CHECK-ARM: define{{.*}}i32 @test_InterlockedCompareExchange_nf(ptr{{.*}}%Destination, i32{{[a-z_ ]*}}%Exchange, i32{{[a-z_ ]*}}%Comperand){{.*}}{
// CHECK-ARM: [[TMP:%[0-9]+]] = cmpxchg volatile ptr %Destination, i32 %Comperand, i32 %Exchange monotonic monotonic, align 4
// CHECK-ARM: [[RESULT:%[0-9]+]] = extractvalue { i32, i1 } [[TMP]], 0
// CHECK-ARM: ret i32 [[RESULT]]
@@ -326,7 +326,7 @@ LONG test_InterlockedCompareExchange_nf(LONG volatile *Destination, LONG Exchang
LONG test_InterlockedOr_acq(LONG volatile *value, LONG mask) {
return _InterlockedOr_acq(value, mask);
}
-// CHECK-ARM: define{{.*}}i32 @test_InterlockedOr_acq(ptr{{[a-z_ ]*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK-ARM: define{{.*}}i32 @test_InterlockedOr_acq(ptr{{.*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
// CHECK-ARM: [[RESULT:%[0-9]+]] = atomicrmw or ptr %value, i32 %mask acquire, align 4
// CHECK-ARM: ret i32 [[RESULT:%[0-9]+]]
// CHECK-ARM: }
@@ -334,7 +334,7 @@ LONG test_InterlockedOr_acq(LONG volatile *value, LONG mask) {
LONG test_InterlockedOr_rel(LONG volatile *value, LONG mask) {
return _InterlockedOr_rel(value, mask);
}
-// CHECK-ARM: define{{.*}}i32 @test_InterlockedOr_rel(ptr{{[a-z_ ]*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK-ARM: define{{.*}}i32 @test_InterlockedOr_rel(ptr{{.*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
// CHECK-ARM: [[RESULT:%[0-9]+]] = atomicrmw or ptr %value, i32 %mask release, align 4
// CHECK-ARM: ret i32 [[RESULT:%[0-9]+]]
// CHECK-ARM: }
@@ -342,7 +342,7 @@ LONG test_InterlockedOr_rel(LONG volatile *value, LONG mask) {
LONG test_InterlockedOr_nf(LONG volatile *value, LONG mask) {
return _InterlockedOr_nf(value, mask);
}
-// CHECK-ARM: define{{.*}}i32 @test_InterlockedOr_nf(ptr{{[a-z_ ]*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK-ARM: define{{.*}}i32 @test_InterlockedOr_nf(ptr{{.*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
// CHECK-ARM: [[RESULT:%[0-9]+]] = atomicrmw or ptr %value, i32 %mask monotonic, align 4
// CHECK-ARM: ret i32 [[RESULT:%[0-9]+]]
// CHECK-ARM: }
@@ -350,7 +350,7 @@ LONG test_InterlockedOr_nf(LONG volatile *value, LONG mask) {
LONG test_InterlockedXor_acq(LONG volatile *value, LONG mask) {
return _InterlockedXor_acq(value, mask);
}
-// CHECK-ARM: define{{.*}}i32 @test_InterlockedXor_acq(ptr{{[a-z_ ]*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK-ARM: define{{.*}}i32 @test_InterlockedXor_acq(ptr{{.*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
// CHECK-ARM: [[RESULT:%[0-9]+]] = atomicrmw xor ptr %value, i32 %mask acquire, align 4
// CHECK-ARM: ret i32 [[RESULT:%[0-9]+]]
// CHECK-ARM: }
@@ -358,7 +358,7 @@ LONG test_InterlockedXor_acq(LONG volatile *value, LONG mask) {
LONG test_InterlockedXor_rel(LONG volatile *value, LONG mask) {
return _InterlockedXor_rel(value, mask);
}
-// CHECK-ARM: define{{.*}}i32 @test_InterlockedXor_rel(ptr{{[a-z_ ]*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK-ARM: define{{.*}}i32 @test_InterlockedXor_rel(ptr{{.*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
// CHECK-ARM: [[RESULT:%[0-9]+]] = atomicrmw xor ptr %value, i32 %mask release, align 4
// CHECK-ARM: ret i32 [[RESULT:%[0-9]+]]
// CHECK-ARM: }
@@ -366,7 +366,7 @@ LONG test_InterlockedXor_rel(LONG volatile *value, LONG mask) {
LONG test_InterlockedXor_nf(LONG volatile *value, LONG mask) {
return _InterlockedXor_nf(value, mask);
}
-// CHECK-ARM: define{{.*}}i32 @test_InterlockedXor_nf(ptr{{[a-z_ ]*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK-ARM: define{{.*}}i32 @test_InterlockedXor_nf(ptr{{.*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
// CHECK-ARM: [[RESULT:%[0-9]+]] = atomicrmw xor ptr %value, i32 %mask monotonic, align 4
// CHECK-ARM: ret i32 [[RESULT:%[0-9]+]]
// CHECK-ARM: }
@@ -374,7 +374,7 @@ LONG test_InterlockedXor_nf(LONG volatile *value, LONG mask) {
LONG test_InterlockedAnd_acq(LONG volatile *value, LONG mask) {
return _InterlockedAnd_acq(value, mask);
}
-// CHECK-ARM: define{{.*}}i32 @test_InterlockedAnd_acq(ptr{{[a-z_ ]*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK-ARM: define{{.*}}i32 @test_InterlockedAnd_acq(ptr{{.*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
// CHECK-ARM: [[RESULT:%[0-9]+]] = atomicrmw and ptr %value, i32 %mask acquire, align 4
// CHECK-ARM: ret i32 [[RESULT:%[0-9]+]]
// CHECK-ARM: }
@@ -382,7 +382,7 @@ LONG test_InterlockedAnd_acq(LONG volatile *value, LONG mask) {
LONG test_InterlockedAnd_rel(LONG volatile *value, LONG mask) {
return _InterlockedAnd_rel(value, mask);
}
-// CHECK-ARM: define{{.*}}i32 @test_InterlockedAnd_rel(ptr{{[a-z_ ]*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK-ARM: define{{.*}}i32 @test_InterlockedAnd_rel(ptr{{.*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
// CHECK-ARM: [[RESULT:%[0-9]+]] = atomicrmw and ptr %value, i32 %mask release, align 4
// CHECK-ARM: ret i32 [[RESULT:%[0-9]+]]
// CHECK-ARM: }
@@ -390,7 +390,7 @@ LONG test_InterlockedAnd_rel(LONG volatile *value, LONG mask) {
LONG test_InterlockedAnd_nf(LONG volatile *value, LONG mask) {
return _InterlockedAnd_nf(value, mask);
}
-// CHECK-ARM: define{{.*}}i32 @test_InterlockedAnd_nf(ptr{{[a-z_ ]*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK-ARM: define{{.*}}i32 @test_InterlockedAnd_nf(ptr{{.*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
// CHECK-ARM: [[RESULT:%[0-9]+]] = atomicrmw and ptr %value, i32 %mask monotonic, align 4
// CHECK-ARM: ret i32 [[RESULT:%[0-9]+]]
// CHECK-ARM: }
@@ -399,7 +399,7 @@ LONG test_InterlockedAnd_nf(LONG volatile *value, LONG mask) {
LONG test_InterlockedIncrement_acq(LONG volatile *Addend) {
return _InterlockedIncrement_acq(Addend);
}
-// CHECK-ARM: define{{.*}}i32 @test_InterlockedIncrement_acq(ptr{{[a-z_ ]*}}%Addend){{.*}}{
+// CHECK-ARM: define{{.*}}i32 @test_InterlockedIncrement_acq(ptr{{.*}}%Addend){{.*}}{
// CHECK-ARM: [[TMP:%[0-9]+]] = atomicrmw add ptr %Addend, i32 1 acquire, align 4
// CHECK-ARM: [[RESULT:%[0-9]+]] = add i32 [[TMP]], 1
// CHECK-ARM: ret i32 [[RESULT]]
@@ -408,7 +408,7 @@ LONG test_InterlockedIncrement_acq(LONG volatile *Addend) {
LONG test_InterlockedIncrement_rel(LONG volatile *Addend) {
return _InterlockedIncrement_rel(Addend);
}
-// CHECK-ARM: define{{.*}}i32 @test_InterlockedIncrement_rel(ptr{{[a-z_ ]*}}%Addend){{.*}}{
+// CHECK-ARM: define{{.*}}i32 @test_InterlockedIncrement_rel(ptr{{.*}}%Addend){{.*}}{
// CHECK-ARM: [[TMP:%[0-9]+]] = atomicrmw add ptr %Addend, i32 1 release, align 4
// CHECK-ARM: [[RESULT:%[0-9]+]] = add i32 [[TMP]], 1
// CHECK-ARM: ret i32 [[RESULT]]
@@ -417,7 +417,7 @@ LONG test_InterlockedIncrement_rel(LONG volatile *Addend) {
LONG test_InterlockedIncrement_nf(LONG volatile *Addend) {
return _InterlockedIncrement_nf(Addend);
}
-// CHECK-ARM: define{{.*}}i32 @test_InterlockedIncrement_nf(ptr{{[a-z_ ]*}}%Addend){{.*}}{
+// CHECK-ARM: define{{.*}}i32 @test_InterlockedIncrement_nf(ptr{{.*}}%Addend){{.*}}{
// CHECK-ARM: [[TMP:%[0-9]+]] = atomicrmw add ptr %Addend, i32 1 monotonic, align 4
// CHECK-ARM: [[RESULT:%[0-9]+]] = add i32 [[TMP]], 1
// CHECK-ARM: ret i32 [[RESULT]]
@@ -426,7 +426,7 @@ LONG test_InterlockedIncrement_nf(LONG volatile *Addend) {
LONG test_InterlockedDecrement_acq(LONG volatile *Addend) {
return _InterlockedDecrement_acq(Addend);
}
-// CHECK-ARM: define{{.*}}i32 @test_InterlockedDecrement_acq(ptr{{[a-z_ ]*}}%Addend){{.*}}{
+// CHECK-ARM: define{{.*}}i32 @test_InterlockedDecrement_acq(ptr{{.*}}%Addend){{.*}}{
// CHECK-ARM: [[TMP:%[0-9]+]] = atomicrmw sub ptr %Addend, i32 1 acquire, align 4
// CHECK-ARM: [[RESULT:%[0-9]+]] = add i32 [[TMP]], -1
// CHECK-ARM: ret i32 [[RESULT]]
@@ -435,7 +435,7 @@ LONG test_InterlockedDecrement_acq(LONG volatile *Addend) {
LONG test_InterlockedDecrement_rel(LONG volatile *Addend) {
return _InterlockedDecrement_rel(Addend);
}
-// CHECK-ARM: define{{.*}}i32 @test_InterlockedDecrement_rel(ptr{{[a-z_ ]*}}%Addend){{.*}}{
+// CHECK-ARM: define{{.*}}i32 @test_InterlockedDecrement_rel(ptr{{.*}}%Addend){{.*}}{
// CHECK-ARM: [[TMP:%[0-9]+]] = atomicrmw sub ptr %Addend, i32 1 release, align 4
// CHECK-ARM: [[RESULT:%[0-9]+]] = add i32 [[TMP]], -1
// CHECK-ARM: ret i32 [[RESULT]]
@@ -444,7 +444,7 @@ LONG test_InterlockedDecrement_rel(LONG volatile *Addend) {
LONG test_InterlockedDecrement_nf(LONG volatile *Addend) {
return _InterlockedDecrement_nf(Addend);
}
-// CHECK-ARM: define{{.*}}i32 @test_InterlockedDecrement_nf(ptr{{[a-z_ ]*}}%Addend){{.*}}{
+// CHECK-ARM: define{{.*}}i32 @test_InterlockedDecrement_nf(ptr{{.*}}%Addend){{.*}}{
// CHECK-ARM: [[TMP:%[0-9]+]] = atomicrmw sub ptr %Addend, i32 1 monotonic, align 4
// CHECK-ARM: [[RESULT:%[0-9]+]] = add i32 [[TMP]], -1
// CHECK-ARM: ret i32 [[RESULT]]
diff --git clang/test/CodeGen/ms-intrinsics.c clang/test/CodeGen/ms-intrinsics.c
index bb1b95fa7521..b86662ee6778 100644
--- clang/test/CodeGen/ms-intrinsics.c
+++ clang/test/CodeGen/ms-intrinsics.c
@@ -149,7 +149,7 @@ void *test_AddressOfReturnAddress(void) {
unsigned char test_BitScanForward(unsigned long *Index, unsigned long Mask) {
return _BitScanForward(++Index, Mask);
}
-// CHECK: define{{.*}}i8 @test_BitScanForward(ptr {{[a-z_ ]*}}%Index, i32 {{[a-z_ ]*}}%Mask){{.*}}{
+// CHECK: define{{.*}}i8 @test_BitScanForward(ptr {{.*}}%Index, i32 {{[a-z_ ]*}}%Mask){{.*}}{
// CHECK: [[ISNOTZERO:%[a-z0-9._]+]] = icmp eq i32 %Mask, 0
// CHECK: br i1 [[ISNOTZERO]], label %[[END_LABEL:[a-z0-9._]+]], label %[[ISNOTZERO_LABEL:[a-z0-9._]+]]
// CHECK: [[END_LABEL]]:
@@ -164,7 +164,7 @@ unsigned char test_BitScanForward(unsigned long *Index, unsigned long Mask) {
unsigned char test_BitScanReverse(unsigned long *Index, unsigned long Mask) {
return _BitScanReverse(++Index, Mask);
}
-// CHECK: define{{.*}}i8 @test_BitScanReverse(ptr {{[a-z_ ]*}}%Index, i32 {{[a-z_ ]*}}%Mask){{.*}}{
+// CHECK: define{{.*}}i8 @test_BitScanReverse(ptr {{.*}}%Index, i32 {{[a-z_ ]*}}%Mask){{.*}}{
// CHECK: [[ISNOTZERO:%[0-9]+]] = icmp eq i32 %Mask, 0
// CHECK: br i1 [[ISNOTZERO]], label %[[END_LABEL:[a-z0-9._]+]], label %[[ISNOTZERO_LABEL:[a-z0-9._]+]]
// CHECK: [[END_LABEL]]:
@@ -181,7 +181,7 @@ unsigned char test_BitScanReverse(unsigned long *Index, unsigned long Mask) {
unsigned char test_BitScanForward64(unsigned long *Index, unsigned __int64 Mask) {
return _BitScanForward64(Index, Mask);
}
-// CHECK-ARM-X64: define{{.*}}i8 @test_BitScanForward64(ptr {{[a-z_ ]*}}%Index, i64 {{[a-z_ ]*}}%Mask){{.*}}{
+// CHECK-ARM-X64: define{{.*}}i8 @test_BitScanForward64(ptr {{.*}}%Index, i64 {{[a-z_ ]*}}%Mask){{.*}}{
// CHECK-ARM-X64: [[ISNOTZERO:%[a-z0-9._]+]] = icmp eq i64 %Mask, 0
// CHECK-ARM-X64: br i1 [[ISNOTZERO]], label %[[END_LABEL:[a-z0-9._]+]], label %[[ISNOTZERO_LABEL:[a-z0-9._]+]]
// CHECK-ARM-X64: [[END_LABEL]]:
@@ -196,7 +196,7 @@ unsigned char test_BitScanForward64(unsigned long *Index, unsigned __int64 Mask)
unsigned char test_BitScanReverse64(unsigned long *Index, unsigned __int64 Mask) {
return _BitScanReverse64(Index, Mask);
}
-// CHECK-ARM-X64: define{{.*}}i8 @test_BitScanReverse64(ptr {{[a-z_ ]*}}%Index, i64 {{[a-z_ ]*}}%Mask){{.*}}{
+// CHECK-ARM-X64: define{{.*}}i8 @test_BitScanReverse64(ptr {{.*}}%Index, i64 {{[a-z_ ]*}}%Mask){{.*}}{
// CHECK-ARM-X64: [[ISNOTZERO:%[0-9]+]] = icmp eq i64 %Mask, 0
// CHECK-ARM-X64: br i1 [[ISNOTZERO]], label %[[END_LABEL:[a-z0-9._]+]], label %[[ISNOTZERO_LABEL:[a-z0-9._]+]]
// CHECK-ARM-X64: [[END_LABEL]]:
@@ -214,7 +214,7 @@ void *test_InterlockedExchangePointer(void * volatile *Target, void *Value) {
return _InterlockedExchangePointer(Target, Value);
}
-// CHECK: define{{.*}}ptr @test_InterlockedExchangePointer(ptr {{[a-z_ ]*}}%Target, ptr {{[a-z_ ]*}}%Value){{.*}}{
+// CHECK: define{{.*}}ptr @test_InterlockedExchangePointer(ptr {{.*}}%Target, ptr {{[a-z_ ]*}}%Value){{.*}}{
// CHECK: %[[VALUE:[0-9]+]] = ptrtoint ptr %Value to [[iPTR:i[0-9]+]]
// CHECK: %[[EXCHANGE:[0-9]+]] = atomicrmw xchg ptr %Target, [[iPTR]] %[[VALUE]] seq_cst, align {{4|8}}
// CHECK: %[[RESULT:[0-9]+]] = inttoptr [[iPTR]] %[[EXCHANGE]] to ptr
@@ -226,7 +226,7 @@ void *test_InterlockedExchangePointer_acq(void * volatile *Target, void *Value)
return _InterlockedExchangePointer_acq(Target, Value);
}
-// CHECK-ARM-ARM64: define{{.*}}ptr @test_InterlockedExchangePointer_acq(ptr {{[a-z_ ]*}}%Target, ptr {{[a-z_ ]*}}%Value){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}ptr @test_InterlockedExchangePointer_acq(ptr {{.*}}%Target, ptr {{[a-z_ ]*}}%Value){{.*}}{
// CHECK-ARM-ARM64: %[[VALUE:[0-9]+]] = ptrtoint ptr %Value to [[iPTR:i[0-9]+]]
// CHECK-ARM-ARM64: %[[EXCHANGE:[0-9]+]] = atomicrmw xchg ptr %Target, [[iPTR]] %[[VALUE]] acquire, align {{4|8}}
// CHECK-ARM-ARM64: %[[RESULT:[0-9]+]] = inttoptr [[iPTR]] %[[EXCHANGE]] to ptr
@@ -237,7 +237,7 @@ void *test_InterlockedExchangePointer_nf(void * volatile *Target, void *Value) {
return _InterlockedExchangePointer_nf(Target, Value);
}
-// CHECK-ARM-ARM64: define{{.*}}ptr @test_InterlockedExchangePointer_nf(ptr {{[a-z_ ]*}}%Target, ptr {{[a-z_ ]*}}%Value){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}ptr @test_InterlockedExchangePointer_nf(ptr {{.*}}%Target, ptr {{[a-z_ ]*}}%Value){{.*}}{
// CHECK-ARM-ARM64: %[[VALUE:[0-9]+]] = ptrtoint ptr %Value to [[iPTR]]
// CHECK-ARM-ARM64: %[[EXCHANGE:[0-9]+]] = atomicrmw xchg ptr %Target, [[iPTR]] %[[VALUE]] monotonic, align {{4|8}}
// CHECK-ARM-ARM64: %[[RESULT:[0-9]+]] = inttoptr [[iPTR]] %[[EXCHANGE]] to ptr
@@ -248,7 +248,7 @@ void *test_InterlockedExchangePointer_rel(void * volatile *Target, void *Value)
return _InterlockedExchangePointer_rel(Target, Value);
}
-// CHECK-ARM-ARM64: define{{.*}}ptr @test_InterlockedExchangePointer_rel(ptr {{[a-z_ ]*}}%Target, ptr {{[a-z_ ]*}}%Value){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}ptr @test_InterlockedExchangePointer_rel(ptr {{.*}}%Target, ptr {{[a-z_ ]*}}%Value){{.*}}{
// CHECK-ARM-ARM64: %[[VALUE:[0-9]+]] = ptrtoint ptr %Value to [[iPTR]]
// CHECK-ARM-ARM64: %[[EXCHANGE:[0-9]+]] = atomicrmw xchg ptr %Target, [[iPTR]] %[[VALUE]] release, align {{4|8}}
// CHECK-ARM-ARM64: %[[RESULT:[0-9]+]] = inttoptr [[iPTR]] %[[EXCHANGE]] to ptr
@@ -261,7 +261,7 @@ void *test_InterlockedCompareExchangePointer(void * volatile *Destination,
return _InterlockedCompareExchangePointer(Destination, Exchange, Comparand);
}
-// CHECK: define{{.*}}ptr @test_InterlockedCompareExchangePointer(ptr {{[a-z_ ]*}}%Destination, ptr {{[a-z_ ]*}}%Exchange, ptr {{[a-z_ ]*}}%Comparand){{.*}}{
+// CHECK: define{{.*}}ptr @test_InterlockedCompareExchangePointer(ptr {{.*}}%Destination, ptr {{[a-z_ ]*}}%Exchange, ptr {{[a-z_ ]*}}%Comparand){{.*}}{
// CHECK: %[[EXCHANGE:[0-9]+]] = ptrtoint ptr %Exchange to [[iPTR]]
// CHECK: %[[COMPARAND:[0-9]+]] = ptrtoint ptr %Comparand to [[iPTR]]
// CHECK: %[[XCHG:[0-9]+]] = cmpxchg volatile ptr %[[DEST:.+]], [[iPTR]] %[[COMPARAND:[0-9]+]], [[iPTR]] %[[EXCHANGE:[0-9]+]] seq_cst seq_cst, align {{4|8}}
@@ -275,7 +275,7 @@ void *test_InterlockedCompareExchangePointer_nf(void * volatile *Destination,
return _InterlockedCompareExchangePointer_nf(Destination, Exchange, Comparand);
}
-// CHECK: define{{.*}}ptr @test_InterlockedCompareExchangePointer_nf(ptr {{[a-z_ ]*}}%Destination, ptr {{[a-z_ ]*}}%Exchange, ptr {{[a-z_ ]*}}%Comparand){{.*}}{
+// CHECK: define{{.*}}ptr @test_InterlockedCompareExchangePointer_nf(ptr {{.*}}%Destination, ptr {{[a-z_ ]*}}%Exchange, ptr {{[a-z_ ]*}}%Comparand){{.*}}{
// CHECK: %[[EXCHANGE:[0-9]+]] = ptrtoint ptr %Exchange to [[iPTR]]
// CHECK: %[[COMPARAND:[0-9]+]] = ptrtoint ptr %Comparand to [[iPTR]]
// CHECK: %[[XCHG:[0-9]+]] = cmpxchg volatile ptr %[[DEST:.+]], [[iPTR]] %[[COMPARAND:[0-9]+]], [[iPTR]] %[[EXCHANGE:[0-9]+]] monotonic monotonic, align {{4|8}}
@@ -290,7 +290,7 @@ void *test_InterlockedCompareExchangePointer_acq(void * volatile *Destination,
return _InterlockedCompareExchangePointer_acq(Destination, Exchange, Comparand);
}
-// CHECK-ARM-ARM64: define{{.*}}ptr @test_InterlockedCompareExchangePointer_acq(ptr {{[a-z_ ]*}}%Destination, ptr {{[a-z_ ]*}}%Exchange, ptr {{[a-z_ ]*}}%Comparand){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}ptr @test_InterlockedCompareExchangePointer_acq(ptr {{.*}}%Destination, ptr {{[a-z_ ]*}}%Exchange, ptr {{[a-z_ ]*}}%Comparand){{.*}}{
// CHECK-ARM-ARM64: %[[EXCHANGE:[0-9]+]] = ptrtoint ptr %Exchange to [[iPTR]]
// CHECK-ARM-ARM64: %[[COMPARAND:[0-9]+]] = ptrtoint ptr %Comparand to [[iPTR]]
// CHECK-ARM-ARM64: %[[XCHG:[0-9]+]] = cmpxchg volatile ptr %[[DEST:.+]], [[iPTR]] %[[COMPARAND:[0-9]+]], [[iPTR]] %[[EXCHANGE:[0-9]+]] acquire acquire, align {{4|8}}
@@ -305,7 +305,7 @@ void *test_InterlockedCompareExchangePointer_rel(void * volatile *Destination,
return _InterlockedCompareExchangePointer_rel(Destination, Exchange, Comparand);
}
-// CHECK-ARM-ARM64: define{{.*}}ptr @test_InterlockedCompareExchangePointer_rel(ptr {{[a-z_ ]*}}%Destination, ptr {{[a-z_ ]*}}%Exchange, ptr {{[a-z_ ]*}}%Comparand){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}ptr @test_InterlockedCompareExchangePointer_rel(ptr {{.*}}%Destination, ptr {{[a-z_ ]*}}%Exchange, ptr {{[a-z_ ]*}}%Comparand){{.*}}{
// CHECK-ARM-ARM64: %[[EXCHANGE:[0-9]+]] = ptrtoint ptr %Exchange to [[iPTR]]
// CHECK-ARM-ARM64: %[[COMPARAND:[0-9]+]] = ptrtoint ptr %Comparand to [[iPTR]]
// CHECK-ARM-ARM64: %[[XCHG:[0-9]+]] = cmpxchg volatile ptr %[[DEST:.+]], [[iPTR]] %[[COMPARAND:[0-9]+]], [[iPTR]] %[[EXCHANGE:[0-9]+]] release monotonic, align {{4|8}}
@@ -318,7 +318,7 @@ void *test_InterlockedCompareExchangePointer_rel(void * volatile *Destination,
char test_InterlockedExchange8(char volatile *value, char mask) {
return _InterlockedExchange8(value, mask);
}
-// CHECK: define{{.*}}i8 @test_InterlockedExchange8(ptr{{[a-z_ ]*}}%value, i8{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK: define{{.*}}i8 @test_InterlockedExchange8(ptr{{.*}}%value, i8{{[a-z_ ]*}}%mask){{.*}}{
// CHECK: [[RESULT:%[0-9]+]] = atomicrmw xchg ptr %value, i8 %mask seq_cst, align 1
// CHECK: ret i8 [[RESULT:%[0-9]+]]
// CHECK: }
@@ -326,7 +326,7 @@ char test_InterlockedExchange8(char volatile *value, char mask) {
short test_InterlockedExchange16(short volatile *value, short mask) {
return _InterlockedExchange16(value, mask);
}
-// CHECK: define{{.*}}i16 @test_InterlockedExchange16(ptr{{[a-z_ ]*}}%value, i16{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK: define{{.*}}i16 @test_InterlockedExchange16(ptr{{.*}}%value, i16{{[a-z_ ]*}}%mask){{.*}}{
// CHECK: [[RESULT:%[0-9]+]] = atomicrmw xchg ptr %value, i16 %mask seq_cst, align 2
// CHECK: ret i16 [[RESULT:%[0-9]+]]
// CHECK: }
@@ -334,7 +334,7 @@ short test_InterlockedExchange16(short volatile *value, short mask) {
long test_InterlockedExchange(long volatile *value, long mask) {
return _InterlockedExchange(value, mask);
}
-// CHECK: define{{.*}}i32 @test_InterlockedExchange(ptr{{[a-z_ ]*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK: define{{.*}}i32 @test_InterlockedExchange(ptr{{.*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
// CHECK: [[RESULT:%[0-9]+]] = atomicrmw xchg ptr %value, i32 %mask seq_cst, align 4
// CHECK: ret i32 [[RESULT:%[0-9]+]]
// CHECK: }
@@ -342,7 +342,7 @@ long test_InterlockedExchange(long volatile *value, long mask) {
char test_InterlockedExchangeAdd8(char volatile *value, char mask) {
return _InterlockedExchangeAdd8(value, mask);
}
-// CHECK: define{{.*}}i8 @test_InterlockedExchangeAdd8(ptr{{[a-z_ ]*}}%value, i8{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK: define{{.*}}i8 @test_InterlockedExchangeAdd8(ptr{{.*}}%value, i8{{[a-z_ ]*}}%mask){{.*}}{
// CHECK: [[RESULT:%[0-9]+]] = atomicrmw add ptr %value, i8 %mask seq_cst, align 1
// CHECK: ret i8 [[RESULT:%[0-9]+]]
// CHECK: }
@@ -350,7 +350,7 @@ char test_InterlockedExchangeAdd8(char volatile *value, char mask) {
short test_InterlockedExchangeAdd16(short volatile *value, short mask) {
return _InterlockedExchangeAdd16(value, mask);
}
-// CHECK: define{{.*}}i16 @test_InterlockedExchangeAdd16(ptr{{[a-z_ ]*}}%value, i16{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK: define{{.*}}i16 @test_InterlockedExchangeAdd16(ptr{{.*}}%value, i16{{[a-z_ ]*}}%mask){{.*}}{
// CHECK: [[RESULT:%[0-9]+]] = atomicrmw add ptr %value, i16 %mask seq_cst, align 2
// CHECK: ret i16 [[RESULT:%[0-9]+]]
// CHECK: }
@@ -358,7 +358,7 @@ short test_InterlockedExchangeAdd16(short volatile *value, short mask) {
long test_InterlockedExchangeAdd(long volatile *value, long mask) {
return _InterlockedExchangeAdd(value, mask);
}
-// CHECK: define{{.*}}i32 @test_InterlockedExchangeAdd(ptr{{[a-z_ ]*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK: define{{.*}}i32 @test_InterlockedExchangeAdd(ptr{{.*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
// CHECK: [[RESULT:%[0-9]+]] = atomicrmw add ptr %value, i32 %mask seq_cst, align 4
// CHECK: ret i32 [[RESULT:%[0-9]+]]
// CHECK: }
@@ -366,7 +366,7 @@ long test_InterlockedExchangeAdd(long volatile *value, long mask) {
char test_InterlockedExchangeSub8(char volatile *value, char mask) {
return _InterlockedExchangeSub8(value, mask);
}
-// CHECK: define{{.*}}i8 @test_InterlockedExchangeSub8(ptr{{[a-z_ ]*}}%value, i8{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK: define{{.*}}i8 @test_InterlockedExchangeSub8(ptr{{.*}}%value, i8{{[a-z_ ]*}}%mask){{.*}}{
// CHECK: [[RESULT:%[0-9]+]] = atomicrmw sub ptr %value, i8 %mask seq_cst, align 1
// CHECK: ret i8 [[RESULT:%[0-9]+]]
// CHECK: }
@@ -374,7 +374,7 @@ char test_InterlockedExchangeSub8(char volatile *value, char mask) {
short test_InterlockedExchangeSub16(short volatile *value, short mask) {
return _InterlockedExchangeSub16(value, mask);
}
-// CHECK: define{{.*}}i16 @test_InterlockedExchangeSub16(ptr{{[a-z_ ]*}}%value, i16{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK: define{{.*}}i16 @test_InterlockedExchangeSub16(ptr{{.*}}%value, i16{{[a-z_ ]*}}%mask){{.*}}{
// CHECK: [[RESULT:%[0-9]+]] = atomicrmw sub ptr %value, i16 %mask seq_cst, align 2
// CHECK: ret i16 [[RESULT:%[0-9]+]]
// CHECK: }
@@ -382,7 +382,7 @@ short test_InterlockedExchangeSub16(short volatile *value, short mask) {
long test_InterlockedExchangeSub(long volatile *value, long mask) {
return _InterlockedExchangeSub(value, mask);
}
-// CHECK: define{{.*}}i32 @test_InterlockedExchangeSub(ptr{{[a-z_ ]*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK: define{{.*}}i32 @test_InterlockedExchangeSub(ptr{{.*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
// CHECK: [[RESULT:%[0-9]+]] = atomicrmw sub ptr %value, i32 %mask seq_cst, align 4
// CHECK: ret i32 [[RESULT:%[0-9]+]]
// CHECK: }
@@ -390,7 +390,7 @@ long test_InterlockedExchangeSub(long volatile *value, long mask) {
char test_InterlockedOr8(char volatile *value, char mask) {
return _InterlockedOr8(value, mask);
}
-// CHECK: define{{.*}}i8 @test_InterlockedOr8(ptr{{[a-z_ ]*}}%value, i8{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK: define{{.*}}i8 @test_InterlockedOr8(ptr{{.*}}%value, i8{{[a-z_ ]*}}%mask){{.*}}{
// CHECK: [[RESULT:%[0-9]+]] = atomicrmw or ptr %value, i8 %mask seq_cst, align 1
// CHECK: ret i8 [[RESULT:%[0-9]+]]
// CHECK: }
@@ -398,7 +398,7 @@ char test_InterlockedOr8(char volatile *value, char mask) {
short test_InterlockedOr16(short volatile *value, short mask) {
return _InterlockedOr16(value, mask);
}
-// CHECK: define{{.*}}i16 @test_InterlockedOr16(ptr{{[a-z_ ]*}}%value, i16{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK: define{{.*}}i16 @test_InterlockedOr16(ptr{{.*}}%value, i16{{[a-z_ ]*}}%mask){{.*}}{
// CHECK: [[RESULT:%[0-9]+]] = atomicrmw or ptr %value, i16 %mask seq_cst, align 2
// CHECK: ret i16 [[RESULT:%[0-9]+]]
// CHECK: }
@@ -406,7 +406,7 @@ short test_InterlockedOr16(short volatile *value, short mask) {
long test_InterlockedOr(long volatile *value, long mask) {
return _InterlockedOr(value, mask);
}
-// CHECK: define{{.*}}i32 @test_InterlockedOr(ptr{{[a-z_ ]*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK: define{{.*}}i32 @test_InterlockedOr(ptr{{.*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
// CHECK: [[RESULT:%[0-9]+]] = atomicrmw or ptr %value, i32 %mask seq_cst, align 4
// CHECK: ret i32 [[RESULT:%[0-9]+]]
// CHECK: }
@@ -414,7 +414,7 @@ long test_InterlockedOr(long volatile *value, long mask) {
char test_InterlockedXor8(char volatile *value, char mask) {
return _InterlockedXor8(value, mask);
}
-// CHECK: define{{.*}}i8 @test_InterlockedXor8(ptr{{[a-z_ ]*}}%value, i8{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK: define{{.*}}i8 @test_InterlockedXor8(ptr{{.*}}%value, i8{{[a-z_ ]*}}%mask){{.*}}{
// CHECK: [[RESULT:%[0-9]+]] = atomicrmw xor ptr %value, i8 %mask seq_cst, align 1
// CHECK: ret i8 [[RESULT:%[0-9]+]]
// CHECK: }
@@ -422,7 +422,7 @@ char test_InterlockedXor8(char volatile *value, char mask) {
short test_InterlockedXor16(short volatile *value, short mask) {
return _InterlockedXor16(value, mask);
}
-// CHECK: define{{.*}}i16 @test_InterlockedXor16(ptr{{[a-z_ ]*}}%value, i16{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK: define{{.*}}i16 @test_InterlockedXor16(ptr{{.*}}%value, i16{{[a-z_ ]*}}%mask){{.*}}{
// CHECK: [[RESULT:%[0-9]+]] = atomicrmw xor ptr %value, i16 %mask seq_cst, align 2
// CHECK: ret i16 [[RESULT:%[0-9]+]]
// CHECK: }
@@ -430,7 +430,7 @@ short test_InterlockedXor16(short volatile *value, short mask) {
long test_InterlockedXor(long volatile *value, long mask) {
return _InterlockedXor(value, mask);
}
-// CHECK: define{{.*}}i32 @test_InterlockedXor(ptr{{[a-z_ ]*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK: define{{.*}}i32 @test_InterlockedXor(ptr{{.*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
// CHECK: [[RESULT:%[0-9]+]] = atomicrmw xor ptr %value, i32 %mask seq_cst, align 4
// CHECK: ret i32 [[RESULT:%[0-9]+]]
// CHECK: }
@@ -438,7 +438,7 @@ long test_InterlockedXor(long volatile *value, long mask) {
char test_InterlockedAnd8(char volatile *value, char mask) {
return _InterlockedAnd8(value, mask);
}
-// CHECK: define{{.*}}i8 @test_InterlockedAnd8(ptr{{[a-z_ ]*}}%value, i8{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK: define{{.*}}i8 @test_InterlockedAnd8(ptr{{.*}}%value, i8{{[a-z_ ]*}}%mask){{.*}}{
// CHECK: [[RESULT:%[0-9]+]] = atomicrmw and ptr %value, i8 %mask seq_cst, align 1
// CHECK: ret i8 [[RESULT:%[0-9]+]]
// CHECK: }
@@ -446,7 +446,7 @@ char test_InterlockedAnd8(char volatile *value, char mask) {
short test_InterlockedAnd16(short volatile *value, short mask) {
return _InterlockedAnd16(value, mask);
}
-// CHECK: define{{.*}}i16 @test_InterlockedAnd16(ptr{{[a-z_ ]*}}%value, i16{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK: define{{.*}}i16 @test_InterlockedAnd16(ptr{{.*}}%value, i16{{[a-z_ ]*}}%mask){{.*}}{
// CHECK: [[RESULT:%[0-9]+]] = atomicrmw and ptr %value, i16 %mask seq_cst, align 2
// CHECK: ret i16 [[RESULT:%[0-9]+]]
// CHECK: }
@@ -454,7 +454,7 @@ short test_InterlockedAnd16(short volatile *value, short mask) {
long test_InterlockedAnd(long volatile *value, long mask) {
return _InterlockedAnd(value, mask);
}
-// CHECK: define{{.*}}i32 @test_InterlockedAnd(ptr{{[a-z_ ]*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK: define{{.*}}i32 @test_InterlockedAnd(ptr{{.*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
// CHECK: [[RESULT:%[0-9]+]] = atomicrmw and ptr %value, i32 %mask seq_cst, align 4
// CHECK: ret i32 [[RESULT:%[0-9]+]]
// CHECK: }
@@ -462,7 +462,7 @@ long test_InterlockedAnd(long volatile *value, long mask) {
char test_InterlockedCompareExchange8(char volatile *Destination, char Exchange, char Comperand) {
return _InterlockedCompareExchange8(Destination, Exchange, Comperand);
}
-// CHECK: define{{.*}}i8 @test_InterlockedCompareExchange8(ptr{{[a-z_ ]*}}%Destination, i8{{[a-z_ ]*}}%Exchange, i8{{[a-z_ ]*}}%Comperand){{.*}}{
+// CHECK: define{{.*}}i8 @test_InterlockedCompareExchange8(ptr{{.*}}%Destination, i8{{[a-z_ ]*}}%Exchange, i8{{[a-z_ ]*}}%Comperand){{.*}}{
// CHECK: [[TMP:%[0-9]+]] = cmpxchg volatile ptr %Destination, i8 %Comperand, i8 %Exchange
seq_cst seq_cst, align 1 // CHECK: [[RESULT:%[0-9]+]] = extractvalue { i8, i1 } [[TMP]], 0 // CHECK: ret i8 [[RESULT]] @@ -471,7 +471,7 @@ char test_InterlockedCompareExchange8(char volatile *Destination, char Exchange, short test_InterlockedCompareExchange16(short volatile *Destination, short Exchange, short Comperand) { return _InterlockedCompareExchange16(Destination, Exchange, Comperand); } -// CHECK: define{{.*}}i16 @test_InterlockedCompareExchange16(ptr{{[a-z_ ]*}}%Destination, i16{{[a-z_ ]*}}%Exchange, i16{{[a-z_ ]*}}%Comperand){{.*}}{ +// CHECK: define{{.*}}i16 @test_InterlockedCompareExchange16(ptr{{.*}}%Destination, i16{{[a-z_ ]*}}%Exchange, i16{{[a-z_ ]*}}%Comperand){{.*}}{ // CHECK: [[TMP:%[0-9]+]] = cmpxchg volatile ptr %Destination, i16 %Comperand, i16 %Exchange seq_cst seq_cst, align 2 // CHECK: [[RESULT:%[0-9]+]] = extractvalue { i16, i1 } [[TMP]], 0 // CHECK: ret i16 [[RESULT]] @@ -480,7 +480,7 @@ short test_InterlockedCompareExchange16(short volatile *Destination, short Excha long test_InterlockedCompareExchange(long volatile *Destination, long Exchange, long Comperand) { return _InterlockedCompareExchange(Destination, Exchange, Comperand); } -// CHECK: define{{.*}}i32 @test_InterlockedCompareExchange(ptr{{[a-z_ ]*}}%Destination, i32{{[a-z_ ]*}}%Exchange, i32{{[a-z_ ]*}}%Comperand){{.*}}{ +// CHECK: define{{.*}}i32 @test_InterlockedCompareExchange(ptr{{.*}}%Destination, i32{{[a-z_ ]*}}%Exchange, i32{{[a-z_ ]*}}%Comperand){{.*}}{ // CHECK: [[TMP:%[0-9]+]] = cmpxchg volatile ptr %Destination, i32 %Comperand, i32 %Exchange seq_cst seq_cst, align 4 // CHECK: [[RESULT:%[0-9]+]] = extractvalue { i32, i1 } [[TMP]], 0 // CHECK: ret i32 [[RESULT]] @@ -489,7 +489,7 @@ long test_InterlockedCompareExchange(long volatile *Destination, long Exchange, __int64 test_InterlockedCompareExchange64(__int64 volatile *Destination, __int64 Exchange, __int64 Comperand) { return _InterlockedCompareExchange64(Destination, Exchange, Comperand); } -// CHECK: define{{.*}}i64 @test_InterlockedCompareExchange64(ptr{{[a-z_ ]*}}%Destination, i64{{[a-z_ ]*}}%Exchange, i64{{[a-z_ ]*}}%Comperand){{.*}}{ +// CHECK: define{{.*}}i64 @test_InterlockedCompareExchange64(ptr{{.*}}%Destination, i64{{[a-z_ ]*}}%Exchange, i64{{[a-z_ ]*}}%Comperand){{.*}}{ // CHECK: [[TMP:%[0-9]+]] = cmpxchg volatile ptr %Destination, i64 %Comperand, i64 %Exchange seq_cst seq_cst, align 8 // CHECK: [[RESULT:%[0-9]+]] = extractvalue { i64, i1 } [[TMP]], 0 // CHECK: ret i64 [[RESULT]] @@ -502,7 +502,7 @@ unsigned char test_InterlockedCompareExchange128( return _InterlockedCompareExchange128(++Destination, ++ExchangeHigh, ++ExchangeLow, ++ComparandResult); } -// CHECK-64: define{{.*}}i8 @test_InterlockedCompareExchange128(ptr{{[a-z_ ]*}}%Destination, i64{{[a-z_ ]*}}%ExchangeHigh, i64{{[a-z_ ]*}}%ExchangeLow, ptr{{[a-z_ ]*}}%ComparandResult){{.*}}{ +// CHECK-64: define{{.*}}i8 @test_InterlockedCompareExchange128(ptr{{.*}}%Destination, i64{{[a-z_ ]*}}%ExchangeHigh, i64{{[a-z_ ]*}}%ExchangeLow, ptr{{.*}}%ComparandResult){{.*}}{ // CHECK-64: %incdec.ptr = getelementptr inbounds nuw i8, ptr %Destination, i64 8 // CHECK-64: %inc = add nsw i64 %ExchangeHigh, 1 // CHECK-64: %inc1 = add nsw i64 %ExchangeLow, 1 @@ -551,7 +551,7 @@ unsigned char test_InterlockedCompareExchange128_rel( short test_InterlockedIncrement16(short volatile *Addend) { return _InterlockedIncrement16(++Addend); } -// CHECK: define{{.*}}i16 @test_InterlockedIncrement16(ptr{{[a-z_ ]*}}%Addend){{.*}}{ +// CHECK: define{{.*}}i16 
@test_InterlockedIncrement16(ptr{{.*}}%Addend){{.*}}{ // CHECK: %incdec.ptr = getelementptr inbounds nuw i8, ptr %Addend, {{i64|i32}} 2 // CHECK: [[TMP:%[0-9]+]] = atomicrmw add ptr %incdec.ptr, i16 1 seq_cst, align 2 // CHECK: [[RESULT:%[0-9]+]] = add i16 [[TMP]], 1 @@ -561,7 +561,7 @@ short test_InterlockedIncrement16(short volatile *Addend) { long test_InterlockedIncrement(long volatile *Addend) { return _InterlockedIncrement(++Addend); } -// CHECK: define{{.*}}i32 @test_InterlockedIncrement(ptr{{[a-z_ ]*}}%Addend){{.*}}{ +// CHECK: define{{.*}}i32 @test_InterlockedIncrement(ptr{{.*}}%Addend){{.*}}{ // CHECK: %incdec.ptr = getelementptr inbounds nuw i8, ptr %Addend, {{i64|i32}} 4 // CHECK: [[TMP:%[0-9]+]] = atomicrmw add ptr %incdec.ptr, i32 1 seq_cst, align 4 // CHECK: [[RESULT:%[0-9]+]] = add i32 [[TMP]], 1 @@ -571,7 +571,7 @@ long test_InterlockedIncrement(long volatile *Addend) { short test_InterlockedDecrement16(short volatile *Addend) { return _InterlockedDecrement16(Addend); } -// CHECK: define{{.*}}i16 @test_InterlockedDecrement16(ptr{{[a-z_ ]*}}%Addend){{.*}}{ +// CHECK: define{{.*}}i16 @test_InterlockedDecrement16(ptr{{.*}}%Addend){{.*}}{ // CHECK: [[TMP:%[0-9]+]] = atomicrmw sub ptr %Addend, i16 1 seq_cst, align 2 // CHECK: [[RESULT:%[0-9]+]] = add i16 [[TMP]], -1 // CHECK: ret i16 [[RESULT]] @@ -580,7 +580,7 @@ short test_InterlockedDecrement16(short volatile *Addend) { long test_InterlockedDecrement(long volatile *Addend) { return _InterlockedDecrement(Addend); } -// CHECK: define{{.*}}i32 @test_InterlockedDecrement(ptr{{[a-z_ ]*}}%Addend){{.*}}{ +// CHECK: define{{.*}}i32 @test_InterlockedDecrement(ptr{{.*}}%Addend){{.*}}{ // CHECK: [[TMP:%[0-9]+]] = atomicrmw sub ptr %Addend, i32 1 seq_cst, align 4 // CHECK: [[RESULT:%[0-9]+]] = add i32 [[TMP]], -1 // CHECK: ret i32 [[RESULT]] @@ -591,13 +591,13 @@ short test_iso_volatile_load16(short volatile *p) { return __iso_volatile_load16 int test_iso_volatile_load32(int volatile *p) { return __iso_volatile_load32(p); } __int64 test_iso_volatile_load64(__int64 volatile *p) { return __iso_volatile_load64(p); } -// CHECK: define{{.*}}i8 @test_iso_volatile_load8(ptr{{[a-z_ ]*}}%p) +// CHECK: define{{.*}}i8 @test_iso_volatile_load8(ptr{{.*}}%p) // CHECK: = load volatile i8, ptr %p -// CHECK: define{{.*}}i16 @test_iso_volatile_load16(ptr{{[a-z_ ]*}}%p) +// CHECK: define{{.*}}i16 @test_iso_volatile_load16(ptr{{.*}}%p) // CHECK: = load volatile i16, ptr %p -// CHECK: define{{.*}}i32 @test_iso_volatile_load32(ptr{{[a-z_ ]*}}%p) +// CHECK: define{{.*}}i32 @test_iso_volatile_load32(ptr{{.*}}%p) // CHECK: = load volatile i32, ptr %p -// CHECK: define{{.*}}i64 @test_iso_volatile_load64(ptr{{[a-z_ ]*}}%p) +// CHECK: define{{.*}}i64 @test_iso_volatile_load64(ptr{{.*}}%p) // CHECK: = load volatile i64, ptr %p void test_iso_volatile_store8(char volatile *p, char v) { __iso_volatile_store8(p, v); } @@ -605,13 +605,13 @@ void test_iso_volatile_store16(short volatile *p, short v) { __iso_volatile_stor void test_iso_volatile_store32(int volatile *p, int v) { __iso_volatile_store32(p, v); } void test_iso_volatile_store64(__int64 volatile *p, __int64 v) { __iso_volatile_store64(p, v); } -// CHECK: define{{.*}}void @test_iso_volatile_store8(ptr{{[a-z_ ]*}}%p, i8 {{[a-z_ ]*}}%v) +// CHECK: define{{.*}}void @test_iso_volatile_store8(ptr{{.*}}%p, i8 {{[a-z_ ]*}}%v) // CHECK: store volatile i8 %v, ptr %p -// CHECK: define{{.*}}void @test_iso_volatile_store16(ptr{{[a-z_ ]*}}%p, i16 {{[a-z_ ]*}}%v) +// CHECK: define{{.*}}void 
@test_iso_volatile_store16(ptr{{.*}}%p, i16 {{[a-z_ ]*}}%v) // CHECK: store volatile i16 %v, ptr %p -// CHECK: define{{.*}}void @test_iso_volatile_store32(ptr{{[a-z_ ]*}}%p, i32 {{[a-z_ ]*}}%v) +// CHECK: define{{.*}}void @test_iso_volatile_store32(ptr{{.*}}%p, i32 {{[a-z_ ]*}}%v) // CHECK: store volatile i32 %v, ptr %p -// CHECK: define{{.*}}void @test_iso_volatile_store64(ptr{{[a-z_ ]*}}%p, i64 {{[a-z_ ]*}}%v) +// CHECK: define{{.*}}void @test_iso_volatile_store64(ptr{{.*}}%p, i64 {{[a-z_ ]*}}%v) // CHECK: store volatile i64 %v, ptr %p @@ -619,7 +619,7 @@ void test_iso_volatile_store64(__int64 volatile *p, __int64 v) { __iso_volatile_ __int64 test_InterlockedExchange64(__int64 volatile *value, __int64 mask) { return _InterlockedExchange64(value, mask); } -// CHECK: define{{.*}}i64 @test_InterlockedExchange64(ptr{{[a-z_ ]*}}%value, i64{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK: define{{.*}}i64 @test_InterlockedExchange64(ptr{{.*}}%value, i64{{[a-z_ ]*}}%mask){{.*}}{ // CHECK: [[RESULT:%[0-9]+]] = atomicrmw xchg ptr %value, i64 %mask seq_cst, align 8 // CHECK: ret i64 [[RESULT:%[0-9]+]] // CHECK: } @@ -627,7 +627,7 @@ __int64 test_InterlockedExchange64(__int64 volatile *value, __int64 mask) { __int64 test_InterlockedExchangeAdd64(__int64 volatile *value, __int64 mask) { return _InterlockedExchangeAdd64(value, mask); } -// CHECK: define{{.*}}i64 @test_InterlockedExchangeAdd64(ptr{{[a-z_ ]*}}%value, i64{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK: define{{.*}}i64 @test_InterlockedExchangeAdd64(ptr{{.*}}%value, i64{{[a-z_ ]*}}%mask){{.*}}{ // CHECK: [[RESULT:%[0-9]+]] = atomicrmw add ptr %value, i64 %mask seq_cst, align 8 // CHECK: ret i64 [[RESULT:%[0-9]+]] // CHECK: } @@ -635,7 +635,7 @@ __int64 test_InterlockedExchangeAdd64(__int64 volatile *value, __int64 mask) { __int64 test_InterlockedExchangeSub64(__int64 volatile *value, __int64 mask) { return _InterlockedExchangeSub64(value, mask); } -// CHECK: define{{.*}}i64 @test_InterlockedExchangeSub64(ptr{{[a-z_ ]*}}%value, i64{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK: define{{.*}}i64 @test_InterlockedExchangeSub64(ptr{{.*}}%value, i64{{[a-z_ ]*}}%mask){{.*}}{ // CHECK: [[RESULT:%[0-9]+]] = atomicrmw sub ptr %value, i64 %mask seq_cst, align 8 // CHECK: ret i64 [[RESULT:%[0-9]+]] // CHECK: } @@ -643,7 +643,7 @@ __int64 test_InterlockedExchangeSub64(__int64 volatile *value, __int64 mask) { __int64 test_InterlockedOr64(__int64 volatile *value, __int64 mask) { return _InterlockedOr64(value, mask); } -// CHECK: define{{.*}}i64 @test_InterlockedOr64(ptr{{[a-z_ ]*}}%value, i64{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK: define{{.*}}i64 @test_InterlockedOr64(ptr{{.*}}%value, i64{{[a-z_ ]*}}%mask){{.*}}{ // CHECK: [[RESULT:%[0-9]+]] = atomicrmw or ptr %value, i64 %mask seq_cst, align 8 // CHECK: ret i64 [[RESULT:%[0-9]+]] // CHECK: } @@ -651,7 +651,7 @@ __int64 test_InterlockedOr64(__int64 volatile *value, __int64 mask) { __int64 test_InterlockedXor64(__int64 volatile *value, __int64 mask) { return _InterlockedXor64(value, mask); } -// CHECK: define{{.*}}i64 @test_InterlockedXor64(ptr{{[a-z_ ]*}}%value, i64{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK: define{{.*}}i64 @test_InterlockedXor64(ptr{{.*}}%value, i64{{[a-z_ ]*}}%mask){{.*}}{ // CHECK: [[RESULT:%[0-9]+]] = atomicrmw xor ptr %value, i64 %mask seq_cst, align 8 // CHECK: ret i64 [[RESULT:%[0-9]+]] // CHECK: } @@ -659,7 +659,7 @@ __int64 test_InterlockedXor64(__int64 volatile *value, __int64 mask) { __int64 test_InterlockedAnd64(__int64 volatile *value, __int64 mask) { return _InterlockedAnd64(value, mask); } -// CHECK: define{{.*}}i64 
@test_InterlockedAnd64(ptr{{[a-z_ ]*}}%value, i64{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK: define{{.*}}i64 @test_InterlockedAnd64(ptr{{.*}}%value, i64{{[a-z_ ]*}}%mask){{.*}}{ // CHECK: [[RESULT:%[0-9]+]] = atomicrmw and ptr %value, i64 %mask seq_cst, align 8 // CHECK: ret i64 [[RESULT:%[0-9]+]] // CHECK: } @@ -667,7 +667,7 @@ __int64 test_InterlockedAnd64(__int64 volatile *value, __int64 mask) { __int64 test_InterlockedIncrement64(__int64 volatile *Addend) { return _InterlockedIncrement64(Addend); } -// CHECK: define{{.*}}i64 @test_InterlockedIncrement64(ptr{{[a-z_ ]*}}%Addend){{.*}}{ +// CHECK: define{{.*}}i64 @test_InterlockedIncrement64(ptr{{.*}}%Addend){{.*}}{ // CHECK: [[TMP:%[0-9]+]] = atomicrmw add ptr %Addend, i64 1 seq_cst, align 8 // CHECK: [[RESULT:%[0-9]+]] = add i64 [[TMP]], 1 // CHECK: ret i64 [[RESULT]] @@ -676,7 +676,7 @@ __int64 test_InterlockedIncrement64(__int64 volatile *Addend) { __int64 test_InterlockedDecrement64(__int64 volatile *Addend) { return _InterlockedDecrement64(Addend); } -// CHECK: define{{.*}}i64 @test_InterlockedDecrement64(ptr{{[a-z_ ]*}}%Addend){{.*}}{ +// CHECK: define{{.*}}i64 @test_InterlockedDecrement64(ptr{{.*}}%Addend){{.*}}{ // CHECK: [[TMP:%[0-9]+]] = atomicrmw sub ptr %Addend, i64 1 seq_cst, align 8 // CHECK: [[RESULT:%[0-9]+]] = add i64 [[TMP]], -1 // CHECK: ret i64 [[RESULT]] @@ -686,48 +686,48 @@ __int64 test_InterlockedDecrement64(__int64 volatile *Addend) { #if defined(__i386__) || defined(__x86_64__) long test_InterlockedExchange_HLEAcquire(long volatile *Target, long Value) { -// CHECK-INTEL: define{{.*}} i32 @test_InterlockedExchange_HLEAcquire(ptr{{[a-z_ ]*}}%Target, i32{{[a-z_ ]*}}%Value) +// CHECK-INTEL: define{{.*}} i32 @test_InterlockedExchange_HLEAcquire(ptr{{.*}}%Target, i32{{[a-z_ ]*}}%Value) // CHECK-INTEL: call i32 asm sideeffect ".byte 0xf2 ; lock ; xchg $($0, $1$|$1, $0$)", "=r,=*m,0,*m,~{memory},~{dirflag},~{fpsr},~{flags}"(ptr elementtype(i32) %Target, i32 %Value, ptr elementtype(i32) %Target) return _InterlockedExchange_HLEAcquire(Target, Value); } long test_InterlockedExchange_HLERelease(long volatile *Target, long Value) { -// CHECK-INTEL: define{{.*}} i32 @test_InterlockedExchange_HLERelease(ptr{{[a-z_ ]*}}%Target, i32{{[a-z_ ]*}}%Value) +// CHECK-INTEL: define{{.*}} i32 @test_InterlockedExchange_HLERelease(ptr{{.*}}%Target, i32{{[a-z_ ]*}}%Value) // CHECK-INTEL: call i32 asm sideeffect ".byte 0xf3 ; lock ; xchg $($0, $1$|$1, $0$)", "=r,=*m,0,*m,~{memory},~{dirflag},~{fpsr},~{flags}"(ptr elementtype(i32) %Target, i32 %Value, ptr elementtype(i32) %Target) return _InterlockedExchange_HLERelease(Target, Value); } long test_InterlockedCompareExchange_HLEAcquire(long volatile *Destination, long Exchange, long Comparand) { -// CHECK-INTEL: define{{.*}} i32 @test_InterlockedCompareExchange_HLEAcquire(ptr{{[a-z_ ]*}}%Destination, i32{{[a-z_ ]*}}%Exchange, i32{{[a-z_ ]*}}%Comparand) +// CHECK-INTEL: define{{.*}} i32 @test_InterlockedCompareExchange_HLEAcquire(ptr{{.*}}%Destination, i32{{[a-z_ ]*}}%Exchange, i32{{[a-z_ ]*}}%Comparand) // CHECK-INTEL: call i32 asm sideeffect ".byte 0xf2 ; lock ; cmpxchg $($2, $1$|$1, $2$)", "={ax},=*m,r,0,*m,~{memory},~{dirflag},~{fpsr},~{flags}"(ptr elementtype(i32) %Destination, i32 %Exchange, i32 %Comparand, ptr elementtype(i32) %Destination) return _InterlockedCompareExchange_HLEAcquire(Destination, Exchange, Comparand); } long test_InterlockedCompareExchange_HLERelease(long volatile *Destination, long Exchange, long Comparand) { -// CHECK-INTEL: define{{.*}} i32 
@test_InterlockedCompareExchange_HLERelease(ptr{{[a-z_ ]*}}%Destination, i32{{[a-z_ ]*}}%Exchange, i32{{[a-z_ ]*}}%Comparand) +// CHECK-INTEL: define{{.*}} i32 @test_InterlockedCompareExchange_HLERelease(ptr{{.*}}%Destination, i32{{[a-z_ ]*}}%Exchange, i32{{[a-z_ ]*}}%Comparand) // CHECK-INTEL: call i32 asm sideeffect ".byte 0xf3 ; lock ; cmpxchg $($2, $1$|$1, $2$)", "={ax},=*m,r,0,*m,~{memory},~{dirflag},~{fpsr},~{flags}"(ptr elementtype(i32) %Destination, i32 %Exchange, i32 %Comparand, ptr elementtype(i32) %Destination) return _InterlockedCompareExchange_HLERelease(Destination, Exchange, Comparand); } #endif #if defined(__x86_64__) __int64 test_InterlockedExchange64_HLEAcquire(__int64 volatile *Target, __int64 Value) { -// CHECK-X64: define{{.*}} i64 @test_InterlockedExchange64_HLEAcquire(ptr{{[a-z_ ]*}}%Target, i64{{[a-z_ ]*}}%Value) +// CHECK-X64: define{{.*}} i64 @test_InterlockedExchange64_HLEAcquire(ptr{{.*}}%Target, i64{{[a-z_ ]*}}%Value) // CHECK-X64: call i64 asm sideeffect ".byte 0xf2 ; lock ; xchg $($0, $1$|$1, $0$)", "=r,=*m,0,*m,~{memory},~{dirflag},~{fpsr},~{flags}"(ptr elementtype(i64) %Target, i64 %Value, ptr elementtype(i64) %Target) return _InterlockedExchange64_HLEAcquire(Target, Value); } __int64 test_InterlockedExchange64_HLERelease(__int64 volatile *Target, __int64 Value) { -// CHECK-X64: define{{.*}} i64 @test_InterlockedExchange64_HLERelease(ptr{{[a-z_ ]*}}%Target, i64{{[a-z_ ]*}}%Value) +// CHECK-X64: define{{.*}} i64 @test_InterlockedExchange64_HLERelease(ptr{{.*}}%Target, i64{{[a-z_ ]*}}%Value) // CHECK-X64: call i64 asm sideeffect ".byte 0xf3 ; lock ; xchg $($0, $1$|$1, $0$)", "=r,=*m,0,*m,~{memory},~{dirflag},~{fpsr},~{flags}"(ptr elementtype(i64) %Target, i64 %Value, ptr elementtype(i64) %Target) return _InterlockedExchange64_HLERelease(Target, Value); } __int64 test_InterlockedCompareExchange64_HLEAcquire(__int64 volatile *Destination, __int64 Exchange, __int64 Comparand) { -// CHECK-X64: define{{.*}} i64 @test_InterlockedCompareExchange64_HLEAcquire(ptr{{[a-z_ ]*}}%Destination, i64{{[a-z_ ]*}}%Exchange, i64{{[a-z_ ]*}}%Comparand) +// CHECK-X64: define{{.*}} i64 @test_InterlockedCompareExchange64_HLEAcquire(ptr{{.*}}%Destination, i64{{[a-z_ ]*}}%Exchange, i64{{[a-z_ ]*}}%Comparand) // CHECK-X64: call i64 asm sideeffect ".byte 0xf2 ; lock ; cmpxchg $($2, $1$|$1, $2$)", "={ax},=*m,r,0,*m,~{memory},~{dirflag},~{fpsr},~{flags}"(ptr elementtype(i64) %Destination, i64 %Exchange, i64 %Comparand, ptr elementtype(i64) %Destination) return _InterlockedCompareExchange64_HLEAcquire(Destination, Exchange, Comparand); } __int64 test_InterlockedCompareExchange64_HLERelease(__int64 volatile *Destination, __int64 Exchange, __int64 Comparand) { -// CHECK-X64: define{{.*}} i64 @test_InterlockedCompareExchange64_HLERelease(ptr{{[a-z_ ]*}}%Destination, i64{{[a-z_ ]*}}%Exchange, i64{{[a-z_ ]*}}%Comparand) +// CHECK-X64: define{{.*}} i64 @test_InterlockedCompareExchange64_HLERelease(ptr{{.*}}%Destination, i64{{[a-z_ ]*}}%Exchange, i64{{[a-z_ ]*}}%Comparand) // CHECK-X64: call i64 asm sideeffect ".byte 0xf3 ; lock ; cmpxchg $($2, $1$|$1, $2$)", "={ax},=*m,r,0,*m,~{memory},~{dirflag},~{fpsr},~{flags}"(ptr elementtype(i64) %Destination, i64 %Exchange, i64 %Comparand, ptr elementtype(i64) %Destination) return _InterlockedCompareExchange64_HLERelease(Destination, Exchange, Comparand); } @@ -737,84 +737,84 @@ __int64 test_InterlockedCompareExchange64_HLERelease(__int64 volatile *Destinati char test_InterlockedExchangeAdd8_acq(char volatile *value, char mask) { return 
_InterlockedExchangeAdd8_acq(value, mask); } -// CHECK-ARM-ARM64: define{{.*}}i8 @test_InterlockedExchangeAdd8_acq(ptr{{[a-z_ ]*}}%value, i8{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK-ARM-ARM64: define{{.*}}i8 @test_InterlockedExchangeAdd8_acq(ptr{{.*}}%value, i8{{[a-z_ ]*}}%mask){{.*}}{ // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw add ptr %value, i8 %mask acquire, align 1 // CHECK-ARM-ARM64: ret i8 [[RESULT:%[0-9]+]] // CHECK-ARM-ARM64: } char test_InterlockedExchangeAdd8_rel(char volatile *value, char mask) { return _InterlockedExchangeAdd8_rel(value, mask); } -// CHECK-ARM-ARM64: define{{.*}}i8 @test_InterlockedExchangeAdd8_rel(ptr{{[a-z_ ]*}}%value, i8{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK-ARM-ARM64: define{{.*}}i8 @test_InterlockedExchangeAdd8_rel(ptr{{.*}}%value, i8{{[a-z_ ]*}}%mask){{.*}}{ // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw add ptr %value, i8 %mask release, align 1 // CHECK-ARM-ARM64: ret i8 [[RESULT:%[0-9]+]] // CHECK-ARM-ARM64: } char test_InterlockedExchangeAdd8_nf(char volatile *value, char mask) { return _InterlockedExchangeAdd8_nf(value, mask); } -// CHECK-ARM-ARM64: define{{.*}}i8 @test_InterlockedExchangeAdd8_nf(ptr{{[a-z_ ]*}}%value, i8{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK-ARM-ARM64: define{{.*}}i8 @test_InterlockedExchangeAdd8_nf(ptr{{.*}}%value, i8{{[a-z_ ]*}}%mask){{.*}}{ // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw add ptr %value, i8 %mask monotonic, align 1 // CHECK-ARM-ARM64: ret i8 [[RESULT:%[0-9]+]] // CHECK-ARM-ARM64: } short test_InterlockedExchangeAdd16_acq(short volatile *value, short mask) { return _InterlockedExchangeAdd16_acq(value, mask); } -// CHECK-ARM-ARM64: define{{.*}}i16 @test_InterlockedExchangeAdd16_acq(ptr{{[a-z_ ]*}}%value, i16{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK-ARM-ARM64: define{{.*}}i16 @test_InterlockedExchangeAdd16_acq(ptr{{.*}}%value, i16{{[a-z_ ]*}}%mask){{.*}}{ // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw add ptr %value, i16 %mask acquire, align 2 // CHECK-ARM-ARM64: ret i16 [[RESULT:%[0-9]+]] // CHECK-ARM-ARM64: } short test_InterlockedExchangeAdd16_rel(short volatile *value, short mask) { return _InterlockedExchangeAdd16_rel(value, mask); } -// CHECK-ARM-ARM64: define{{.*}}i16 @test_InterlockedExchangeAdd16_rel(ptr{{[a-z_ ]*}}%value, i16{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK-ARM-ARM64: define{{.*}}i16 @test_InterlockedExchangeAdd16_rel(ptr{{.*}}%value, i16{{[a-z_ ]*}}%mask){{.*}}{ // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw add ptr %value, i16 %mask release, align 2 // CHECK-ARM-ARM64: ret i16 [[RESULT:%[0-9]+]] // CHECK-ARM-ARM64: } short test_InterlockedExchangeAdd16_nf(short volatile *value, short mask) { return _InterlockedExchangeAdd16_nf(value, mask); } -// CHECK-ARM-ARM64: define{{.*}}i16 @test_InterlockedExchangeAdd16_nf(ptr{{[a-z_ ]*}}%value, i16{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK-ARM-ARM64: define{{.*}}i16 @test_InterlockedExchangeAdd16_nf(ptr{{.*}}%value, i16{{[a-z_ ]*}}%mask){{.*}}{ // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw add ptr %value, i16 %mask monotonic, align 2 // CHECK-ARM-ARM64: ret i16 [[RESULT:%[0-9]+]] // CHECK-ARM-ARM64: } long test_InterlockedExchangeAdd_acq(long volatile *value, long mask) { return _InterlockedExchangeAdd_acq(value, mask); } -// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedExchangeAdd_acq(ptr{{[a-z_ ]*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedExchangeAdd_acq(ptr{{.*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{ // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw add ptr %value, i32 %mask acquire, align 4 // 
CHECK-ARM-ARM64: ret i32 [[RESULT:%[0-9]+]] // CHECK-ARM-ARM64: } long test_InterlockedExchangeAdd_rel(long volatile *value, long mask) { return _InterlockedExchangeAdd_rel(value, mask); } -// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedExchangeAdd_rel(ptr{{[a-z_ ]*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedExchangeAdd_rel(ptr{{.*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{ // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw add ptr %value, i32 %mask release, align 4 // CHECK-ARM-ARM64: ret i32 [[RESULT:%[0-9]+]] // CHECK-ARM-ARM64: } long test_InterlockedExchangeAdd_nf(long volatile *value, long mask) { return _InterlockedExchangeAdd_nf(value, mask); } -// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedExchangeAdd_nf(ptr{{[a-z_ ]*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedExchangeAdd_nf(ptr{{.*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{ // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw add ptr %value, i32 %mask monotonic, align 4 // CHECK-ARM-ARM64: ret i32 [[RESULT:%[0-9]+]] // CHECK-ARM-ARM64: } __int64 test_InterlockedExchangeAdd64_acq(__int64 volatile *value, __int64 mask) { return _InterlockedExchangeAdd64_acq(value, mask); } -// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedExchangeAdd64_acq(ptr{{[a-z_ ]*}}%value, i64{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedExchangeAdd64_acq(ptr{{.*}}%value, i64{{[a-z_ ]*}}%mask){{.*}}{ // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw add ptr %value, i64 %mask acquire, align 8 // CHECK-ARM-ARM64: ret i64 [[RESULT:%[0-9]+]] // CHECK-ARM-ARM64: } __int64 test_InterlockedExchangeAdd64_rel(__int64 volatile *value, __int64 mask) { return _InterlockedExchangeAdd64_rel(value, mask); } -// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedExchangeAdd64_rel(ptr{{[a-z_ ]*}}%value, i64{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedExchangeAdd64_rel(ptr{{.*}}%value, i64{{[a-z_ ]*}}%mask){{.*}}{ // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw add ptr %value, i64 %mask release, align 8 // CHECK-ARM-ARM64: ret i64 [[RESULT:%[0-9]+]] // CHECK-ARM-ARM64: } __int64 test_InterlockedExchangeAdd64_nf(__int64 volatile *value, __int64 mask) { return _InterlockedExchangeAdd64_nf(value, mask); } -// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedExchangeAdd64_nf(ptr{{[a-z_ ]*}}%value, i64{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedExchangeAdd64_nf(ptr{{.*}}%value, i64{{[a-z_ ]*}}%mask){{.*}}{ // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw add ptr %value, i64 %mask monotonic, align 8 // CHECK-ARM-ARM64: ret i64 [[RESULT:%[0-9]+]] // CHECK-ARM-ARM64: } @@ -822,84 +822,84 @@ __int64 test_InterlockedExchangeAdd64_nf(__int64 volatile *value, __int64 mask) char test_InterlockedExchange8_acq(char volatile *value, char mask) { return _InterlockedExchange8_acq(value, mask); } -// CHECK-ARM-ARM64: define{{.*}}i8 @test_InterlockedExchange8_acq(ptr{{[a-z_ ]*}}%value, i8{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK-ARM-ARM64: define{{.*}}i8 @test_InterlockedExchange8_acq(ptr{{.*}}%value, i8{{[a-z_ ]*}}%mask){{.*}}{ // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw xchg ptr %value, i8 %mask acquire, align 1 // CHECK-ARM-ARM64: ret i8 [[RESULT:%[0-9]+]] // CHECK-ARM-ARM64: } char test_InterlockedExchange8_rel(char volatile *value, char mask) { return _InterlockedExchange8_rel(value, mask); } -// CHECK-ARM-ARM64: define{{.*}}i8 @test_InterlockedExchange8_rel(ptr{{[a-z_ ]*}}%value, 
i8{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK-ARM-ARM64: define{{.*}}i8 @test_InterlockedExchange8_rel(ptr{{.*}}%value, i8{{[a-z_ ]*}}%mask){{.*}}{ // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw xchg ptr %value, i8 %mask release, align 1 // CHECK-ARM-ARM64: ret i8 [[RESULT:%[0-9]+]] // CHECK-ARM-ARM64: } char test_InterlockedExchange8_nf(char volatile *value, char mask) { return _InterlockedExchange8_nf(value, mask); } -// CHECK-ARM-ARM64: define{{.*}}i8 @test_InterlockedExchange8_nf(ptr{{[a-z_ ]*}}%value, i8{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK-ARM-ARM64: define{{.*}}i8 @test_InterlockedExchange8_nf(ptr{{.*}}%value, i8{{[a-z_ ]*}}%mask){{.*}}{ // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw xchg ptr %value, i8 %mask monotonic, align 1 // CHECK-ARM-ARM64: ret i8 [[RESULT:%[0-9]+]] // CHECK-ARM-ARM64: } short test_InterlockedExchange16_acq(short volatile *value, short mask) { return _InterlockedExchange16_acq(value, mask); } -// CHECK-ARM-ARM64: define{{.*}}i16 @test_InterlockedExchange16_acq(ptr{{[a-z_ ]*}}%value, i16{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK-ARM-ARM64: define{{.*}}i16 @test_InterlockedExchange16_acq(ptr{{.*}}%value, i16{{[a-z_ ]*}}%mask){{.*}}{ // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw xchg ptr %value, i16 %mask acquire, align 2 // CHECK-ARM-ARM64: ret i16 [[RESULT:%[0-9]+]] // CHECK-ARM-ARM64: } short test_InterlockedExchange16_rel(short volatile *value, short mask) { return _InterlockedExchange16_rel(value, mask); } -// CHECK-ARM-ARM64: define{{.*}}i16 @test_InterlockedExchange16_rel(ptr{{[a-z_ ]*}}%value, i16{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK-ARM-ARM64: define{{.*}}i16 @test_InterlockedExchange16_rel(ptr{{.*}}%value, i16{{[a-z_ ]*}}%mask){{.*}}{ // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw xchg ptr %value, i16 %mask release, align 2 // CHECK-ARM-ARM64: ret i16 [[RESULT:%[0-9]+]] // CHECK-ARM-ARM64: } short test_InterlockedExchange16_nf(short volatile *value, short mask) { return _InterlockedExchange16_nf(value, mask); } -// CHECK-ARM-ARM64: define{{.*}}i16 @test_InterlockedExchange16_nf(ptr{{[a-z_ ]*}}%value, i16{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK-ARM-ARM64: define{{.*}}i16 @test_InterlockedExchange16_nf(ptr{{.*}}%value, i16{{[a-z_ ]*}}%mask){{.*}}{ // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw xchg ptr %value, i16 %mask monotonic, align 2 // CHECK-ARM-ARM64: ret i16 [[RESULT:%[0-9]+]] // CHECK-ARM-ARM64: } long test_InterlockedExchange_acq(long volatile *value, long mask) { return _InterlockedExchange_acq(value, mask); } -// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedExchange_acq(ptr{{[a-z_ ]*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedExchange_acq(ptr{{.*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{ // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw xchg ptr %value, i32 %mask acquire, align 4 // CHECK-ARM-ARM64: ret i32 [[RESULT:%[0-9]+]] // CHECK-ARM-ARM64: } long test_InterlockedExchange_rel(long volatile *value, long mask) { return _InterlockedExchange_rel(value, mask); } -// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedExchange_rel(ptr{{[a-z_ ]*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedExchange_rel(ptr{{.*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{ // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw xchg ptr %value, i32 %mask release, align 4 // CHECK-ARM-ARM64: ret i32 [[RESULT:%[0-9]+]] // CHECK-ARM-ARM64: } long test_InterlockedExchange_nf(long volatile *value, long mask) { return _InterlockedExchange_nf(value, mask); } -// CHECK-ARM-ARM64: 
define{{.*}}i32 @test_InterlockedExchange_nf(ptr{{[a-z_ ]*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedExchange_nf(ptr{{.*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{ // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw xchg ptr %value, i32 %mask monotonic, align 4 // CHECK-ARM-ARM64: ret i32 [[RESULT:%[0-9]+]] // CHECK-ARM-ARM64: } __int64 test_InterlockedExchange64_acq(__int64 volatile *value, __int64 mask) { return _InterlockedExchange64_acq(value, mask); } -// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedExchange64_acq(ptr{{[a-z_ ]*}}%value, i64{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedExchange64_acq(ptr{{.*}}%value, i64{{[a-z_ ]*}}%mask){{.*}}{ // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw xchg ptr %value, i64 %mask acquire, align 8 // CHECK-ARM-ARM64: ret i64 [[RESULT:%[0-9]+]] // CHECK-ARM-ARM64: } __int64 test_InterlockedExchange64_rel(__int64 volatile *value, __int64 mask) { return _InterlockedExchange64_rel(value, mask); } -// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedExchange64_rel(ptr{{[a-z_ ]*}}%value, i64{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedExchange64_rel(ptr{{.*}}%value, i64{{[a-z_ ]*}}%mask){{.*}}{ // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw xchg ptr %value, i64 %mask release, align 8 // CHECK-ARM-ARM64: ret i64 [[RESULT:%[0-9]+]] // CHECK-ARM-ARM64: } __int64 test_InterlockedExchange64_nf(__int64 volatile *value, __int64 mask) { return _InterlockedExchange64_nf(value, mask); } -// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedExchange64_nf(ptr{{[a-z_ ]*}}%value, i64{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedExchange64_nf(ptr{{.*}}%value, i64{{[a-z_ ]*}}%mask){{.*}}{ // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw xchg ptr %value, i64 %mask monotonic, align 8 // CHECK-ARM-ARM64: ret i64 [[RESULT:%[0-9]+]] // CHECK-ARM-ARM64: } @@ -907,7 +907,7 @@ __int64 test_InterlockedExchange64_nf(__int64 volatile *value, __int64 mask) { char test_InterlockedCompareExchange8_acq(char volatile *Destination, char Exchange, char Comperand) { return _InterlockedCompareExchange8_acq(Destination, Exchange, Comperand); } -// CHECK-ARM-ARM64: define{{.*}}i8 @test_InterlockedCompareExchange8_acq(ptr{{[a-z_ ]*}}%Destination, i8{{[a-z_ ]*}}%Exchange, i8{{[a-z_ ]*}}%Comperand){{.*}}{ +// CHECK-ARM-ARM64: define{{.*}}i8 @test_InterlockedCompareExchange8_acq(ptr{{.*}}%Destination, i8{{[a-z_ ]*}}%Exchange, i8{{[a-z_ ]*}}%Comperand){{.*}}{ // CHECK-ARM-ARM64: [[TMP:%[0-9]+]] = cmpxchg volatile ptr %Destination, i8 %Comperand, i8 %Exchange acquire acquire, align 1 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = extractvalue { i8, i1 } [[TMP]], 0 // CHECK-ARM-ARM64: ret i8 [[RESULT]] @@ -916,7 +916,7 @@ char test_InterlockedCompareExchange8_acq(char volatile *Destination, char Excha char test_InterlockedCompareExchange8_rel(char volatile *Destination, char Exchange, char Comperand) { return _InterlockedCompareExchange8_rel(Destination, Exchange, Comperand); } -// CHECK-ARM-ARM64: define{{.*}}i8 @test_InterlockedCompareExchange8_rel(ptr{{[a-z_ ]*}}%Destination, i8{{[a-z_ ]*}}%Exchange, i8{{[a-z_ ]*}}%Comperand){{.*}}{ +// CHECK-ARM-ARM64: define{{.*}}i8 @test_InterlockedCompareExchange8_rel(ptr{{.*}}%Destination, i8{{[a-z_ ]*}}%Exchange, i8{{[a-z_ ]*}}%Comperand){{.*}}{ // CHECK-ARM-ARM64: [[TMP:%[0-9]+]] = cmpxchg volatile ptr %Destination, i8 %Comperand, i8 %Exchange release monotonic, align 1 // CHECK-ARM-ARM64: 
[[RESULT:%[0-9]+]] = extractvalue { i8, i1 } [[TMP]], 0 // CHECK-ARM-ARM64: ret i8 [[RESULT]] @@ -925,7 +925,7 @@ char test_InterlockedCompareExchange8_rel(char volatile *Destination, char Excha char test_InterlockedCompareExchange8_nf(char volatile *Destination, char Exchange, char Comperand) { return _InterlockedCompareExchange8_nf(Destination, Exchange, Comperand); } -// CHECK-ARM-ARM64: define{{.*}}i8 @test_InterlockedCompareExchange8_nf(ptr{{[a-z_ ]*}}%Destination, i8{{[a-z_ ]*}}%Exchange, i8{{[a-z_ ]*}}%Comperand){{.*}}{ +// CHECK-ARM-ARM64: define{{.*}}i8 @test_InterlockedCompareExchange8_nf(ptr{{.*}}%Destination, i8{{[a-z_ ]*}}%Exchange, i8{{[a-z_ ]*}}%Comperand){{.*}}{ // CHECK-ARM-ARM64: [[TMP:%[0-9]+]] = cmpxchg volatile ptr %Destination, i8 %Comperand, i8 %Exchange monotonic monotonic, align 1 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = extractvalue { i8, i1 } [[TMP]], 0 // CHECK-ARM-ARM64: ret i8 [[RESULT]] @@ -934,7 +934,7 @@ char test_InterlockedCompareExchange8_nf(char volatile *Destination, char Exchan short test_InterlockedCompareExchange16_acq(short volatile *Destination, short Exchange, short Comperand) { return _InterlockedCompareExchange16_acq(Destination, Exchange, Comperand); } -// CHECK-ARM-ARM64: define{{.*}}i16 @test_InterlockedCompareExchange16_acq(ptr{{[a-z_ ]*}}%Destination, i16{{[a-z_ ]*}}%Exchange, i16{{[a-z_ ]*}}%Comperand){{.*}}{ +// CHECK-ARM-ARM64: define{{.*}}i16 @test_InterlockedCompareExchange16_acq(ptr{{.*}}%Destination, i16{{[a-z_ ]*}}%Exchange, i16{{[a-z_ ]*}}%Comperand){{.*}}{ // CHECK-ARM-ARM64: [[TMP:%[0-9]+]] = cmpxchg volatile ptr %Destination, i16 %Comperand, i16 %Exchange acquire acquire, align 2 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = extractvalue { i16, i1 } [[TMP]], 0 // CHECK-ARM-ARM64: ret i16 [[RESULT]] @@ -943,7 +943,7 @@ short test_InterlockedCompareExchange16_acq(short volatile *Destination, short E short test_InterlockedCompareExchange16_rel(short volatile *Destination, short Exchange, short Comperand) { return _InterlockedCompareExchange16_rel(Destination, Exchange, Comperand); } -// CHECK-ARM-ARM64: define{{.*}}i16 @test_InterlockedCompareExchange16_rel(ptr{{[a-z_ ]*}}%Destination, i16{{[a-z_ ]*}}%Exchange, i16{{[a-z_ ]*}}%Comperand){{.*}}{ +// CHECK-ARM-ARM64: define{{.*}}i16 @test_InterlockedCompareExchange16_rel(ptr{{.*}}%Destination, i16{{[a-z_ ]*}}%Exchange, i16{{[a-z_ ]*}}%Comperand){{.*}}{ // CHECK-ARM-ARM64: [[TMP:%[0-9]+]] = cmpxchg volatile ptr %Destination, i16 %Comperand, i16 %Exchange release monotonic, align 2 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = extractvalue { i16, i1 } [[TMP]], 0 // CHECK-ARM-ARM64: ret i16 [[RESULT]] @@ -952,7 +952,7 @@ short test_InterlockedCompareExchange16_rel(short volatile *Destination, short E short test_InterlockedCompareExchange16_nf(short volatile *Destination, short Exchange, short Comperand) { return _InterlockedCompareExchange16_nf(Destination, Exchange, Comperand); } -// CHECK-ARM-ARM64: define{{.*}}i16 @test_InterlockedCompareExchange16_nf(ptr{{[a-z_ ]*}}%Destination, i16{{[a-z_ ]*}}%Exchange, i16{{[a-z_ ]*}}%Comperand){{.*}}{ +// CHECK-ARM-ARM64: define{{.*}}i16 @test_InterlockedCompareExchange16_nf(ptr{{.*}}%Destination, i16{{[a-z_ ]*}}%Exchange, i16{{[a-z_ ]*}}%Comperand){{.*}}{ // CHECK-ARM-ARM64: [[TMP:%[0-9]+]] = cmpxchg volatile ptr %Destination, i16 %Comperand, i16 %Exchange monotonic monotonic, align 2 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = extractvalue { i16, i1 } [[TMP]], 0 // CHECK-ARM-ARM64: ret i16 [[RESULT]] @@ -961,7 +961,7 @@ short 
test_InterlockedCompareExchange16_nf(short volatile *Destination, short Ex long test_InterlockedCompareExchange_acq(long volatile *Destination, long Exchange, long Comperand) { return _InterlockedCompareExchange_acq(Destination, Exchange, Comperand); } -// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedCompareExchange_acq(ptr{{[a-z_ ]*}}%Destination, i32{{[a-z_ ]*}}%Exchange, i32{{[a-z_ ]*}}%Comperand){{.*}}{ +// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedCompareExchange_acq(ptr{{.*}}%Destination, i32{{[a-z_ ]*}}%Exchange, i32{{[a-z_ ]*}}%Comperand){{.*}}{ // CHECK-ARM-ARM64: [[TMP:%[0-9]+]] = cmpxchg volatile ptr %Destination, i32 %Comperand, i32 %Exchange acquire acquire, align 4 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = extractvalue { i32, i1 } [[TMP]], 0 // CHECK-ARM-ARM64: ret i32 [[RESULT]] @@ -970,7 +970,7 @@ long test_InterlockedCompareExchange_acq(long volatile *Destination, long Exchan long test_InterlockedCompareExchange_rel(long volatile *Destination, long Exchange, long Comperand) { return _InterlockedCompareExchange_rel(Destination, Exchange, Comperand); } -// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedCompareExchange_rel(ptr{{[a-z_ ]*}}%Destination, i32{{[a-z_ ]*}}%Exchange, i32{{[a-z_ ]*}}%Comperand){{.*}}{ +// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedCompareExchange_rel(ptr{{.*}}%Destination, i32{{[a-z_ ]*}}%Exchange, i32{{[a-z_ ]*}}%Comperand){{.*}}{ // CHECK-ARM-ARM64: [[TMP:%[0-9]+]] = cmpxchg volatile ptr %Destination, i32 %Comperand, i32 %Exchange release monotonic, align 4 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = extractvalue { i32, i1 } [[TMP]], 0 // CHECK-ARM-ARM64: ret i32 [[RESULT]] @@ -979,7 +979,7 @@ long test_InterlockedCompareExchange_rel(long volatile *Destination, long Exchan long test_InterlockedCompareExchange_nf(long volatile *Destination, long Exchange, long Comperand) { return _InterlockedCompareExchange_nf(Destination, Exchange, Comperand); } -// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedCompareExchange_nf(ptr{{[a-z_ ]*}}%Destination, i32{{[a-z_ ]*}}%Exchange, i32{{[a-z_ ]*}}%Comperand){{.*}}{ +// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedCompareExchange_nf(ptr{{.*}}%Destination, i32{{[a-z_ ]*}}%Exchange, i32{{[a-z_ ]*}}%Comperand){{.*}}{ // CHECK-ARM-ARM64: [[TMP:%[0-9]+]] = cmpxchg volatile ptr %Destination, i32 %Comperand, i32 %Exchange monotonic monotonic, align 4 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = extractvalue { i32, i1 } [[TMP]], 0 // CHECK-ARM-ARM64: ret i32 [[RESULT]] @@ -988,7 +988,7 @@ long test_InterlockedCompareExchange_nf(long volatile *Destination, long Exchang __int64 test_InterlockedCompareExchange64_acq(__int64 volatile *Destination, __int64 Exchange, __int64 Comperand) { return _InterlockedCompareExchange64_acq(Destination, Exchange, Comperand); } -// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedCompareExchange64_acq(ptr{{[a-z_ ]*}}%Destination, i64{{[a-z_ ]*}}%Exchange, i64{{[a-z_ ]*}}%Comperand){{.*}}{ +// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedCompareExchange64_acq(ptr{{.*}}%Destination, i64{{[a-z_ ]*}}%Exchange, i64{{[a-z_ ]*}}%Comperand){{.*}}{ // CHECK-ARM-ARM64: [[TMP:%[0-9]+]] = cmpxchg volatile ptr %Destination, i64 %Comperand, i64 %Exchange acquire acquire, align 8 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = extractvalue { i64, i1 } [[TMP]], 0 // CHECK-ARM-ARM64: ret i64 [[RESULT]] @@ -997,7 +997,7 @@ __int64 test_InterlockedCompareExchange64_acq(__int64 volatile *Destination, __i __int64 test_InterlockedCompareExchange64_rel(__int64 volatile *Destination, __int64 
Exchange, __int64 Comperand) { return _InterlockedCompareExchange64_rel(Destination, Exchange, Comperand); } -// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedCompareExchange64_rel(ptr{{[a-z_ ]*}}%Destination, i64{{[a-z_ ]*}}%Exchange, i64{{[a-z_ ]*}}%Comperand){{.*}}{ +// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedCompareExchange64_rel(ptr{{.*}}%Destination, i64{{[a-z_ ]*}}%Exchange, i64{{[a-z_ ]*}}%Comperand){{.*}}{ // CHECK-ARM-ARM64: [[TMP:%[0-9]+]] = cmpxchg volatile ptr %Destination, i64 %Comperand, i64 %Exchange release monotonic, align 8 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = extractvalue { i64, i1 } [[TMP]], 0 // CHECK-ARM-ARM64: ret i64 [[RESULT]] @@ -1006,7 +1006,7 @@ __int64 test_InterlockedCompareExchange64_rel(__int64 volatile *Destination, __i __int64 test_InterlockedCompareExchange64_nf(__int64 volatile *Destination, __int64 Exchange, __int64 Comperand) { return _InterlockedCompareExchange64_nf(Destination, Exchange, Comperand); } -// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedCompareExchange64_nf(ptr{{[a-z_ ]*}}%Destination, i64{{[a-z_ ]*}}%Exchange, i64{{[a-z_ ]*}}%Comperand){{.*}}{ +// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedCompareExchange64_nf(ptr{{.*}}%Destination, i64{{[a-z_ ]*}}%Exchange, i64{{[a-z_ ]*}}%Comperand){{.*}}{ // CHECK-ARM-ARM64: [[TMP:%[0-9]+]] = cmpxchg volatile ptr %Destination, i64 %Comperand, i64 %Exchange monotonic monotonic, align 8 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = extractvalue { i64, i1 } [[TMP]], 0 // CHECK-ARM-ARM64: ret i64 [[RESULT]] @@ -1015,7 +1015,7 @@ __int64 test_InterlockedCompareExchange64_nf(__int64 volatile *Destination, __in char test_InterlockedOr8_acq(char volatile *value, char mask) { return _InterlockedOr8_acq(value, mask); } -// CHECK-ARM-ARM64: define{{.*}}i8 @test_InterlockedOr8_acq(ptr{{[a-z_ ]*}}%value, i8{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK-ARM-ARM64: define{{.*}}i8 @test_InterlockedOr8_acq(ptr{{.*}}%value, i8{{[a-z_ ]*}}%mask){{.*}}{ // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw or ptr %value, i8 %mask acquire, align 1 // CHECK-ARM-ARM64: ret i8 [[RESULT:%[0-9]+]] // CHECK-ARM-ARM64: } @@ -1023,7 +1023,7 @@ char test_InterlockedOr8_acq(char volatile *value, char mask) { char test_InterlockedOr8_rel(char volatile *value, char mask) { return _InterlockedOr8_rel(value, mask); } -// CHECK-ARM-ARM64: define{{.*}}i8 @test_InterlockedOr8_rel(ptr{{[a-z_ ]*}}%value, i8{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK-ARM-ARM64: define{{.*}}i8 @test_InterlockedOr8_rel(ptr{{.*}}%value, i8{{[a-z_ ]*}}%mask){{.*}}{ // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw or ptr %value, i8 %mask release, align 1 // CHECK-ARM-ARM64: ret i8 [[RESULT:%[0-9]+]] // CHECK-ARM-ARM64: } @@ -1031,7 +1031,7 @@ char test_InterlockedOr8_rel(char volatile *value, char mask) { char test_InterlockedOr8_nf(char volatile *value, char mask) { return _InterlockedOr8_nf(value, mask); } -// CHECK-ARM-ARM64: define{{.*}}i8 @test_InterlockedOr8_nf(ptr{{[a-z_ ]*}}%value, i8{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK-ARM-ARM64: define{{.*}}i8 @test_InterlockedOr8_nf(ptr{{.*}}%value, i8{{[a-z_ ]*}}%mask){{.*}}{ // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw or ptr %value, i8 %mask monotonic, align 1 // CHECK-ARM-ARM64: ret i8 [[RESULT:%[0-9]+]] // CHECK-ARM-ARM64: } @@ -1039,7 +1039,7 @@ char test_InterlockedOr8_nf(char volatile *value, char mask) { short test_InterlockedOr16_acq(short volatile *value, short mask) { return _InterlockedOr16_acq(value, mask); } -// CHECK-ARM-ARM64: define{{.*}}i16 @test_InterlockedOr16_acq(ptr{{[a-z_ ]*}}%value, 
i16{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK-ARM-ARM64: define{{.*}}i16 @test_InterlockedOr16_acq(ptr{{.*}}%value, i16{{[a-z_ ]*}}%mask){{.*}}{ // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw or ptr %value, i16 %mask acquire, align 2 // CHECK-ARM-ARM64: ret i16 [[RESULT:%[0-9]+]] // CHECK-ARM-ARM64: } @@ -1047,7 +1047,7 @@ short test_InterlockedOr16_acq(short volatile *value, short mask) { short test_InterlockedOr16_rel(short volatile *value, short mask) { return _InterlockedOr16_rel(value, mask); } -// CHECK-ARM-ARM64: define{{.*}}i16 @test_InterlockedOr16_rel(ptr{{[a-z_ ]*}}%value, i16{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK-ARM-ARM64: define{{.*}}i16 @test_InterlockedOr16_rel(ptr{{.*}}%value, i16{{[a-z_ ]*}}%mask){{.*}}{ // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw or ptr %value, i16 %mask release, align 2 // CHECK-ARM-ARM64: ret i16 [[RESULT:%[0-9]+]] // CHECK-ARM-ARM64: } @@ -1055,7 +1055,7 @@ short test_InterlockedOr16_rel(short volatile *value, short mask) { short test_InterlockedOr16_nf(short volatile *value, short mask) { return _InterlockedOr16_nf(value, mask); } -// CHECK-ARM-ARM64: define{{.*}}i16 @test_InterlockedOr16_nf(ptr{{[a-z_ ]*}}%value, i16{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK-ARM-ARM64: define{{.*}}i16 @test_InterlockedOr16_nf(ptr{{.*}}%value, i16{{[a-z_ ]*}}%mask){{.*}}{ // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw or ptr %value, i16 %mask monotonic, align 2 // CHECK-ARM-ARM64: ret i16 [[RESULT:%[0-9]+]] // CHECK-ARM-ARM64: } @@ -1063,7 +1063,7 @@ short test_InterlockedOr16_nf(short volatile *value, short mask) { long test_InterlockedOr_acq(long volatile *value, long mask) { return _InterlockedOr_acq(value, mask); } -// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedOr_acq(ptr{{[a-z_ ]*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedOr_acq(ptr{{.*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{ // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw or ptr %value, i32 %mask acquire, align 4 // CHECK-ARM-ARM64: ret i32 [[RESULT:%[0-9]+]] // CHECK-ARM-ARM64: } @@ -1071,7 +1071,7 @@ long test_InterlockedOr_acq(long volatile *value, long mask) { long test_InterlockedOr_rel(long volatile *value, long mask) { return _InterlockedOr_rel(value, mask); } -// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedOr_rel(ptr{{[a-z_ ]*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedOr_rel(ptr{{.*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{ // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw or ptr %value, i32 %mask release, align 4 // CHECK-ARM-ARM64: ret i32 [[RESULT:%[0-9]+]] // CHECK-ARM-ARM64: } @@ -1079,7 +1079,7 @@ long test_InterlockedOr_rel(long volatile *value, long mask) { long test_InterlockedOr_nf(long volatile *value, long mask) { return _InterlockedOr_nf(value, mask); } -// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedOr_nf(ptr{{[a-z_ ]*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedOr_nf(ptr{{.*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{ // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw or ptr %value, i32 %mask monotonic, align 4 // CHECK-ARM-ARM64: ret i32 [[RESULT:%[0-9]+]] // CHECK-ARM-ARM64: } @@ -1087,7 +1087,7 @@ long test_InterlockedOr_nf(long volatile *value, long mask) { __int64 test_InterlockedOr64_acq(__int64 volatile *value, __int64 mask) { return _InterlockedOr64_acq(value, mask); } -// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedOr64_acq(ptr{{[a-z_ ]*}}%value, i64{{[a-z_ ]*}}%mask){{.*}}{ +// CHECK-ARM-ARM64: define{{.*}}i64 
@test_InterlockedOr64_acq(ptr{{.*}}%value, i64{{[a-z_ ]*}}%mask){{.*}}{
 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw or ptr %value, i64 %mask acquire, align 8
 // CHECK-ARM-ARM64: ret i64 [[RESULT:%[0-9]+]]
 // CHECK-ARM-ARM64: }
@@ -1095,7 +1095,7 @@ __int64 test_InterlockedOr64_acq(__int64 volatile *value, __int64 mask) {
 __int64 test_InterlockedOr64_rel(__int64 volatile *value, __int64 mask) {
   return _InterlockedOr64_rel(value, mask);
 }
-// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedOr64_rel(ptr{{[a-z_ ]*}}%value, i64{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedOr64_rel(ptr{{.*}}%value, i64{{[a-z_ ]*}}%mask){{.*}}{
 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw or ptr %value, i64 %mask release, align 8
 // CHECK-ARM-ARM64: ret i64 [[RESULT:%[0-9]+]]
 // CHECK-ARM-ARM64: }
@@ -1103,7 +1103,7 @@ __int64 test_InterlockedOr64_rel(__int64 volatile *value, __int64 mask) {
 __int64 test_InterlockedOr64_nf(__int64 volatile *value, __int64 mask) {
   return _InterlockedOr64_nf(value, mask);
 }
-// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedOr64_nf(ptr{{[a-z_ ]*}}%value, i64{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedOr64_nf(ptr{{.*}}%value, i64{{[a-z_ ]*}}%mask){{.*}}{
 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw or ptr %value, i64 %mask monotonic, align 8
 // CHECK-ARM-ARM64: ret i64 [[RESULT:%[0-9]+]]
 // CHECK-ARM-ARM64: }
@@ -1111,7 +1111,7 @@ __int64 test_InterlockedOr64_nf(__int64 volatile *value, __int64 mask) {
 char test_InterlockedXor8_acq(char volatile *value, char mask) {
   return _InterlockedXor8_acq(value, mask);
 }
-// CHECK-ARM-ARM64: define{{.*}}i8 @test_InterlockedXor8_acq(ptr{{[a-z_ ]*}}%value, i8{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}i8 @test_InterlockedXor8_acq(ptr{{.*}}%value, i8{{[a-z_ ]*}}%mask){{.*}}{
 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw xor ptr %value, i8 %mask acquire, align 1
 // CHECK-ARM-ARM64: ret i8 [[RESULT:%[0-9]+]]
 // CHECK-ARM-ARM64: }
@@ -1119,7 +1119,7 @@ char test_InterlockedXor8_acq(char volatile *value, char mask) {
 char test_InterlockedXor8_rel(char volatile *value, char mask) {
   return _InterlockedXor8_rel(value, mask);
 }
-// CHECK-ARM-ARM64: define{{.*}}i8 @test_InterlockedXor8_rel(ptr{{[a-z_ ]*}}%value, i8{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}i8 @test_InterlockedXor8_rel(ptr{{.*}}%value, i8{{[a-z_ ]*}}%mask){{.*}}{
 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw xor ptr %value, i8 %mask release, align 1
 // CHECK-ARM-ARM64: ret i8 [[RESULT:%[0-9]+]]
 // CHECK-ARM-ARM64: }
@@ -1127,7 +1127,7 @@ char test_InterlockedXor8_rel(char volatile *value, char mask) {
 char test_InterlockedXor8_nf(char volatile *value, char mask) {
   return _InterlockedXor8_nf(value, mask);
 }
-// CHECK-ARM-ARM64: define{{.*}}i8 @test_InterlockedXor8_nf(ptr{{[a-z_ ]*}}%value, i8{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}i8 @test_InterlockedXor8_nf(ptr{{.*}}%value, i8{{[a-z_ ]*}}%mask){{.*}}{
 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw xor ptr %value, i8 %mask monotonic, align 1
 // CHECK-ARM-ARM64: ret i8 [[RESULT:%[0-9]+]]
 // CHECK-ARM-ARM64: }
@@ -1135,7 +1135,7 @@ char test_InterlockedXor8_nf(char volatile *value, char mask) {
 short test_InterlockedXor16_acq(short volatile *value, short mask) {
   return _InterlockedXor16_acq(value, mask);
 }
-// CHECK-ARM-ARM64: define{{.*}}i16 @test_InterlockedXor16_acq(ptr{{[a-z_ ]*}}%value, i16{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}i16 @test_InterlockedXor16_acq(ptr{{.*}}%value, i16{{[a-z_ ]*}}%mask){{.*}}{
 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw xor ptr %value, i16 %mask acquire, align 2
 // CHECK-ARM-ARM64: ret i16 [[RESULT:%[0-9]+]]
 // CHECK-ARM-ARM64: }
@@ -1143,7 +1143,7 @@ short test_InterlockedXor16_acq(short volatile *value, short mask) {
 short test_InterlockedXor16_rel(short volatile *value, short mask) {
   return _InterlockedXor16_rel(value, mask);
 }
-// CHECK-ARM-ARM64: define{{.*}}i16 @test_InterlockedXor16_rel(ptr{{[a-z_ ]*}}%value, i16{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}i16 @test_InterlockedXor16_rel(ptr{{.*}}%value, i16{{[a-z_ ]*}}%mask){{.*}}{
 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw xor ptr %value, i16 %mask release, align 2
 // CHECK-ARM-ARM64: ret i16 [[RESULT:%[0-9]+]]
 // CHECK-ARM-ARM64: }
@@ -1151,7 +1151,7 @@ short test_InterlockedXor16_rel(short volatile *value, short mask) {
 short test_InterlockedXor16_nf(short volatile *value, short mask) {
   return _InterlockedXor16_nf(value, mask);
 }
-// CHECK-ARM-ARM64: define{{.*}}i16 @test_InterlockedXor16_nf(ptr{{[a-z_ ]*}}%value, i16{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}i16 @test_InterlockedXor16_nf(ptr{{.*}}%value, i16{{[a-z_ ]*}}%mask){{.*}}{
 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw xor ptr %value, i16 %mask monotonic, align 2
 // CHECK-ARM-ARM64: ret i16 [[RESULT:%[0-9]+]]
 // CHECK-ARM-ARM64: }
@@ -1159,7 +1159,7 @@ short test_InterlockedXor16_nf(short volatile *value, short mask) {
 long test_InterlockedXor_acq(long volatile *value, long mask) {
   return _InterlockedXor_acq(value, mask);
 }
-// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedXor_acq(ptr{{[a-z_ ]*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedXor_acq(ptr{{.*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw xor ptr %value, i32 %mask acquire, align 4
 // CHECK-ARM-ARM64: ret i32 [[RESULT:%[0-9]+]]
 // CHECK-ARM-ARM64: }
@@ -1167,7 +1167,7 @@ long test_InterlockedXor_acq(long volatile *value, long mask) {
 long test_InterlockedXor_rel(long volatile *value, long mask) {
   return _InterlockedXor_rel(value, mask);
 }
-// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedXor_rel(ptr{{[a-z_ ]*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedXor_rel(ptr{{.*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw xor ptr %value, i32 %mask release, align 4
 // CHECK-ARM-ARM64: ret i32 [[RESULT:%[0-9]+]]
 // CHECK-ARM-ARM64: }
@@ -1175,7 +1175,7 @@ long test_InterlockedXor_rel(long volatile *value, long mask) {
 long test_InterlockedXor_nf(long volatile *value, long mask) {
   return _InterlockedXor_nf(value, mask);
 }
-// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedXor_nf(ptr{{[a-z_ ]*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedXor_nf(ptr{{.*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw xor ptr %value, i32 %mask monotonic, align 4
 // CHECK-ARM-ARM64: ret i32 [[RESULT:%[0-9]+]]
 // CHECK-ARM-ARM64: }
@@ -1183,7 +1183,7 @@ long test_InterlockedXor_nf(long volatile *value, long mask) {
 __int64 test_InterlockedXor64_acq(__int64 volatile *value, __int64 mask) {
   return _InterlockedXor64_acq(value, mask);
 }
-// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedXor64_acq(ptr{{[a-z_ ]*}}%value, i64{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedXor64_acq(ptr{{.*}}%value, i64{{[a-z_ ]*}}%mask){{.*}}{
 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw xor ptr %value, i64 %mask acquire, align 8
 // CHECK-ARM-ARM64: ret i64 [[RESULT:%[0-9]+]]
 // CHECK-ARM-ARM64: }
@@ -1191,7 +1191,7 @@ __int64 test_InterlockedXor64_acq(__int64 volatile *value, __int64 mask) {
 __int64 test_InterlockedXor64_rel(__int64 volatile *value, __int64 mask) {
   return _InterlockedXor64_rel(value, mask);
 }
-// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedXor64_rel(ptr{{[a-z_ ]*}}%value, i64{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedXor64_rel(ptr{{.*}}%value, i64{{[a-z_ ]*}}%mask){{.*}}{
 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw xor ptr %value, i64 %mask release, align 8
 // CHECK-ARM-ARM64: ret i64 [[RESULT:%[0-9]+]]
 // CHECK-ARM-ARM64: }
@@ -1199,7 +1199,7 @@ __int64 test_InterlockedXor64_rel(__int64 volatile *value, __int64 mask) {
 __int64 test_InterlockedXor64_nf(__int64 volatile *value, __int64 mask) {
   return _InterlockedXor64_nf(value, mask);
 }
-// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedXor64_nf(ptr{{[a-z_ ]*}}%value, i64{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedXor64_nf(ptr{{.*}}%value, i64{{[a-z_ ]*}}%mask){{.*}}{
 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw xor ptr %value, i64 %mask monotonic, align 8
 // CHECK-ARM-ARM64: ret i64 [[RESULT:%[0-9]+]]
 // CHECK-ARM-ARM64: }
@@ -1207,7 +1207,7 @@ __int64 test_InterlockedXor64_nf(__int64 volatile *value, __int64 mask) {
 char test_InterlockedAnd8_acq(char volatile *value, char mask) {
   return _InterlockedAnd8_acq(value, mask);
 }
-// CHECK-ARM-ARM64: define{{.*}}i8 @test_InterlockedAnd8_acq(ptr{{[a-z_ ]*}}%value, i8{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}i8 @test_InterlockedAnd8_acq(ptr{{.*}}%value, i8{{[a-z_ ]*}}%mask){{.*}}{
 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw and ptr %value, i8 %mask acquire, align 1
 // CHECK-ARM-ARM64: ret i8 [[RESULT:%[0-9]+]]
 // CHECK-ARM-ARM64: }
@@ -1215,7 +1215,7 @@ char test_InterlockedAnd8_acq(char volatile *value, char mask) {
 char test_InterlockedAnd8_rel(char volatile *value, char mask) {
   return _InterlockedAnd8_rel(value, mask);
 }
-// CHECK-ARM-ARM64: define{{.*}}i8 @test_InterlockedAnd8_rel(ptr{{[a-z_ ]*}}%value, i8{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}i8 @test_InterlockedAnd8_rel(ptr{{.*}}%value, i8{{[a-z_ ]*}}%mask){{.*}}{
 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw and ptr %value, i8 %mask release, align 1
 // CHECK-ARM-ARM64: ret i8 [[RESULT:%[0-9]+]]
 // CHECK-ARM-ARM64: }
@@ -1223,7 +1223,7 @@ char test_InterlockedAnd8_rel(char volatile *value, char mask) {
 char test_InterlockedAnd8_nf(char volatile *value, char mask) {
   return _InterlockedAnd8_nf(value, mask);
 }
-// CHECK-ARM-ARM64: define{{.*}}i8 @test_InterlockedAnd8_nf(ptr{{[a-z_ ]*}}%value, i8{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}i8 @test_InterlockedAnd8_nf(ptr{{.*}}%value, i8{{[a-z_ ]*}}%mask){{.*}}{
 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw and ptr %value, i8 %mask monotonic, align 1
 // CHECK-ARM-ARM64: ret i8 [[RESULT:%[0-9]+]]
 // CHECK-ARM-ARM64: }
@@ -1231,7 +1231,7 @@ char test_InterlockedAnd8_nf(char volatile *value, char mask) {
 short test_InterlockedAnd16_acq(short volatile *value, short mask) {
   return _InterlockedAnd16_acq(value, mask);
 }
-// CHECK-ARM-ARM64: define{{.*}}i16 @test_InterlockedAnd16_acq(ptr{{[a-z_ ]*}}%value, i16{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}i16 @test_InterlockedAnd16_acq(ptr{{.*}}%value, i16{{[a-z_ ]*}}%mask){{.*}}{
 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw and ptr %value, i16 %mask acquire, align 2
 // CHECK-ARM-ARM64: ret i16 [[RESULT:%[0-9]+]]
 // CHECK-ARM-ARM64: }
@@ -1239,7 +1239,7 @@ short test_InterlockedAnd16_acq(short volatile *value, short mask) {
 short test_InterlockedAnd16_rel(short volatile *value, short mask) {
   return _InterlockedAnd16_rel(value, mask);
 }
-// CHECK-ARM-ARM64: define{{.*}}i16 @test_InterlockedAnd16_rel(ptr{{[a-z_ ]*}}%value, i16{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}i16 @test_InterlockedAnd16_rel(ptr{{.*}}%value, i16{{[a-z_ ]*}}%mask){{.*}}{
 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw and ptr %value, i16 %mask release, align 2
 // CHECK-ARM-ARM64: ret i16 [[RESULT:%[0-9]+]]
 // CHECK-ARM-ARM64: }
@@ -1247,7 +1247,7 @@ short test_InterlockedAnd16_rel(short volatile *value, short mask) {
 short test_InterlockedAnd16_nf(short volatile *value, short mask) {
   return _InterlockedAnd16_nf(value, mask);
 }
-// CHECK-ARM-ARM64: define{{.*}}i16 @test_InterlockedAnd16_nf(ptr{{[a-z_ ]*}}%value, i16{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}i16 @test_InterlockedAnd16_nf(ptr{{.*}}%value, i16{{[a-z_ ]*}}%mask){{.*}}{
 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw and ptr %value, i16 %mask monotonic, align 2
 // CHECK-ARM-ARM64: ret i16 [[RESULT:%[0-9]+]]
 // CHECK-ARM-ARM64: }
@@ -1255,7 +1255,7 @@ short test_InterlockedAnd16_nf(short volatile *value, short mask) {
 long test_InterlockedAnd_acq(long volatile *value, long mask) {
   return _InterlockedAnd_acq(value, mask);
 }
-// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedAnd_acq(ptr{{[a-z_ ]*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedAnd_acq(ptr{{.*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw and ptr %value, i32 %mask acquire, align 4
 // CHECK-ARM-ARM64: ret i32 [[RESULT:%[0-9]+]]
 // CHECK-ARM-ARM64: }
@@ -1263,7 +1263,7 @@ long test_InterlockedAnd_acq(long volatile *value, long mask) {
 long test_InterlockedAnd_rel(long volatile *value, long mask) {
   return _InterlockedAnd_rel(value, mask);
 }
-// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedAnd_rel(ptr{{[a-z_ ]*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedAnd_rel(ptr{{.*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw and ptr %value, i32 %mask release, align 4
 // CHECK-ARM-ARM64: ret i32 [[RESULT:%[0-9]+]]
 // CHECK-ARM-ARM64: }
@@ -1271,7 +1271,7 @@ long test_InterlockedAnd_rel(long volatile *value, long mask) {
 long test_InterlockedAnd_nf(long volatile *value, long mask) {
   return _InterlockedAnd_nf(value, mask);
 }
-// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedAnd_nf(ptr{{[a-z_ ]*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedAnd_nf(ptr{{.*}}%value, i32{{[a-z_ ]*}}%mask){{.*}}{
 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw and ptr %value, i32 %mask monotonic, align 4
 // CHECK-ARM-ARM64: ret i32 [[RESULT:%[0-9]+]]
 // CHECK-ARM-ARM64: }
@@ -1279,7 +1279,7 @@ long test_InterlockedAnd_nf(long volatile *value, long mask) {
 __int64 test_InterlockedAnd64_acq(__int64 volatile *value, __int64 mask) {
   return _InterlockedAnd64_acq(value, mask);
 }
-// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedAnd64_acq(ptr{{[a-z_ ]*}}%value, i64{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedAnd64_acq(ptr{{.*}}%value, i64{{[a-z_ ]*}}%mask){{.*}}{
 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw and ptr %value, i64 %mask acquire, align 8
 // CHECK-ARM-ARM64: ret i64 [[RESULT:%[0-9]+]]
 // CHECK-ARM-ARM64: }
@@ -1287,7 +1287,7 @@ __int64 test_InterlockedAnd64_acq(__int64 volatile *value, __int64 mask) {
 __int64 test_InterlockedAnd64_rel(__int64 volatile *value, __int64 mask) {
   return _InterlockedAnd64_rel(value, mask);
 }
-// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedAnd64_rel(ptr{{[a-z_ ]*}}%value, i64{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedAnd64_rel(ptr{{.*}}%value, i64{{[a-z_ ]*}}%mask){{.*}}{
 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw and ptr %value, i64 %mask release, align 8
 // CHECK-ARM-ARM64: ret i64 [[RESULT:%[0-9]+]]
 // CHECK-ARM-ARM64: }
@@ -1295,7 +1295,7 @@ __int64 test_InterlockedAnd64_rel(__int64 volatile *value, __int64 mask) {
 __int64 test_InterlockedAnd64_nf(__int64 volatile *value, __int64 mask) {
   return _InterlockedAnd64_nf(value, mask);
 }
-// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedAnd64_nf(ptr{{[a-z_ ]*}}%value, i64{{[a-z_ ]*}}%mask){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedAnd64_nf(ptr{{.*}}%value, i64{{[a-z_ ]*}}%mask){{.*}}{
 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = atomicrmw and ptr %value, i64 %mask monotonic, align 8
 // CHECK-ARM-ARM64: ret i64 [[RESULT:%[0-9]+]]
 // CHECK-ARM-ARM64: }
@@ -1303,7 +1303,7 @@ __int64 test_InterlockedAnd64_nf(__int64 volatile *value, __int64 mask) {
 short test_InterlockedIncrement16_acq(short volatile *Addend) {
   return _InterlockedIncrement16_acq(Addend);
 }
-// CHECK-ARM-ARM64: define{{.*}}i16 @test_InterlockedIncrement16_acq(ptr{{[a-z_ ]*}}%Addend){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}i16 @test_InterlockedIncrement16_acq(ptr{{.*}}%Addend){{.*}}{
 // CHECK-ARM-ARM64: [[TMP:%[0-9]+]] = atomicrmw add ptr %Addend, i16 1 acquire, align 2
 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = add i16 [[TMP]], 1
 // CHECK-ARM-ARM64: ret i16 [[RESULT]]
@@ -1312,7 +1312,7 @@ short test_InterlockedIncrement16_acq(short volatile *Addend) {
 short test_InterlockedIncrement16_rel(short volatile *Addend) {
   return _InterlockedIncrement16_rel(Addend);
 }
-// CHECK-ARM-ARM64: define{{.*}}i16 @test_InterlockedIncrement16_rel(ptr{{[a-z_ ]*}}%Addend){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}i16 @test_InterlockedIncrement16_rel(ptr{{.*}}%Addend){{.*}}{
 // CHECK-ARM-ARM64: [[TMP:%[0-9]+]] = atomicrmw add ptr %Addend, i16 1 release, align 2
 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = add i16 [[TMP]], 1
 // CHECK-ARM-ARM64: ret i16 [[RESULT]]
@@ -1321,7 +1321,7 @@ short test_InterlockedIncrement16_rel(short volatile *Addend) {
 short test_InterlockedIncrement16_nf(short volatile *Addend) {
   return _InterlockedIncrement16_nf(Addend);
 }
-// CHECK-ARM-ARM64: define{{.*}}i16 @test_InterlockedIncrement16_nf(ptr{{[a-z_ ]*}}%Addend){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}i16 @test_InterlockedIncrement16_nf(ptr{{.*}}%Addend){{.*}}{
 // CHECK-ARM-ARM64: [[TMP:%[0-9]+]] = atomicrmw add ptr %Addend, i16 1 monotonic, align 2
 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = add i16 [[TMP]], 1
 // CHECK-ARM-ARM64: ret i16 [[RESULT]]
@@ -1330,7 +1330,7 @@ short test_InterlockedIncrement16_nf(short volatile *Addend) {
 long test_InterlockedIncrement_acq(long volatile *Addend) {
   return _InterlockedIncrement_acq(Addend);
 }
-// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedIncrement_acq(ptr{{[a-z_ ]*}}%Addend){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedIncrement_acq(ptr{{.*}}%Addend){{.*}}{
 // CHECK-ARM-ARM64: [[TMP:%[0-9]+]] = atomicrmw add ptr %Addend, i32 1 acquire, align 4
 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = add i32 [[TMP]], 1
 // CHECK-ARM-ARM64: ret i32 [[RESULT]]
@@ -1339,7 +1339,7 @@ long test_InterlockedIncrement_acq(long volatile *Addend) {
 long test_InterlockedIncrement_rel(long volatile *Addend) {
   return _InterlockedIncrement_rel(Addend);
 }
-// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedIncrement_rel(ptr{{[a-z_ ]*}}%Addend){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedIncrement_rel(ptr{{.*}}%Addend){{.*}}{
 // CHECK-ARM-ARM64: [[TMP:%[0-9]+]] = atomicrmw add ptr %Addend, i32 1 release, align 4
 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = add i32 [[TMP]], 1
 // CHECK-ARM-ARM64: ret i32 [[RESULT]]
@@ -1348,7 +1348,7 @@ long test_InterlockedIncrement_rel(long volatile *Addend) {
 long test_InterlockedIncrement_nf(long volatile *Addend) {
   return _InterlockedIncrement_nf(Addend);
 }
-// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedIncrement_nf(ptr{{[a-z_ ]*}}%Addend){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedIncrement_nf(ptr{{.*}}%Addend){{.*}}{
 // CHECK-ARM-ARM64: [[TMP:%[0-9]+]] = atomicrmw add ptr %Addend, i32 1 monotonic, align 4
 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = add i32 [[TMP]], 1
 // CHECK-ARM-ARM64: ret i32 [[RESULT]]
@@ -1357,7 +1357,7 @@ long test_InterlockedIncrement_nf(long volatile *Addend) {
 __int64 test_InterlockedIncrement64_acq(__int64 volatile *Addend) {
   return _InterlockedIncrement64_acq(Addend);
 }
-// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedIncrement64_acq(ptr{{[a-z_ ]*}}%Addend){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedIncrement64_acq(ptr{{.*}}%Addend){{.*}}{
 // CHECK-ARM-ARM64: [[TMP:%[0-9]+]] = atomicrmw add ptr %Addend, i64 1 acquire, align 8
 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = add i64 [[TMP]], 1
 // CHECK-ARM-ARM64: ret i64 [[RESULT]]
@@ -1366,7 +1366,7 @@ __int64 test_InterlockedIncrement64_acq(__int64 volatile *Addend) {
 __int64 test_InterlockedIncrement64_rel(__int64 volatile *Addend) {
   return _InterlockedIncrement64_rel(Addend);
 }
-// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedIncrement64_rel(ptr{{[a-z_ ]*}}%Addend){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedIncrement64_rel(ptr{{.*}}%Addend){{.*}}{
 // CHECK-ARM-ARM64: [[TMP:%[0-9]+]] = atomicrmw add ptr %Addend, i64 1 release, align 8
 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = add i64 [[TMP]], 1
 // CHECK-ARM-ARM64: ret i64 [[RESULT]]
@@ -1375,7 +1375,7 @@ __int64 test_InterlockedIncrement64_rel(__int64 volatile *Addend) {
 __int64 test_InterlockedIncrement64_nf(__int64 volatile *Addend) {
   return _InterlockedIncrement64_nf(Addend);
 }
-// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedIncrement64_nf(ptr{{[a-z_ ]*}}%Addend){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedIncrement64_nf(ptr{{.*}}%Addend){{.*}}{
 // CHECK-ARM-ARM64: [[TMP:%[0-9]+]] = atomicrmw add ptr %Addend, i64 1 monotonic, align 8
 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = add i64 [[TMP]], 1
 // CHECK-ARM-ARM64: ret i64 [[RESULT]]
@@ -1384,7 +1384,7 @@ __int64 test_InterlockedIncrement64_nf(__int64 volatile *Addend) {
 short test_InterlockedDecrement16_acq(short volatile *Addend) {
   return _InterlockedDecrement16_acq(Addend);
 }
-// CHECK-ARM-ARM64: define{{.*}}i16 @test_InterlockedDecrement16_acq(ptr{{[a-z_ ]*}}%Addend){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}i16 @test_InterlockedDecrement16_acq(ptr{{.*}}%Addend){{.*}}{
 // CHECK-ARM-ARM64: [[TMP:%[0-9]+]] = atomicrmw sub ptr %Addend, i16 1 acquire, align 2
 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = add i16 [[TMP]], -1
 // CHECK-ARM-ARM64: ret i16 [[RESULT]]
@@ -1393,7 +1393,7 @@ short test_InterlockedDecrement16_acq(short volatile *Addend) {
 short test_InterlockedDecrement16_rel(short volatile *Addend) {
   return _InterlockedDecrement16_rel(Addend);
 }
-// CHECK-ARM-ARM64: define{{.*}}i16 @test_InterlockedDecrement16_rel(ptr{{[a-z_ ]*}}%Addend){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}i16 @test_InterlockedDecrement16_rel(ptr{{.*}}%Addend){{.*}}{
 // CHECK-ARM-ARM64: [[TMP:%[0-9]+]] = atomicrmw sub ptr %Addend, i16 1 release, align 2
 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = add i16 [[TMP]], -1
 // CHECK-ARM-ARM64: ret i16 [[RESULT]]
@@ -1402,7 +1402,7 @@ short test_InterlockedDecrement16_rel(short volatile *Addend) {
 short test_InterlockedDecrement16_nf(short volatile *Addend) {
   return _InterlockedDecrement16_nf(Addend);
 }
-// CHECK-ARM-ARM64: define{{.*}}i16 @test_InterlockedDecrement16_nf(ptr{{[a-z_ ]*}}%Addend){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}i16 @test_InterlockedDecrement16_nf(ptr{{.*}}%Addend){{.*}}{
 // CHECK-ARM-ARM64: [[TMP:%[0-9]+]] = atomicrmw sub ptr %Addend, i16 1 monotonic, align 2
 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = add i16 [[TMP]], -1
 // CHECK-ARM-ARM64: ret i16 [[RESULT]]
@@ -1411,7 +1411,7 @@ short test_InterlockedDecrement16_nf(short volatile *Addend) {
 long test_InterlockedDecrement_acq(long volatile *Addend) {
   return _InterlockedDecrement_acq(Addend);
 }
-// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedDecrement_acq(ptr{{[a-z_ ]*}}%Addend){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedDecrement_acq(ptr{{.*}}%Addend){{.*}}{
 // CHECK-ARM-ARM64: [[TMP:%[0-9]+]] = atomicrmw sub ptr %Addend, i32 1 acquire, align 4
 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = add i32 [[TMP]], -1
 // CHECK-ARM-ARM64: ret i32 [[RESULT]]
@@ -1420,7 +1420,7 @@ long test_InterlockedDecrement_acq(long volatile *Addend) {
 long test_InterlockedDecrement_rel(long volatile *Addend) {
   return _InterlockedDecrement_rel(Addend);
 }
-// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedDecrement_rel(ptr{{[a-z_ ]*}}%Addend){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedDecrement_rel(ptr{{.*}}%Addend){{.*}}{
 // CHECK-ARM-ARM64: [[TMP:%[0-9]+]] = atomicrmw sub ptr %Addend, i32 1 release, align 4
 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = add i32 [[TMP]], -1
 // CHECK-ARM-ARM64: ret i32 [[RESULT]]
@@ -1429,7 +1429,7 @@ long test_InterlockedDecrement_rel(long volatile *Addend) {
 long test_InterlockedDecrement_nf(long volatile *Addend) {
   return _InterlockedDecrement_nf(Addend);
 }
-// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedDecrement_nf(ptr{{[a-z_ ]*}}%Addend){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}i32 @test_InterlockedDecrement_nf(ptr{{.*}}%Addend){{.*}}{
 // CHECK-ARM-ARM64: [[TMP:%[0-9]+]] = atomicrmw sub ptr %Addend, i32 1 monotonic, align 4
 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = add i32 [[TMP]], -1
 // CHECK-ARM-ARM64: ret i32 [[RESULT]]
@@ -1438,7 +1438,7 @@ long test_InterlockedDecrement_nf(long volatile *Addend) {
 __int64 test_InterlockedDecrement64_acq(__int64 volatile *Addend) {
   return _InterlockedDecrement64_acq(Addend);
 }
-// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedDecrement64_acq(ptr{{[a-z_ ]*}}%Addend){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedDecrement64_acq(ptr{{.*}}%Addend){{.*}}{
 // CHECK-ARM-ARM64: [[TMP:%[0-9]+]] = atomicrmw sub ptr %Addend, i64 1 acquire, align 8
 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = add i64 [[TMP]], -1
 // CHECK-ARM-ARM64: ret i64 [[RESULT]]
@@ -1447,7 +1447,7 @@ __int64 test_InterlockedDecrement64_acq(__int64 volatile *Addend) {
 __int64 test_InterlockedDecrement64_rel(__int64 volatile *Addend) {
   return _InterlockedDecrement64_rel(Addend);
 }
-// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedDecrement64_rel(ptr{{[a-z_ ]*}}%Addend){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedDecrement64_rel(ptr{{.*}}%Addend){{.*}}{
 // CHECK-ARM-ARM64: [[TMP:%[0-9]+]] = atomicrmw sub ptr %Addend, i64 1 release, align 8
 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = add i64 [[TMP]], -1
 // CHECK-ARM-ARM64: ret i64 [[RESULT]]
@@ -1456,7 +1456,7 @@ __int64 test_InterlockedDecrement64_rel(__int64 volatile *Addend) {
 __int64 test_InterlockedDecrement64_nf(__int64 volatile *Addend) {
   return _InterlockedDecrement64_nf(Addend);
 }
-// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedDecrement64_nf(ptr{{[a-z_ ]*}}%Addend){{.*}}{
+// CHECK-ARM-ARM64: define{{.*}}i64 @test_InterlockedDecrement64_nf(ptr{{.*}}%Addend){{.*}}{
 // CHECK-ARM-ARM64: [[TMP:%[0-9]+]] = atomicrmw sub ptr %Addend, i64 1 monotonic, align 8
 // CHECK-ARM-ARM64: [[RESULT:%[0-9]+]] = add i64 [[TMP]], -1
 // CHECK-ARM-ARM64: ret i64 [[RESULT]]
diff --git clang/test/CodeGen/msan-param-retval.c clang/test/CodeGen/msan-param-retval.c
index 269a759fac10..439ae98799c6 100644
--- clang/test/CodeGen/msan-param-retval.c
+++ clang/test/CodeGen/msan-param-retval.c
@@ -1,12 +1,12 @@
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -fsanitize=memory -no-enable-noundef-analysis -o - %s | \
+// RUN: %clang_cc1 -Wno-error=return-type -triple x86_64-linux-gnu -emit-llvm -fsanitize=memory -no-enable-noundef-analysis -o - %s | \
 // RUN:     FileCheck %s --check-prefixes=CLEAN,CHECK
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -fsanitize=memory -fno-sanitize-memory-param-retval -o - %s | \
+// RUN: %clang_cc1 -Wno-error=return-type -triple x86_64-linux-gnu -emit-llvm -fsanitize=memory -fno-sanitize-memory-param-retval -o - %s | \
 // RUN:     FileCheck %s --check-prefixes=NOUNDEF,NOUNDEF_ONLY,CHECK
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -fsanitize=memory -mllvm -msan-eager-checks -o - %s | \
+// RUN: %clang_cc1 -Wno-error=return-type -triple x86_64-linux-gnu -emit-llvm -fsanitize=memory -mllvm -msan-eager-checks -o - %s | \
 // RUN:     FileCheck %s --check-prefixes=NOUNDEF,EAGER,CHECK
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -fsanitize=memory -no-enable-noundef-analysis -fsanitize-memory-param-retval -o - %s | \
+// RUN: %clang_cc1 -Wno-error=return-type -triple x86_64-linux-gnu -emit-llvm -fsanitize=memory -no-enable-noundef-analysis -fsanitize-memory-param-retval -o - %s | \
 // RUN:     FileCheck %s --check-prefixes=CLEAN,CHECK
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -fsanitize=memory -o - %s | \
+// RUN: %clang_cc1 -Wno-error=return-type -triple x86_64-linux-gnu -emit-llvm -fsanitize=memory -o - %s | \
 // RUN:     FileCheck %s --check-prefixes=NOUNDEF,EAGER,CHECK
 
 void bar(int x) {
diff --git clang/test/CodeGen/msan-param-retval.cpp clang/test/CodeGen/msan-param-retval.cpp
index c4960a4702f6..caebb38fa50c 100644
--- clang/test/CodeGen/msan-param-retval.cpp
+++ clang/test/CodeGen/msan-param-retval.cpp
@@ -1,12 +1,12 @@
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -fsanitize=memory -no-enable-noundef-analysis -o - %s | \
+// RUN: %clang_cc1 -Wno-error=return-type -triple x86_64-linux-gnu -emit-llvm -fsanitize=memory -no-enable-noundef-analysis -o - %s | \
 // RUN:     FileCheck %s --check-prefixes=CLEAN,CHECK
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -fsanitize=memory -fno-sanitize-memory-param-retval -o - %s | \
+// RUN: %clang_cc1 -Wno-error=return-type -triple x86_64-linux-gnu -emit-llvm -fsanitize=memory -fno-sanitize-memory-param-retval -o - %s | \
 // RUN:     FileCheck %s --check-prefixes=NOUNDEF,NOUNDEF_ONLY,CHECK
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -fsanitize=memory -mllvm -msan-eager-checks -o - %s | \
+// RUN: %clang_cc1 -Wno-error=return-type -triple x86_64-linux-gnu -emit-llvm -fsanitize=memory -mllvm -msan-eager-checks -o - %s | \
 // RUN:     FileCheck %s --check-prefixes=NOUNDEF,EAGER,CHECK
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -fsanitize=memory -no-enable-noundef-analysis -fsanitize-memory-param-retval -o - %s | \
+// RUN: %clang_cc1 -Wno-error=return-type -triple x86_64-linux-gnu -emit-llvm -fsanitize=memory -no-enable-noundef-analysis -fsanitize-memory-param-retval -o - %s | \
 // RUN:     FileCheck %s --check-prefixes=CLEAN,CHECK
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -fsanitize=memory -o - %s | \
+// RUN: %clang_cc1 -Wno-error=return-type -triple x86_64-linux-gnu -emit-llvm -fsanitize=memory -o - %s | \
 // RUN:     FileCheck %s --check-prefixes=NOUNDEF,EAGER,CHECK
 
 void bar(int x) {
diff --git clang/test/CodeGen/object-size.c clang/test/CodeGen/object-size.c
index 58561a5470f7..2a3dfc8f4b12 100644
--- clang/test/CodeGen/object-size.c
+++ clang/test/CodeGen/object-size.c
@@ -592,7 +592,7 @@ void PR30346(void) {
 
 extern char incomplete_char_array[];
 // CHECK-LABEL: @incomplete_and_function_types
-int incomplete_and_function_types(void) {
+void incomplete_and_function_types(void) {
   // CHECK: call i64 @llvm.objectsize.i64.p0
   gi = OBJECT_SIZE_BUILTIN(incomplete_char_array, 0);
   // CHECK: call i64 @llvm.objectsize.i64.p0
diff --git clang/test/CodeGen/pragma-comment.c clang/test/CodeGen/pragma-comment.c
index a4746f5c47bf..a966840f7c26 100644
--- clang/test/CodeGen/pragma-comment.c
+++ clang/test/CodeGen/pragma-comment.c
@@ -6,6 +6,7 @@
 // RUN: %clang_cc1 %s -triple x86_64-scei-ps4 -fms-extensions -emit-llvm -o - | FileCheck -check-prefix ELF %s --implicit-check-not llvm.linker.options
 // RUN: %clang_cc1 %s -triple x86_64-sie-ps5 -fms-extensions -emit-llvm -o - | FileCheck -check-prefix ELF %s --implicit-check-not llvm.linker.options
 // RUN: %clang_cc1 %s -triple aarch64-windows-msvc -fms-extensions -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -triple mipsel-windows-msvc -fms-extensions -emit-llvm -o - | FileCheck %s
 
 #pragma comment(lib, "msvcrt.lib")
 #pragma comment(lib, "kernel32")
diff --git clang/test/CodeGen/sanitize-metadata-nosanitize.c clang/test/CodeGen/sanitize-metadata-nosanitize.c
index da0c80914801..fd2fdce31b52 100644
--- clang/test/CodeGen/sanitize-metadata-nosanitize.c
+++ clang/test/CodeGen/sanitize-metadata-nosanitize.c
@@ -95,3 +95,20 @@ __attribute__((no_sanitize("all"))) int test_no_sanitize_all(int *x, int *y) {
 // CHECK: attributes #[[ATTR3]] = { mustprogress nofree norecurse nounwind willreturn memory(write, argmem: readwrite, inaccessiblemem: none) "min-legal-vector-width"="0" "no-trapping-math"="true" "no_sanitize_thread" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
 // CHECK: attributes #[[ATTR4:[0-9]+]] = { nounwind "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
 //.
+// CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// CHECK: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+// CHECK: [[META2]] = !{!"sanmd_covered2!C", [[META3:![0-9]+]]}
+// CHECK: [[META3]] = !{i64 0}
+// CHECK: [[META4]] = !{!"sanmd_covered2!C", [[META5:![0-9]+]]}
+// CHECK: [[META5]] = !{i64 3}
+// CHECK: [[TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0}
+// CHECK: [[META7]] = !{!"p1 int", [[META8:![0-9]+]], i64 0}
+// CHECK: [[META8]] = !{!"any pointer", [[META9:![0-9]+]], i64 0}
+// CHECK: [[META9]] = !{!"omnipotent char", [[META10:![0-9]+]], i64 0}
+// CHECK: [[META10]] = !{!"Simple C/C++ TBAA"}
+// CHECK: [[META11]] = !{!"sanmd_atomics2!C"}
+// CHECK: [[TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0}
+// CHECK: [[META13]] = !{!"int", [[META9]], i64 0}
+// CHECK: [[META14]] = !{!"sanmd_covered2!C", [[META15:![0-9]+]]}
+// CHECK: [[META15]] = !{i64 2}
+//.
diff --git clang/test/CodeGen/sparcv8-abi.c clang/test/CodeGen/sparcv8-abi.c
index 7a9371764327..c5faf130890f 100644
--- clang/test/CodeGen/sparcv8-abi.c
+++ clang/test/CodeGen/sparcv8-abi.c
@@ -4,16 +4,19 @@
 float __complex__
 p (float __complex__ a, float __complex__ b)
 {
+  return 0;
 }
 
 // CHECK-LABEL: define{{.*}} { double, double } @q(ptr noundef byval({ double, double }) align 8 %a, ptr noundef byval({ double, double }) align 8 %b) #0 {
 double __complex__
 q (double __complex__ a, double __complex__ b)
 {
+  return 0;
 }
 
 // CHECK-LABEL: define{{.*}} { i64, i64 } @r(ptr noundef byval({ i64, i64 }) align 8 %a, ptr noundef byval({ i64, i64 }) align 8 %b) #0 {
 long long __complex__
 r (long long __complex__ a, long long __complex__ b)
 {
+  return 0;
 }
diff --git clang/test/CodeGen/sret.c clang/test/CodeGen/sret.c
index 6d905e89b2c6..83dce80aa279 100644
--- clang/test/CodeGen/sret.c
+++ clang/test/CodeGen/sret.c
@@ -12,7 +12,7 @@ struct abc foo1(void);
 // CHECK-DAG: declare {{.*}} @foo1(ptr dead_on_unwind writable sret(%struct.abc)
 struct abc foo2();
 // CHECK-DAG: declare {{.*}} @foo2(ptr dead_on_unwind writable sret(%struct.abc)
-struct abc foo3(void){}
+struct abc foo3(void) { return (struct abc){0}; }
 // CHECK-DAG: define {{.*}} @foo3(ptr dead_on_unwind noalias writable sret(%struct.abc)
 
 void bar(void) {
diff --git clang/test/CodeGen/static-order.c clang/test/CodeGen/static-order.c
index dbd22db96b52..e128c2f74245 100644
--- clang/test/CodeGen/static-order.c
+++ clang/test/CodeGen/static-order.c
@@ -19,4 +19,5 @@ void *f(void)
 {
   if (a.a)
     return v;
+  return 0;
 }
diff --git clang/test/CodeGen/staticinit.c clang/test/CodeGen/staticinit.c
index 90b8fa5edb02..ec9b5b34d3ad 100644
--- clang/test/CodeGen/staticinit.c
+++ clang/test/CodeGen/staticinit.c
@@ -27,7 +27,7 @@ void foo(void) {
 }
 
 // CHECK: @f1.l0 = internal global i32 ptrtoint (ptr @f1 to i32)
-int f1(void) { static int l0 = (unsigned) f1; }
+void f1(void) { static int l0 = (unsigned) f1; }
 
 // PR7044
 char *f2(char key) {
diff --git clang/test/CodeGen/struct.c clang/test/CodeGen/struct.c
index c98357160add..0da9b748ea3b 100644
--- clang/test/CodeGen/struct.c
+++ clang/test/CodeGen/struct.c
@@ -136,7 +136,7 @@ struct a14 { short a; int b; } x = {1, 1};
 
 /* flexible array members */
 struct a15 {char a; int b[];} c15;
-int a16(void) {c15.a = 1;}
+void a16(void) {c15.a = 1;}
 
 /* compound literals */
 void f13(void) {
diff --git clang/test/CodeGen/ubsan-debuglog-return.c clang/test/CodeGen/ubsan-debuglog-return.c
index ecbebc34d223..0a96465d330e 100644
--- clang/test/CodeGen/ubsan-debuglog-return.c
+++ clang/test/CodeGen/ubsan-debuglog-return.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -x c -debug-info-kind=line-tables-only -emit-llvm -fsanitize=returns-nonnull-attribute -o - %s | FileCheck %s
+// RUN: %clang_cc1 -Wno-error=return-type -x c -debug-info-kind=line-tables-only -emit-llvm -fsanitize=returns-nonnull-attribute -o - %s | FileCheck %s
 // The UBSAN function call in the epilogue needs to have a debug location.
 
 __attribute__((returns_nonnull)) void *allocate(void) {}
diff --git clang/test/CodeGen/union.c clang/test/CodeGen/union.c
index 60e9e2d771ff..fb47d7140bed 100644
--- clang/test/CodeGen/union.c
+++ clang/test/CodeGen/union.c
@@ -34,7 +34,7 @@ void fS65(void) { enum E9 e = s65.a; }
 
 typedef union{ unsigned char x[65536]; } q;
-int qfunc(void) {q buf; unsigned char* x = buf.x;}
+void qfunc(void) {q buf; unsigned char* x = buf.x;}
 
 union RR {_Bool a : 1;} RRU;
 int RRF(void) {return RRU.a;}
diff --git clang/test/CodeGen/ve-abi.c clang/test/CodeGen/ve-abi.c
index 25ec3ed07b7d..4e73b7eec16c 100644
--- clang/test/CodeGen/ve-abi.c
+++ clang/test/CodeGen/ve-abi.c
@@ -52,6 +52,7 @@ unsigned long fun_zi64(unsigned long a, unsigned long b) {
 
 // CHECK-LABEL: define{{.*}} i128 @fun_si128(i128 noundef %a, i128 noundef %b) #0 {
 __int128 fun_si128(__int128 a, __int128 b) {
+  return a;
 }
 
 // CHECK-LABEL: define{{.*}} i128 @fun_zi128(i128 noundef %a, i128 noundef %b) #0 {
diff --git clang/test/CodeGenCXX/2007-01-06-PtrMethodInit.cpp clang/test/CodeGenCXX/2007-01-06-PtrMethodInit.cpp
index 37005c5e9df7..465f55f9f8ec 100644
--- clang/test/CodeGenCXX/2007-01-06-PtrMethodInit.cpp
+++ clang/test/CodeGenCXX/2007-01-06-PtrMethodInit.cpp
@@ -20,6 +20,8 @@ extern "C++" {
   static const nsIID & GetIID ()
   {
+    static const nsIID i = {};
+    return i;
   }
 };
 }
@@ -31,6 +33,8 @@ class nsIDOMEventListener:public nsISupports {
 public:static const nsIID & GetIID ()
   {
+    static const nsIID i = {};
+    return i;
   }
   virtual nsresult __attribute__ ((regparm (0),
 				   cdecl)) HandleEvent (nsIDOMEvent * event) =
@@ -42,6 +46,7 @@ public:static const nsIID & GetIID ()
   {
     static const nsIID iid = { };
+    return iid;
   }
   virtual nsresult __attribute__ ((regparm (0),
diff --git clang/test/CodeGenCXX/2007-09-10-RecursiveTypeResolution.cpp clang/test/CodeGenCXX/2007-09-10-RecursiveTypeResolution.cpp
index ec8a516c696b..b8d55ced2c2a 100644
--- clang/test/CodeGenCXX/2007-09-10-RecursiveTypeResolution.cpp
+++ clang/test/CodeGenCXX/2007-09-10-RecursiveTypeResolution.cpp
@@ -55,6 +55,7 @@ namespace Manta
       vector < _Tp, _Alloc > > iterator;
       iterator end ()
       {
+        return {};
       }
     };
   class MantaInterface
diff --git clang/test/CodeGenCXX/2007-10-01-StructResize.cpp clang/test/CodeGenCXX/2007-10-01-StructResize.cpp
index 8e5750d3c4ef..ce1886c2e503 100644
--- clang/test/CodeGenCXX/2007-10-01-StructResize.cpp
+++ clang/test/CodeGenCXX/2007-10-01-StructResize.cpp
@@ -7,7 +7,7 @@ struct Bork {
   unsigned int f2 : 30;
 };
 
-int Foo(Bork *hdr) {
+void Foo(Bork *hdr) {
   hdr->f1 = 7;
   hdr->f2 = 927;
 }
diff --git clang/test/CodeGenCXX/2009-08-11-VectorRetTy.cpp clang/test/CodeGenCXX/2009-08-11-VectorRetTy.cpp
index fe6f32a6969c..04b1067aabde 100644
--- clang/test/CodeGenCXX/2009-08-11-VectorRetTy.cpp
+++ clang/test/CodeGenCXX/2009-08-11-VectorRetTy.cpp
@@ -1,7 +1,7 @@
 // RUN: %clang_cc1 %s -emit-llvm -o /dev/null
 typedef void (*Func) ();
 typedef long long m64 __attribute__((__vector_size__(8), __may_alias__));
-static inline m64 __attribute__((__always_inline__, __nodebug__)) _mm_set1_pi16() {}
+static inline m64 __attribute__((__always_inline__, __nodebug__)) _mm_set1_pi16() { return {}; }
 template <class MM>
 static void Bork() {
   const m64 mmx_0x00ff = _mm_set1_pi16();
diff --git clang/test/CodeGenCXX/2010-07-23-DeclLoc.cpp clang/test/CodeGenCXX/2010-07-23-DeclLoc.cpp
index a88d605f565e..ef589e053797 100644
--- clang/test/CodeGenCXX/2010-07-23-DeclLoc.cpp
+++ clang/test/CodeGenCXX/2010-07-23-DeclLoc.cpp
@@ -16,7 +16,7 @@ namespace std {
   public:
     typedef _Tp element_type;
     auto_ptr(element_type* __p = 0) throw() : _M_ptr(__p) { }
-    element_type& operator*() const throw() { }
+    element_type& operator*() const throw() { return *_M_ptr; }
   };
 }
 class Pointer32 {
@@ -69,17 +69,17 @@ template <typename SIZE_AND_ENDIANNESS> void extract_dwarf_data_from_header(TExt
 TRawSymbolOwnerData<typename SIZE_AND_ENDIANNESS::SIZE>& symbol_owner_data,
 TAddressRelocator<typename SIZE_AND_ENDIANNESS::SIZE>* address_relocator) {}
 struct CSCppSymbolOwnerHashFunctor {
-  size_t operator()(const CSCppSymbolOwner& symbol_owner) const {
+  void operator()(const CSCppSymbolOwner& symbol_owner) const {
 # 97 "wrong_place_for_decl.cpp"
   }
 };
-template <typename SIZE_AND_ENDIANNESS> CSCppSymbolOwnerData* create_symbol_owner_data_arch_specific(CSCppSymbolOwner* symbol_owner, const char* dsym_path) {
+template <typename SIZE_AND_ENDIANNESS> void create_symbol_owner_data_arch_specific(CSCppSymbolOwner* symbol_owner, const char* dsym_path) {
   typedef typename SIZE_AND_ENDIANNESS::SIZE SIZE;
   std::auto_ptr< TRawSymbolOwnerData<SIZE> > data(new TRawSymbolOwnerData<SIZE>());
   std::auto_ptr< TExtendedMachOHeader<SIZE_AND_ENDIANNESS> > header;
   extract_dwarf_data_from_header(*header, *data, (TAddressRelocator<typename SIZE_AND_ENDIANNESS::SIZE>*)__null);
 }
-CSCppSymbolOwnerData* create_symbol_owner_data2(CSCppSymbolOwner* symbol_owner, const char* dsym_path) {
+void create_symbol_owner_data2(CSCppSymbolOwner* symbol_owner, const char* dsym_path) {
   create_symbol_owner_data_arch_specific< ISA32Little >(symbol_owner, dsym_path);
   create_symbol_owner_data_arch_specific< ISA32Big >(symbol_owner, dsym_path);
   create_symbol_owner_data_arch_specific< ISA64Little >(symbol_owner, dsym_path);
diff --git clang/test/CodeGenCXX/address-space-cast-coerce.cpp clang/test/CodeGenCXX/address-space-cast-coerce.cpp
index 1ad46042b6ef..ed575e14244f 100644
--- clang/test/CodeGenCXX/address-space-cast-coerce.cpp
+++ clang/test/CodeGenCXX/address-space-cast-coerce.cpp
@@ -41,7 +41,7 @@ template<typename T, unsigned int n>
 
 using char1 = my_vector_type<char, 1>;
 
-int mane() {
+void mane() {
 
     char1 f1{1};
    char1 f2{1};
diff --git clang/test/CodeGenCXX/array-value-initialize.cpp clang/test/CodeGenCXX/array-value-initialize.cpp
index 27607c1c7544..b3dc24871b49 100644
--- clang/test/CodeGenCXX/array-value-initialize.cpp
+++ clang/test/CodeGenCXX/array-value-initialize.cpp
@@ -22,7 +22,7 @@ struct Foo {
   S sbar_[5];
 };
 
-int test1(void) {
+void test1(void) {
   Foo a;
 }
diff --git clang/test/CodeGenCXX/attr.cpp clang/test/CodeGenCXX/attr.cpp
index 3890f2018c9a..78df87c6e088 100644
--- clang/test/CodeGenCXX/attr.cpp
+++ clang/test/CodeGenCXX/attr.cpp
@@ -4,7 +4,7 @@
 
 // CHECK: define{{.*}} i32 @_Z3foov() [[NUW:#[0-9]+]] align 1024
 int foo() __attribute__((aligned(1024)));
-int foo() { }
+int foo() { return 0; }
 
 class C {
   virtual void bar1() __attribute__((aligned(1)));
diff --git clang/test/CodeGenCXX/c-linkage.cpp clang/test/CodeGenCXX/c-linkage.cpp
index 0f4c3277253f..bc2c0c5380f4 100644
--- clang/test/CodeGenCXX/c-linkage.cpp
+++ clang/test/CodeGenCXX/c-linkage.cpp
@@ -29,5 +29,6 @@ extern "C" {
 struct test3_s {
 };
 bool operator==(const int& a, const test3_s& b) {
+  return false;
 }
 }
diff --git clang/test/CodeGenCXX/catch-undef-behavior.cpp clang/test/CodeGenCXX/catch-undef-behavior.cpp
index 419d1292551a..15feebe0c099 100644
--- clang/test/CodeGenCXX/catch-undef-behavior.cpp
+++ clang/test/CodeGenCXX/catch-undef-behavior.cpp
@@ -1,8 +1,8 @@
-// RUN: %clang_cc1 -no-enable-noundef-analysis -std=c++11 -fsanitize=signed-integer-overflow,integer-divide-by-zero,float-divide-by-zero,shift-base,shift-exponent,unreachable,return,vla-bound,alignment,null,vptr,object-size,float-cast-overflow,bool,enum,array-bounds,function -fsanitize-recover=signed-integer-overflow,integer-divide-by-zero,float-divide-by-zero,shift-base,shift-exponent,vla-bound,alignment,null,vptr,object-size,float-cast-overflow,bool,enum,array-bounds,function -emit-llvm %s -o - -triple x86_64-linux-gnu | FileCheck %s --check-prefixes=CHECK,CHECK-FUNCSAN
-// RUN: %clang_cc1 -no-enable-noundef-analysis -std=c++11 -fsanitize=vptr,address -fsanitize-recover=vptr,address -emit-llvm %s -o - -triple x86_64-linux-gnu | FileCheck %s --check-prefix=CHECK-ASAN
-// RUN: %clang_cc1 -no-enable-noundef-analysis -std=c++11 -fsanitize=vptr -fsanitize-recover=vptr -emit-llvm %s -o - -triple x86_64-linux-gnu | FileCheck %s --check-prefix=DOWNCAST-NULL
-// RUN: %clang_cc1 -no-enable-noundef-analysis -std=c++11 -fsanitize=function -emit-llvm %s -o - -triple x86_64-linux-gnux32 | FileCheck %s --check-prefix=CHECK-FUNCSAN
-// RUN: %clang_cc1 -no-enable-noundef-analysis -std=c++11 -fsanitize=function -emit-llvm %s -o - -triple i386-linux-gnu | FileCheck %s --check-prefix=CHECK-FUNCSAN
+// RUN: %clang_cc1 -no-enable-noundef-analysis -std=c++11 -Wno-error=return-type -fsanitize=signed-integer-overflow,integer-divide-by-zero,float-divide-by-zero,shift-base,shift-exponent,unreachable,return,vla-bound,alignment,null,vptr,object-size,float-cast-overflow,bool,enum,array-bounds,function -fsanitize-recover=signed-integer-overflow,integer-divide-by-zero,float-divide-by-zero,shift-base,shift-exponent,vla-bound,alignment,null,vptr,object-size,float-cast-overflow,bool,enum,array-bounds,function -emit-llvm %s -o - -triple x86_64-linux-gnu | FileCheck %s --check-prefixes=CHECK,CHECK-FUNCSAN
+// RUN: %clang_cc1 -no-enable-noundef-analysis -std=c++11 -Wno-error=return-type -fsanitize=vptr,address -fsanitize-recover=vptr,address -emit-llvm %s -o - -triple x86_64-linux-gnu | FileCheck %s --check-prefix=CHECK-ASAN
+// RUN: %clang_cc1 -no-enable-noundef-analysis -std=c++11 -Wno-error=return-type -fsanitize=vptr -fsanitize-recover=vptr -emit-llvm %s -o - -triple x86_64-linux-gnu | FileCheck %s --check-prefix=DOWNCAST-NULL
+// RUN: %clang_cc1 -no-enable-noundef-analysis -std=c++11 -Wno-error=return-type -fsanitize=function -emit-llvm %s -o - -triple x86_64-linux-gnux32 | FileCheck %s --check-prefix=CHECK-FUNCSAN
+// RUN: %clang_cc1 -no-enable-noundef-analysis -std=c++11 -Wno-error=return-type -fsanitize=function -emit-llvm %s -o - -triple i386-linux-gnu | FileCheck %s --check-prefix=CHECK-FUNCSAN
 
 struct S {
   double d;
diff --git clang/test/CodeGenCXX/conditional-gnu-ext.cpp clang/test/CodeGenCXX/conditional-gnu-ext.cpp
index 3d3d210f22f6..4411d4269b8c 100644
--- clang/test/CodeGenCXX/conditional-gnu-ext.cpp
+++ clang/test/CodeGenCXX/conditional-gnu-ext.cpp
@@ -11,7 +11,7 @@ void test0() {
 
 namespace radar8446940 {
 extern "C" void abort();
-int main () {
+void main () {
   char x[1];
   char *y = x ? : 0;
diff --git clang/test/CodeGenCXX/cxx1y-variable-template-linkage.cpp clang/test/CodeGenCXX/cxx1y-variable-template-linkage.cpp
index ac542870f1f4..7bc82eff6679 100644
--- clang/test/CodeGenCXX/cxx1y-variable-template-linkage.cpp
+++ clang/test/CodeGenCXX/cxx1y-variable-template-linkage.cpp
@@ -55,6 +55,9 @@ static const int &foo() {
      // CHECK-DAG: @_Z1tIKiE
      return t<const int>;
    }
+
+   static int x;
+   return x;
 }
diff --git clang/test/CodeGenCXX/debug-info-cxx0x.cpp clang/test/CodeGenCXX/debug-info-cxx0x.cpp
index 4c31f60c0d96..c21a0bb1ad5a 100644
--- clang/test/CodeGenCXX/debug-info-cxx0x.cpp
+++ clang/test/CodeGenCXX/debug-info-cxx0x.cpp
@@ -12,7 +12,7 @@ namespace PR13570 {
   template<typename T, typename U> struct P {};
   template<typename T> struct A {
     template<typename U> static P<T,U> isa(U);
-    decltype(isa(int())) f() {}
+    decltype(isa(int())) f() { return {}; }
   };
   template struct A<int>;
 }
diff --git clang/test/CodeGenCXX/debug-info-object-pointer.cpp clang/test/CodeGenCXX/debug-info-object-pointer.cpp
index 594d4da791ee..49079f599099 100644
--- clang/test/CodeGenCXX/debug-info-object-pointer.cpp
+++ clang/test/CodeGenCXX/debug-info-object-pointer.cpp
@@ -5,12 +5,11 @@
 // CHECK: !DIDerivedType(tag: DW_TAG_pointer_type
 // CHECK-SAME:           flags: DIFlagArtificial | DIFlagObjectPointer
 //
-// // FIXME: DIFlagObjectPointer not attached to the explicit object
-// // argument in the subprogram declaration.
 // CHECK: !DISubprogram(name: "explicit_this",
 //                      flags: DIFlagPrototyped
-// CHECK-NOT: DIFlagObjectPointer
-// CHECK-NOT: DIFlagArtificial
+//
+// CHECK: !DIDerivedType(tag: DW_TAG_rvalue_reference_type
+// CHECK-SAME:           flags: DIFlagObjectPointer)
 //
 // CHECK: !DILocalVariable(name: "this", arg: 1
 // CHECK-SAME:             flags: DIFlagArtificial | DIFlagObjectPointer
diff --git clang/test/CodeGenCXX/debug-info-scoped-class.cpp clang/test/CodeGenCXX/debug-info-scoped-class.cpp
index de4aee9a1b44..7424487df418 100644
--- clang/test/CodeGenCXX/debug-info-scoped-class.cpp
+++ clang/test/CodeGenCXX/debug-info-scoped-class.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -emit-llvm -debug-info-kind=standalone -std=c++11 \
+// RUN: %clang_cc1 -Wno-error=return-type -emit-llvm -debug-info-kind=standalone -std=c++11 \
 // RUN:   -triple thumbv7-apple-ios %s -o - | FileCheck %s
 
 // This forward-declared scoped enum will be created while building its own
diff --git clang/test/CodeGenCXX/debug-lambda-this.cpp clang/test/CodeGenCXX/debug-lambda-this.cpp
index eecbac6520ac..019d09c48f85 100644
--- clang/test/CodeGenCXX/debug-lambda-this.cpp
+++ clang/test/CodeGenCXX/debug-lambda-this.cpp
@@ -4,9 +4,9 @@ struct D {
   D();
   D(const D&);
   int x;
-  int d(int x);
+  void d(int x);
 };
-int D::d(int x) {
+void D::d(int x) {
   [=] {
     return this->x;
   }();
diff --git clang/test/CodeGenCXX/dynamic-cast-address-space.cpp clang/test/CodeGenCXX/dynamic-cast-address-space.cpp
index 271d9ede79d0..0460352cf7ff 100644
--- clang/test/CodeGenCXX/dynamic-cast-address-space.cpp
+++ clang/test/CodeGenCXX/dynamic-cast-address-space.cpp
@@ -112,9 +112,9 @@ const B& f(A *a) {
 // CHECK: attributes #[[ATTR3]] = { nounwind }
 // CHECK: attributes #[[ATTR4]] = { noreturn }
 //.
-// WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR0]] = { mustprogress noinline optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gws,+image-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32,+wavefrontsize64" } +// WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR0]] = { mustprogress noinline optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize32,+wavefrontsize64" } // WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR1:[0-9]+]] = { nounwind willreturn memory(read) } -// WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR2:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gws,+image-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32,+wavefrontsize64" } +// WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR2:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize32,+wavefrontsize64" } // WITH-NONZERO-DEFAULT-AS: attributes 
#[[ATTR3]] = { nounwind } // WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR4]] = { noreturn } //. diff --git clang/test/CodeGenCXX/expr.cpp clang/test/CodeGenCXX/expr.cpp index 33e8e63de2f4..d1af538d8c6c 100644 --- clang/test/CodeGenCXX/expr.cpp +++ clang/test/CodeGenCXX/expr.cpp @@ -24,7 +24,7 @@ int test3g = test3(__PRETTY_FUNCTION__); struct test4A { int j : 2; }; -int test4() { +void test4() { test4A a; (a.j = 2) = 3; } diff --git clang/test/CodeGenCXX/inline-then-fold-variadics.cpp clang/test/CodeGenCXX/inline-then-fold-variadics.cpp index 4aa79a28dd7d..855787731c8b 100644 --- clang/test/CodeGenCXX/inline-then-fold-variadics.cpp +++ clang/test/CodeGenCXX/inline-then-fold-variadics.cpp @@ -34,21 +34,21 @@ template <typename X, typename Y> static Y second(...) { extern "C" { // CHECK-LABEL: define {{[^@]+}}@first_pair_i32 -// CHECK-SAME: (i32 noundef returned [[X:%.*]], i32 noundef [[Y:%.*]]) +// CHECK-SAME: (i32 noundef returned [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 [[X]] // int first_pair_i32(int x, int y) { return first<int, int>(x, y); } // CHECK-LABEL: define {{[^@]+}}@second_pair_i32 -// CHECK-SAME: (i32 noundef [[X:%.*]], i32 noundef returned [[Y:%.*]]) +// CHECK-SAME: (i32 noundef [[X:%.*]], i32 noundef returned [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 [[Y]] // int second_pair_i32(int x, int y) { return second<int, int>(x, y); } // CHECK-LABEL: define {{[^@]+}}@first_pair_f64 -// CHECK-SAME: (double noundef returned [[X:%.*]], double noundef [[Y:%.*]]) +// CHECK-SAME: (double noundef returned [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret double [[X]] // @@ -57,7 +57,7 @@ double first_pair_f64(double x, double y) { } // CHECK-LABEL: define {{[^@]+}}@second_pair_f64 -// CHECK-SAME: (double noundef [[X:%.*]], double noundef returned [[Y:%.*]]) +// CHECK-SAME: (double noundef [[X:%.*]], double noundef returned [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret double [[Y]] // @@ -69,28 +69,28 @@ double second_pair_f64(double x, double y) { extern "C" { // CHECK-LABEL: define {{[^@]+}}@first_i32_f64 -// CHECK-SAME: (i32 noundef returned [[X:%.*]], double noundef [[Y:%.*]]) +// CHECK-SAME: (i32 noundef returned [[X:%.*]], double noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 [[X]] // int first_i32_f64(int x, double y) { return first<int, double>(x, y); } // CHECK-LABEL: define {{[^@]+}}@second_i32_f64 -// CHECK-SAME: (i32 noundef [[X:%.*]], double noundef returned [[Y:%.*]]) +// CHECK-SAME: (i32 noundef [[X:%.*]], double noundef returned [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret double [[Y]] // double second_i32_f64(int x, double y) { return second<int, double>(x, y); } // CHECK-LABEL: define {{[^@]+}}@first_f64_i32 -// CHECK-SAME: (double noundef returned [[X:%.*]], i32 noundef [[Y:%.*]]) +// CHECK-SAME: (double noundef returned [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret double [[X]] // double first_f64_i32(double x, int y) { return first<double, int>(x, y); } // CHECK-LABEL: define {{[^@]+}}@second_f64_i32 -// CHECK-SAME: (double noundef [[X:%.*]], i32 noundef returned [[Y:%.*]]) +// CHECK-SAME: (double noundef [[X:%.*]], i32 noundef returned [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: // 
CHECK-NEXT: ret i32 [[Y]] // @@ -101,7 +101,7 @@ extern "C" { typedef uint64_t ulong2 __attribute__((__vector_size__(16), __aligned__(16))); // CHECK-LABEL: define {{[^@]+}}@first_i32_ulong2 -// CHECK-SAME: (i32 noundef returned [[X:%.*]], ptr nocapture noundef readonly [[Y:%.*]]) +// CHECK-SAME: (i32 noundef returned [[X:%.*]], ptr nocapture noundef readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 [[X]] // @@ -130,7 +130,7 @@ void first_ulong2_i32(ulong2 *x, int y, ulong2 *r) { } // CHECK-LABEL: define {{[^@]+}}@second_ulong2_i32 -// CHECK-SAME: (ptr nocapture noundef readonly [[X:%.*]], i32 noundef returned [[Y:%.*]]) +// CHECK-SAME: (ptr nocapture noundef readonly [[X:%.*]], i32 noundef returned [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 [[Y]] // @@ -150,7 +150,7 @@ typedef struct { extern "C" { // CHECK-LABEL: define {{[^@]+}}@first_i32_asc -// CHECK-SAME: (i32 noundef returned [[X:%.*]], ptr nocapture noundef readonly [[Y:%.*]]) +// CHECK-SAME: (i32 noundef returned [[X:%.*]], ptr nocapture noundef readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 [[X]] // @@ -173,7 +173,7 @@ void second_i32_asc(int x, asc *y, asc *r) { *r = second<int, asc>(x, *y); } void first_asc_i32(asc *x, int y, asc *r) { *r = first<asc, int>(*x, y); } // CHECK-LABEL: define {{[^@]+}}@second_asc_i32 -// CHECK-SAME: (ptr nocapture noundef readonly [[X:%.*]], i32 noundef returned [[Y:%.*]]) +// CHECK-SAME: (ptr nocapture noundef readonly [[X:%.*]], i32 noundef returned [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 [[Y]] // diff --git clang/test/CodeGenCXX/lambda-expressions-nested-linkage.cpp clang/test/CodeGenCXX/lambda-expressions-nested-linkage.cpp index 18d2080a611e..bb21c87eed8f 100644 --- clang/test/CodeGenCXX/lambda-expressions-nested-linkage.cpp +++ clang/test/CodeGenCXX/lambda-expressions-nested-linkage.cpp @@ -56,6 +56,7 @@ inline int foo() { }; }; L(3)('a'); + return 0; } int use = foo(); } diff --git clang/test/CodeGenCXX/mangle-exprs.cpp clang/test/CodeGenCXX/mangle-exprs.cpp index b666eaadf457..b75f2cf3607b 100644 --- clang/test/CodeGenCXX/mangle-exprs.cpp +++ clang/test/CodeGenCXX/mangle-exprs.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -std=c++11 -fclang-abi-compat=latest -emit-llvm %s -o - -triple=x86_64-apple-darwin9 | FileCheck %s +// RUN: %clang_cc1 -Wno-error=return-type -std=c++11 -fclang-abi-compat=latest -emit-llvm %s -o - -triple=x86_64-apple-darwin9 | FileCheck %s namespace std { typedef decltype(sizeof(int)) size_t; @@ -327,7 +327,7 @@ namespace test7 { template<class T> decltype(T{{1,2}}) fTB(T t) {} template<class T> decltype(T({1,2})) fTC(T t) {} - int main() { + void main() { fA1(1); // CHECK-LABEL: define {{.*}} @_ZN5test73fA1IiEEDTcmtlNS_1AELi1ELi2EEcvT__EES2_ fA2(1); // CHECK-LABEL: define {{.*}} @_ZN5test73fA2IiEEDTcmcvNS_1AEilLi1ELi2EEcvT__EES2_ fB1(1); // CHECK-LABEL: define {{.*}} @_ZN5test73fB1IiEEDTcmtlNS_1BELi1ELi2EEcvT__EES2_ diff --git clang/test/CodeGenCXX/mangle-variadic-templates.cpp clang/test/CodeGenCXX/mangle-variadic-templates.cpp index d2c1b7726590..2fc2cb262a44 100644 --- clang/test/CodeGenCXX/mangle-variadic-templates.cpp +++ clang/test/CodeGenCXX/mangle-variadic-templates.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -std=c++11 -emit-llvm -triple=x86_64-apple-darwin9 -o - %s | FileCheck %s +// RUN: %clang_cc1 -Wno-error=return-type -std=c++11 -emit-llvm -triple=x86_64-apple-darwin9 -o - %s | 
FileCheck %s template<unsigned I, typename ...Types> struct X { }; @@ -47,7 +47,7 @@ template void f3<int>(const int*); template void f3<int, float>(const int*, const float*); // Mangling of type pack expansions in a template argument -template<typename ...Types> tuple<Types...> f4() {} +template<typename ...Types> tuple<Types...> f4() { return {}; } // CHECK-LABEL: define weak_odr void @_Z2f4IJifdEE5tupleIJDpT_EEv template tuple<int, float, double> f4(); diff --git clang/test/CodeGenCXX/mangle.cpp clang/test/CodeGenCXX/mangle.cpp index d0800af55c87..c5b472670e8c 100644 --- clang/test/CodeGenCXX/mangle.cpp +++ clang/test/CodeGenCXX/mangle.cpp @@ -645,7 +645,7 @@ namespace test24 { foo(); } - static char bar() {} + static char bar() { return 0; } void test1() { // CHECK: call noundef signext i8 @_ZN6test24L3barEv() bar(); @@ -839,7 +839,7 @@ namespace test36 { template<unsigned> struct A { }; template<typename ...Types> - auto f1(Types... values) -> A<sizeof...(values)> { } + auto f1(Types... values) -> A<sizeof...(values)> { return {}; } // CHECK: define weak_odr {{.*}} @_ZN6test362f1IJifEEENS_1AIXsZfp_EEEDpT_ template A<2> f1(int, float); diff --git clang/test/CodeGenCXX/matrix-type-operators.cpp clang/test/CodeGenCXX/matrix-type-operators.cpp index 8854a718fb13..739008d3d0d3 100644 --- clang/test/CodeGenCXX/matrix-type-operators.cpp +++ clang/test/CodeGenCXX/matrix-type-operators.cpp @@ -300,7 +300,11 @@ int test_extract_template(MyMatrix<int, 2, 2> Mat1) { using double4x4 = double __attribute__((matrix_type(4, 4))); template <class R, class C> -auto matrix_subscript(double4x4 m, R r, C c) -> decltype(m[r][c]) {} +auto matrix_subscript(double4x4 m, R r, C c) -> decltype(m[r][c]) { + // FIXME: We can't actually do 'return m[r][c]' here currently. + static double d; + return d; +} double test_matrix_subscript(double4x4 m) { // CHECK-LABEL: @_Z21test_matrix_subscriptu11matrix_typeILm4ELm4EdE( diff --git clang/test/CodeGenCXX/matrix-type.cpp clang/test/CodeGenCXX/matrix-type.cpp index c3a299e7feee..de28dcd24507 100644 --- clang/test/CodeGenCXX/matrix-type.cpp +++ clang/test/CodeGenCXX/matrix-type.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -no-enable-noundef-analysis -fenable-matrix -fclang-abi-compat=latest -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - -std=c++17 | FileCheck %s +// RUN: %clang_cc1 -Wno-error=return-type -no-enable-noundef-analysis -fenable-matrix -fclang-abi-compat=latest -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - -std=c++17 | FileCheck %s typedef double dx5x5_t __attribute__((matrix_type(5, 5))); typedef float fx3x4_t __attribute__((matrix_type(3, 4))); diff --git clang/test/CodeGenCXX/matrix-vector-bit-int.cpp clang/test/CodeGenCXX/matrix-vector-bit-int.cpp index ffbce9ff8d6f..7dc3b6bd5982 100644 --- clang/test/CodeGenCXX/matrix-vector-bit-int.cpp +++ clang/test/CodeGenCXX/matrix-vector-bit-int.cpp @@ -15,14 +15,14 @@ using i512x3x3 = _BitInt(512) __attribute__((matrix_type(3, 3))); // CHECK-NEXT: [[A:%.*]] = alloca <3 x i8>, align 4 // CHECK-NEXT: [[A_ADDR:%.*]] = alloca <3 x i8>, align 4 // CHECK-NEXT: store i32 [[A_COERCE]], ptr [[A]], align 4 -// CHECK-NEXT: [[LOADVEC4:%.*]] = load <4 x i8>, ptr [[A]], align 4 -// CHECK-NEXT: [[A1:%.*]] = shufflevector <4 x i8> [[LOADVEC4]], <4 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2> +// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x i8>, ptr [[A]], align 4 +// CHECK-NEXT: [[A1:%.*]] = shufflevector <4 x i8> [[LOADVECN]], <4 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2> // CHECK-NEXT: 
[[EXTRACTVEC:%.*]] = shufflevector <3 x i8> [[A1]], <3 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> // CHECK-NEXT: store <4 x i8> [[EXTRACTVEC]], ptr [[A_ADDR]], align 4 -// CHECK-NEXT: [[LOADVEC42:%.*]] = load <4 x i8>, ptr [[A_ADDR]], align 4 -// CHECK-NEXT: [[EXTRACTVEC3:%.*]] = shufflevector <4 x i8> [[LOADVEC42]], <4 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2> -// CHECK-NEXT: [[LOADVEC44:%.*]] = load <4 x i8>, ptr [[A_ADDR]], align 4 -// CHECK-NEXT: [[EXTRACTVEC5:%.*]] = shufflevector <4 x i8> [[LOADVEC44]], <4 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2> +// CHECK-NEXT: [[LOADVECN2:%.*]] = load <4 x i8>, ptr [[A_ADDR]], align 4 +// CHECK-NEXT: [[EXTRACTVEC3:%.*]] = shufflevector <4 x i8> [[LOADVECN2]], <4 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2> +// CHECK-NEXT: [[LOADVECN4:%.*]] = load <4 x i8>, ptr [[A_ADDR]], align 4 +// CHECK-NEXT: [[EXTRACTVEC5:%.*]] = shufflevector <4 x i8> [[LOADVECN4]], <4 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2> // CHECK-NEXT: [[ADD:%.*]] = add <3 x i8> [[EXTRACTVEC3]], [[EXTRACTVEC5]] // CHECK-NEXT: store <3 x i8> [[ADD]], ptr [[RETVAL]], align 4 // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[RETVAL]], align 4 @@ -38,10 +38,10 @@ i8x3 v1(i8x3 a) { // CHECK-NEXT: [[A_ADDR:%.*]] = alloca <3 x i32>, align 16 // CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i32> [[A]], <3 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> // CHECK-NEXT: store <4 x i32> [[EXTRACTVEC]], ptr [[A_ADDR]], align 16 -// CHECK-NEXT: [[LOADVEC4:%.*]] = load <4 x i32>, ptr [[A_ADDR]], align 16 -// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <4 x i32> [[LOADVEC4]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2> -// CHECK-NEXT: [[LOADVEC42:%.*]] = load <4 x i32>, ptr [[A_ADDR]], align 16 -// CHECK-NEXT: [[EXTRACTVEC3:%.*]] = shufflevector <4 x i32> [[LOADVEC42]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2> +// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x i32>, ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <4 x i32> [[LOADVECN]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2> +// CHECK-NEXT: [[LOADVECN2:%.*]] = load <4 x i32>, ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[EXTRACTVEC3:%.*]] = shufflevector <4 x i32> [[LOADVECN2]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2> // CHECK-NEXT: [[ADD:%.*]] = add <3 x i32> [[EXTRACTVEC1]], [[EXTRACTVEC3]] // CHECK-NEXT: ret <3 x i32> [[ADD]] // @@ -53,14 +53,14 @@ i32x3 v2(i32x3 a) { // CHECK-SAME: ptr noundef byval(<3 x i512>) align 256 [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[A_ADDR:%.*]] = alloca <3 x i512>, align 256 -// CHECK-NEXT: [[LOADVEC4:%.*]] = load <4 x i512>, ptr [[TMP0]], align 256 -// CHECK-NEXT: [[A:%.*]] = shufflevector <4 x i512> [[LOADVEC4]], <4 x i512> poison, <3 x i32> <i32 0, i32 1, i32 2> +// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x i512>, ptr [[TMP0]], align 256 +// CHECK-NEXT: [[A:%.*]] = shufflevector <4 x i512> [[LOADVECN]], <4 x i512> poison, <3 x i32> <i32 0, i32 1, i32 2> // CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i512> [[A]], <3 x i512> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> // CHECK-NEXT: store <4 x i512> [[EXTRACTVEC]], ptr [[A_ADDR]], align 256 -// CHECK-NEXT: [[LOADVEC41:%.*]] = load <4 x i512>, ptr [[A_ADDR]], align 256 -// CHECK-NEXT: [[EXTRACTVEC2:%.*]] = shufflevector <4 x i512> [[LOADVEC41]], <4 x i512> poison, <3 x i32> <i32 0, i32 1, i32 2> -// CHECK-NEXT: [[LOADVEC43:%.*]] = load <4 x i512>, ptr [[A_ADDR]], align 256 -// CHECK-NEXT: [[EXTRACTVEC4:%.*]] = 
shufflevector <4 x i512> [[LOADVEC43]], <4 x i512> poison, <3 x i32> <i32 0, i32 1, i32 2> +// CHECK-NEXT: [[LOADVECN1:%.*]] = load <4 x i512>, ptr [[A_ADDR]], align 256 +// CHECK-NEXT: [[EXTRACTVEC2:%.*]] = shufflevector <4 x i512> [[LOADVECN1]], <4 x i512> poison, <3 x i32> <i32 0, i32 1, i32 2> +// CHECK-NEXT: [[LOADVECN3:%.*]] = load <4 x i512>, ptr [[A_ADDR]], align 256 +// CHECK-NEXT: [[EXTRACTVEC4:%.*]] = shufflevector <4 x i512> [[LOADVECN3]], <4 x i512> poison, <3 x i32> <i32 0, i32 1, i32 2> // CHECK-NEXT: [[ADD:%.*]] = add <3 x i512> [[EXTRACTVEC2]], [[EXTRACTVEC4]] // CHECK-NEXT: ret <3 x i512> [[ADD]] // diff --git clang/test/CodeGenCXX/microsoft-abi-static-initializers.cpp clang/test/CodeGenCXX/microsoft-abi-static-initializers.cpp index fa7670c74481..55dc71a00985 100644 --- clang/test/CodeGenCXX/microsoft-abi-static-initializers.cpp +++ clang/test/CodeGenCXX/microsoft-abi-static-initializers.cpp @@ -194,6 +194,7 @@ inline int switch_test(int x) { return b + c++; } }; + return 0; } int f(); diff --git clang/test/CodeGenCXX/new-alias.cpp clang/test/CodeGenCXX/new-alias.cpp index 5310d47297bb..2496e3836218 100644 --- clang/test/CodeGenCXX/new-alias.cpp +++ clang/test/CodeGenCXX/new-alias.cpp @@ -3,11 +3,12 @@ using size_t = decltype(sizeof(0)); extern "C" char *something(long long x) { + return nullptr; } // CHECK: @_Znwm ={{.*}} alias ptr (i64), ptr @something void *operator new(size_t) __attribute__((alias("something"))); // PR16715: don't assert here. -// CHECK: call noalias noundef nonnull ptr @_Znwm(i64 noundef 4) #3{{$}} +// CHECK: call noalias noundef nonnull ptr @_Znwm(i64 noundef 4) int *pr16715 = new int; diff --git clang/test/CodeGenCXX/nrvo.cpp clang/test/CodeGenCXX/nrvo.cpp index d5bb1c0e45a5..1141bc35de58 100644 --- clang/test/CodeGenCXX/nrvo.cpp +++ clang/test/CodeGenCXX/nrvo.cpp @@ -1,7 +1,7 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// RUN: %clang_cc1 -triple i386-unknown-unknown -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s -// RUN: %clang_cc1 -triple i386-unknown-unknown -emit-llvm -fcxx-exceptions -fexceptions -disable-llvm-passes -std=c++03 -o - %s | FileCheck --check-prefixes=CHECK-EH-03 %s -// RUN: %clang_cc1 -triple i386-unknown-unknown -emit-llvm -fcxx-exceptions -fexceptions -disable-llvm-passes -std=c++11 -DCXX11 -o - %s | FileCheck --check-prefixes=CHECK-EH-11 %s +// RUN: %clang_cc1 -triple i386-unknown-unknown -Wno-error=return-type -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple i386-unknown-unknown -Wno-error=return-type -emit-llvm -fcxx-exceptions -fexceptions -disable-llvm-passes -std=c++03 -o - %s | FileCheck --check-prefixes=CHECK-EH-03 %s +// RUN: %clang_cc1 -triple i386-unknown-unknown -Wno-error=return-type -emit-llvm -fcxx-exceptions -fexceptions -disable-llvm-passes -std=c++11 -DCXX11 -o - %s | FileCheck --check-prefixes=CHECK-EH-11 %s // Test code generation for the named return value optimization. class X { diff --git clang/test/CodeGenCXX/reference-field.cpp clang/test/CodeGenCXX/reference-field.cpp index 54e914d0f3fc..7c78c99eb3d8 100644 --- clang/test/CodeGenCXX/reference-field.cpp +++ clang/test/CodeGenCXX/reference-field.cpp @@ -3,6 +3,6 @@ // Make sure the call to b() doesn't get optimized out. 
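// (Typical of the return-type cleanups in this patch: a() below never returns
// a value, so its return type becomes void rather than suppressing the
// diagnostic. Under the new default, an illustrative case such as
//   int f() { }   // error: non-void function does not return a value
// no longer compiles unless -Wno-error=return-type is passed.)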
extern struct x {char& x,y;}y;
int b();
-int a() { if (!&y.x) b(); }
+void a() { if (!&y.x) b(); }
// CHECK: @_Z1bv
diff --git clang/test/CodeGenCXX/return.cpp clang/test/CodeGenCXX/return.cpp
index 584c2921c1e7..a281d71a271c 100644
--- clang/test/CodeGenCXX/return.cpp
+++ clang/test/CodeGenCXX/return.cpp
@@ -1,8 +1,8 @@
-// RUN: %clang_cc1 -emit-llvm -triple %itanium_abi_triple -std=c++11 -o - %s | FileCheck --check-prefixes=CHECK,CHECK-COMMON %s
-// RUN: %clang_cc1 -emit-llvm -triple %itanium_abi_triple -std=c++11 -O -o - %s | FileCheck %s --check-prefixes=CHECK-OPT,CHECK-COMMON
-// RUN: %clang_cc1 -emit-llvm -triple %itanium_abi_triple -std=c++11 -fno-strict-return -o - %s | FileCheck %s --check-prefixes=CHECK-NOSTRICT,CHECK-COMMON
+// RUN: %clang_cc1 -emit-llvm -triple %itanium_abi_triple -std=c++11 -Wno-error=return-type -o - %s | FileCheck --check-prefixes=CHECK,CHECK-COMMON %s
+// RUN: %clang_cc1 -emit-llvm -triple %itanium_abi_triple -std=c++11 -Wno-error=return-type -O -o - %s | FileCheck %s --check-prefixes=CHECK-OPT,CHECK-COMMON
+// RUN: %clang_cc1 -emit-llvm -triple %itanium_abi_triple -std=c++11 -fno-strict-return -Wno-error=return-type -o - %s | FileCheck %s --check-prefixes=CHECK-NOSTRICT,CHECK-COMMON
// RUN: %clang_cc1 -emit-llvm -triple %itanium_abi_triple -std=c++11 -fno-strict-return -Wno-return-type -o - %s | FileCheck %s --check-prefixes=CHECK-NOSTRICT,CHECK-COMMON
-// RUN: %clang_cc1 -emit-llvm -triple %itanium_abi_triple -std=c++11 -fno-strict-return -O -o - %s | FileCheck %s --check-prefixes=CHECK-NOSTRICT-OPT,CHECK-COMMON
+// RUN: %clang_cc1 -emit-llvm -triple %itanium_abi_triple -std=c++11 -fno-strict-return -Wno-error=return-type -O -o - %s | FileCheck %s --check-prefixes=CHECK-NOSTRICT-OPT,CHECK-COMMON
// CHECK-COMMON-LABEL: @_Z9no_return
int no_return() {
diff --git clang/test/CodeGenCXX/template-instantiation.cpp clang/test/CodeGenCXX/template-instantiation.cpp
index 8453bcefc7eb..f1797cb69870 100644
--- clang/test/CodeGenCXX/template-instantiation.cpp
+++ clang/test/CodeGenCXX/template-instantiation.cpp
@@ -140,7 +140,7 @@ namespace test4 {
B b;
}
- unsigned test() {
+ void test() {
A<int>::foo();
}
}
diff --git clang/test/CodeGenCXX/trap-fnattr.cpp clang/test/CodeGenCXX/trap-fnattr.cpp
index 21422bc94f6f..ed7162fc43a6 100644
--- clang/test/CodeGenCXX/trap-fnattr.cpp
+++ clang/test/CodeGenCXX/trap-fnattr.cpp
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -O0 -emit-llvm -ftrapv -ftrap-function=mytrap %s -o - | FileCheck %s -check-prefix=TRAPFUNC
-// RUN: %clang_cc1 -O0 -emit-llvm -ftrapv %s -o - | FileCheck %s -check-prefix=NOOPTION
+// RUN: %clang_cc1 -O0 -emit-llvm -Wno-error=return-type -ftrapv -ftrap-function=mytrap %s -o - | FileCheck %s -check-prefix=TRAPFUNC
+// RUN: %clang_cc1 -O0 -emit-llvm -Wno-error=return-type -ftrapv %s -o - | FileCheck %s -check-prefix=NOOPTION
// TRAPFUNC-LABEL: define {{(dso_local )?}}void @{{_Z12test_builtinv|\"\?test_builtin@@YAXXZ\"}}
// TRAPFUNC: call void @llvm.trap() [[ATTR0:#[0-9]+]]
diff --git clang/test/CodeGenCXX/ubsan-check-debuglocs.cpp clang/test/CodeGenCXX/ubsan-check-debuglocs.cpp
index 96a697aca5eb..81db5c491261 100644
--- clang/test/CodeGenCXX/ubsan-check-debuglocs.cpp
+++ clang/test/CodeGenCXX/ubsan-check-debuglocs.cpp
@@ -8,7 +8,7 @@
// CHECK-SAME: !dbg
struct SourceLocation {
- SourceLocation acquire() {};
+ SourceLocation acquire() { return {}; };
};
extern "C" void __ubsan_handle_type_mismatch_v1(SourceLocation *Loc);
static void handleTypeMismatchImpl(SourceLocation *Loc) { Loc->acquire(); }
diff --git
clang/test/CodeGenObjC/2007-10-18-ProDescriptor.m clang/test/CodeGenObjC/2007-10-18-ProDescriptor.m index 35a0df3a1a9f..b3eda57eec5d 100644 --- clang/test/CodeGenObjC/2007-10-18-ProDescriptor.m +++ clang/test/CodeGenObjC/2007-10-18-ProDescriptor.m @@ -14,5 +14,6 @@ struct A { @end @implementation AGy - (unsigned) ver { + return 0; } @end diff --git clang/test/CodeGenObjC/2008-11-25-Blocks.m clang/test/CodeGenObjC/2008-11-25-Blocks.m index f0be6de33898..60615f78a049 100644 --- clang/test/CodeGenObjC/2008-11-25-Blocks.m +++ clang/test/CodeGenObjC/2008-11-25-Blocks.m @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fblocks -emit-llvm %s -o /dev/null +// RUN: %clang_cc1 -Wno-error=return-type -fblocks -emit-llvm %s -o /dev/null @interface bork - (id)B:(void (^)(void))blk; diff --git clang/test/CodeGenObjC/debug-info-crash.m clang/test/CodeGenObjC/debug-info-crash.m index 85b9b7d334f9..845a65ec41d5 100644 --- clang/test/CodeGenObjC/debug-info-crash.m +++ clang/test/CodeGenObjC/debug-info-crash.m @@ -36,5 +36,6 @@ } + (NSAttributedString *)attributedStringWithString:(id)string image:(NSImage *)image { NSMutableAttributedString *attrStr; + return 0; } @end diff --git clang/test/CodeGenObjC/encode-test.m clang/test/CodeGenObjC/encode-test.m index 78a70a00e072..b07e9e9926dc 100644 --- clang/test/CodeGenObjC/encode-test.m +++ clang/test/CodeGenObjC/encode-test.m @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple i686-apple-darwin9 -fobjc-runtime=macosx-fragile-10.5 -emit-llvm -o %t %s +// RUN: %clang_cc1 -Wno-error=return-type -triple i686-apple-darwin9 -fobjc-runtime=macosx-fragile-10.5 -emit-llvm -o %t %s // RUN: FileCheck < %t %s // // CHECK: @OBJC_METH_VAR_TYPE_{{.*}} = private unnamed_addr constant [16 x i8] c"v12@0:4[3[4@]]8\00" diff --git clang/test/CodeGenObjC/message-arrays.m clang/test/CodeGenObjC/message-arrays.m index 3e8697fc9376..bd5b05fda31d 100644 --- clang/test/CodeGenObjC/message-arrays.m +++ clang/test/CodeGenObjC/message-arrays.m @@ -9,7 +9,7 @@ void f0(id a) { -(void) m: (int) arg0, ...; @end -int f1(A *a) { +void f1(A *a) { // This should also get an implicit cast (for the vararg) [a m: 1, "test"]; } diff --git clang/test/CodeGenObjC/metadata-symbols-32.m clang/test/CodeGenObjC/metadata-symbols-32.m index 825b2c61c55d..716b26790bd3 100644 --- clang/test/CodeGenObjC/metadata-symbols-32.m +++ clang/test/CodeGenObjC/metadata-symbols-32.m @@ -80,7 +80,7 @@ llvm-gcc -m32 -emit-llvm -S -o - metadata-symbols-32.m | \ @implementation J0(Category) @end -void *f0(void) { +void f0(void) { [B im0]; [C im1]; } diff --git clang/test/CodeGenObjC/metadata-symbols-64.m clang/test/CodeGenObjC/metadata-symbols-64.m index 96a79470bfdd..944acd20f0d5 100644 --- clang/test/CodeGenObjC/metadata-symbols-64.m +++ clang/test/CodeGenObjC/metadata-symbols-64.m @@ -116,7 +116,7 @@ id ID; } @end -void *f0(id x) { +void f0(id x) { Example* pe; double dd = [pe RET_DOUBLE]; dd = [pe RET_FLOAT]; diff --git clang/test/CodeGenObjC/objc2-weak-compare.m clang/test/CodeGenObjC/objc2-weak-compare.m index 093a78abc489..b42fc5b396ed 100644 --- clang/test/CodeGenObjC/objc2-weak-compare.m +++ clang/test/CodeGenObjC/objc2-weak-compare.m @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -triple i386-apple-darwin9 -fobjc-runtime=macosx-fragile-10.5 -fobjc-gc -emit-llvm -o %t %s -// RUN: %clang_cc1 -x objective-c++ -triple i386-apple-darwin9 -fobjc-runtime=macosx-fragile-10.5 -fobjc-gc -emit-llvm -o %t %s +// RUN: %clang_cc1 -Wno-error=return-type -triple i386-apple-darwin9 -fobjc-runtime=macosx-fragile-10.5 -fobjc-gc -emit-llvm -o %t %s +// RUN: %clang_cc1 
-Wno-error=return-type -x objective-c++ -triple i386-apple-darwin9 -fobjc-runtime=macosx-fragile-10.5 -fobjc-gc -emit-llvm -o %t %s @interface PBXTarget { diff --git clang/test/CodeGenObjC/objc2-write-barrier-2.m clang/test/CodeGenObjC/objc2-write-barrier-2.m index 6bc2f509083b..68a9e7726e5c 100644 --- clang/test/CodeGenObjC/objc2-write-barrier-2.m +++ clang/test/CodeGenObjC/objc2-write-barrier-2.m @@ -78,6 +78,7 @@ __strong CFStringRef *_documentNames; inner.inner_most.nestedDeeperNames[filteredPos] = 0; inner.inner_most.arrI[3].is1[5] = 0; inner.inner_most.arrI[3].is2[5] = 0; + return 0; } @end diff --git clang/test/CodeGenObjC/protocols-lazy.m clang/test/CodeGenObjC/protocols-lazy.m index 6764d0487fe3..5e5e78c4be75 100644 --- clang/test/CodeGenObjC/protocols-lazy.m +++ clang/test/CodeGenObjC/protocols-lazy.m @@ -41,7 +41,7 @@ void f0(void) { id x = @protocol(P2); } @protocol P5 -im1; @end @implementation UserP5 -- im1 { } +- im1 { __builtin_unreachable(); } @end diff --git clang/test/CodeGenObjC/strong-in-c-struct.m clang/test/CodeGenObjC/strong-in-c-struct.m index d7febd274121..36f50aa98561 100644 --- clang/test/CodeGenObjC/strong-in-c-struct.m +++ clang/test/CodeGenObjC/strong-in-c-struct.m @@ -819,10 +819,10 @@ id test_conditional0(int c) { return (c ? g2 : g1).f1; } -// CHECK-LABEL: define{{.*}} ptr @test_conditional1( +// CHECK-LABEL: define{{.*}} void @test_conditional1( // CHECK-NOT: call void @__destructor -id test_conditional1(int c) { +void test_conditional1(int c) { calleeStrongSmall(c ? g2 : g1); } @@ -836,10 +836,10 @@ id test_assignment0(void) { return (g2 = g1).f1; } -// CHECK-LABEL: define{{.*}} ptr @test_assignment1( +// CHECK-LABEL: define{{.*}} void @test_assignment1( // CHECK-NOT: call void @__destructor -id test_assignment1(void) { +void test_assignment1(void) { calleeStrongSmall(g2 = g1); } diff --git clang/test/CodeGenObjCXX/debug-info-line.mm clang/test/CodeGenObjCXX/debug-info-line.mm index bb6eaa50d7cc..9f543dbaf962 100644 --- clang/test/CodeGenObjCXX/debug-info-line.mm +++ clang/test/CodeGenObjCXX/debug-info-line.mm @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -triple x86_64-unknown-windows-gnu -fcxx-exceptions -fexceptions -debug-info-kind=line-tables-only -fblocks -emit-llvm %s -o - | FileCheck %s -// RUN: %clang_cc1 -triple x86_64-unknown-windows-gnu -fcxx-exceptions -fexceptions -debug-info-kind=line-directives-only -fblocks -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -Wno-error=return-type -triple x86_64-unknown-windows-gnu -fcxx-exceptions -fexceptions -debug-info-kind=line-tables-only -fblocks -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -Wno-error=return-type -triple x86_64-unknown-windows-gnu -fcxx-exceptions -fexceptions -debug-info-kind=line-directives-only -fblocks -emit-llvm %s -o - | FileCheck %s void fn(); diff --git clang/test/CodeGenObjCXX/instantiate-return.mm clang/test/CodeGenObjCXX/instantiate-return.mm index fe59602b395d..6e41d9b3d266 100644 --- clang/test/CodeGenObjCXX/instantiate-return.mm +++ clang/test/CodeGenObjCXX/instantiate-return.mm @@ -18,5 +18,6 @@ struct S0; @implementation C1 - (TemplateClass<S0>)m1 { + __builtin_unreachable(); } @end diff --git clang/test/CodeGenObjCXX/pr14474-gline-tables-only.mm clang/test/CodeGenObjCXX/pr14474-gline-tables-only.mm index cfc6d78b6b88..01fa98bd72f5 100644 --- clang/test/CodeGenObjCXX/pr14474-gline-tables-only.mm +++ clang/test/CodeGenObjCXX/pr14474-gline-tables-only.mm @@ -1,7 +1,7 @@ // PR 14474 -// RUN: %clang_cc1 -triple i386-apple-macosx10.6.0 -emit-llvm \ +// RUN: %clang_cc1 
-Wno-error=return-type -triple i386-apple-macosx10.6.0 -emit-llvm \ // RUN: -debug-info-kind=line-tables-only -x objective-c++ -o /dev/null %s -// RUN: %clang_cc1 -triple i386-apple-macosx10.6.0 -emit-llvm \ +// RUN: %clang_cc1 -Wno-error=return-type -triple i386-apple-macosx10.6.0 -emit-llvm \ // RUN: -debug-info-kind=line-directives-only -x objective-c++ -o /dev/null %s typedef signed char BOOL; diff --git clang/test/CodeGenObjCXX/property-dot-reference.mm clang/test/CodeGenObjCXX/property-dot-reference.mm index 245aa4176c7d..ad745b566bea 100644 --- clang/test/CodeGenObjCXX/property-dot-reference.mm +++ clang/test/CodeGenObjCXX/property-dot-reference.mm @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple x86_64-apple-darwin10 -emit-llvm -fexceptions -o - %s | FileCheck %s +// RUN: %clang_cc1 -Wno-error=return-type -triple x86_64-apple-darwin10 -emit-llvm -fexceptions -o - %s | FileCheck %s struct TFENode { void GetURL() const; diff --git clang/test/CodeGenObjCXX/return.mm clang/test/CodeGenObjCXX/return.mm index fb77f336dfc0..c544e5413190 100644 --- clang/test/CodeGenObjCXX/return.mm +++ clang/test/CodeGenObjCXX/return.mm @@ -1,6 +1,6 @@ /// -fstrict-return is the default. -// RUN: %clang_cc1 -emit-llvm -fblocks -triple x86_64-apple-darwin -o - %s | FileCheck %s -// RUN: %clang_cc1 -emit-llvm -fblocks -triple x86_64-apple-darwin -O -o - %s | FileCheck %s +// RUN: %clang_cc1 -Wno-error=return-type -emit-llvm -fblocks -triple x86_64-apple-darwin -o - %s | FileCheck %s +// RUN: %clang_cc1 -Wno-error=return-type -emit-llvm -fblocks -triple x86_64-apple-darwin -O -o - %s | FileCheck %s @interface I @end diff --git clang/test/CodeGenOpenCL/amdgpu-alignment.cl clang/test/CodeGenOpenCL/amdgpu-alignment.cl index 8f57713fe1f0..3c2653bf3412 100644 --- clang/test/CodeGenOpenCL/amdgpu-alignment.cl +++ clang/test/CodeGenOpenCL/amdgpu-alignment.cl @@ -106,7 +106,7 @@ typedef double __attribute__((ext_vector_type(16))) double16; // CHECK: store volatile <16 x i16> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v16i16, align 32 // CHECK: store volatile i32 0, ptr addrspace(3) @local_memory_alignment_global.lds_i32, align 4 // CHECK: store volatile <2 x i32> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v2i32, align 8 -// CHECK: store volatile <4 x i32> <i32 0, i32 0, i32 0, i32 undef>, ptr addrspace(3) @local_memory_alignment_global.lds_v3i32, align 16 +// CHECK: store volatile <3 x i32> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v3i32, align 16 // CHECK: store volatile <4 x i32> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v4i32, align 16 // CHECK: store volatile <8 x i32> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v8i32, align 32 // CHECK: store volatile <16 x i32> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v16i32, align 64 @@ -124,7 +124,7 @@ typedef double __attribute__((ext_vector_type(16))) double16; // CHECK: store volatile <16 x half> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v16f16, align 32 // CHECK: store volatile float 0.000000e+00, ptr addrspace(3) @local_memory_alignment_global.lds_f32, align 4 // CHECK: store volatile <2 x float> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v2f32, align 8 -// CHECK: store volatile <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef>, ptr addrspace(3) @local_memory_alignment_global.lds_v3f32, align 16 +// CHECK: store volatile <3 x float> 
zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v3f32, align 16 // CHECK: store volatile <4 x float> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v4f32, align 16 // CHECK: store volatile <8 x float> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v8f32, align 32 // CHECK: store volatile <16 x float> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v16f32, align 64 @@ -393,7 +393,7 @@ kernel void local_memory_alignment_arg( // CHECK: store volatile <16 x i16> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 32 // CHECK: store volatile i32 0, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 4 // CHECK: store volatile <2 x i32> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 8 -// CHECK: store volatile <4 x i32> <i32 0, i32 0, i32 0, i32 undef>, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 16 +// CHECK: store volatile <3 x i32> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 16 // CHECK: store volatile <4 x i32> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 16 // CHECK: store volatile <8 x i32> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 32 // CHECK: store volatile <16 x i32> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 64 @@ -411,7 +411,7 @@ kernel void local_memory_alignment_arg( // CHECK: store volatile <16 x half> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 32 // CHECK: store volatile float 0.000000e+00, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 4 // CHECK: store volatile <2 x float> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 8 -// CHECK: store volatile <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef>, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 16 +// CHECK: store volatile <3 x float> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 16 // CHECK: store volatile <4 x float> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 16 // CHECK: store volatile <8 x float> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 32 // CHECK: store volatile <16 x float> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 64 diff --git clang/test/CodeGenOpenCL/preserve_vec3.cl clang/test/CodeGenOpenCL/preserve_vec3.cl index c84effe0c4b6..747cc301feff 100644 --- clang/test/CodeGenOpenCL/preserve_vec3.cl +++ clang/test/CodeGenOpenCL/preserve_vec3.cl @@ -1,5 +1,5 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 -// RUN: %clang_cc1 %s -emit-llvm -o - -triple spir-unknown-unknown -fpreserve-vec3-type | FileCheck %s +// RUN: %clang_cc1 %s -emit-llvm -o - -triple spir-unknown-unknown | FileCheck %s typedef char char3 __attribute__((ext_vector_type(3))); typedef char char8 __attribute__((ext_vector_type(8))); @@ -9,10 +9,11 @@ typedef float float3 __attribute__((ext_vector_type(3))); typedef float float4 __attribute__((ext_vector_type(4))); // CHECK-LABEL: define dso_local spir_kernel void @foo( -// CHECK-SAME: ptr addrspace(1) nocapture noundef readonly align 16 [[A:%.*]], ptr addrspace(1) nocapture noundef writeonly align 16 initializes((0, 12)) [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META6:![0-9]+]] !kernel_arg_type_qual [[META7:![0-9]+]] { +// CHECK-SAME: ptr addrspace(1) nocapture noundef readonly align 
16 [[A:%.*]], ptr addrspace(1) nocapture noundef writeonly align 16 initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META6:![0-9]+]] !kernel_arg_type_qual [[META7:![0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16, !tbaa [[TBAA8:![0-9]+]] -// CHECK-NEXT: store <3 x float> [[TMP0]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]] +// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16 +// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> +// CHECK-NEXT: store <4 x float> [[EXTRACTVEC1]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8:![0-9]+]] // CHECK-NEXT: ret void // void kernel foo(global float3 *a, global float3 *b) { @@ -20,11 +21,11 @@ void kernel foo(global float3 *a, global float3 *b) { } // CHECK-LABEL: define dso_local spir_kernel void @float4_to_float3( -// CHECK-SAME: ptr addrspace(1) nocapture noundef writeonly align 16 initializes((0, 12)) [[A:%.*]], ptr addrspace(1) nocapture noundef readonly align 16 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11:![0-9]+]] !kernel_arg_base_type [[META12:![0-9]+]] !kernel_arg_type_qual [[META7]] { +// CHECK-SAME: ptr addrspace(1) nocapture noundef writeonly align 16 initializes((0, 16)) [[A:%.*]], ptr addrspace(1) nocapture noundef readonly align 16 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11:![0-9]+]] !kernel_arg_base_type [[META12:![0-9]+]] !kernel_arg_type_qual [[META7]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]] -// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2> -// CHECK-NEXT: store <3 x float> [[ASTYPE]], ptr addrspace(1) [[A]], align 16, !tbaa [[TBAA8]] +// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> +// CHECK-NEXT: store <4 x float> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 16, !tbaa [[TBAA8]] // CHECK-NEXT: ret void // void kernel float4_to_float3(global float3 *a, global float4 *b) { @@ -34,8 +35,8 @@ void kernel float4_to_float3(global float3 *a, global float4 *b) { // CHECK-LABEL: define dso_local spir_kernel void @float3_to_float4( // CHECK-SAME: ptr addrspace(1) nocapture noundef readonly align 16 [[A:%.*]], ptr addrspace(1) nocapture noundef writeonly align 16 initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META7]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16, !tbaa [[TBAA8]] -// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> +// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16 +// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> // 
CHECK-NEXT: store <4 x float> [[ASTYPE]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]] // CHECK-NEXT: ret void // @@ -46,9 +47,9 @@ void kernel float3_to_float4(global float3 *a, global float4 *b) { // CHECK-LABEL: define dso_local spir_kernel void @float3_to_double2( // CHECK-SAME: ptr addrspace(1) nocapture noundef readonly align 16 [[A:%.*]], ptr addrspace(1) nocapture noundef writeonly align 16 initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META13:![0-9]+]] !kernel_arg_base_type [[META14:![0-9]+]] !kernel_arg_type_qual [[META7]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16, !tbaa [[TBAA8]] -// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> -// CHECK-NEXT: store <4 x float> [[TMP1]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]] +// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> +// CHECK-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]] // CHECK-NEXT: ret void // void kernel float3_to_double2(global float3 *a, global double2 *b) { @@ -56,11 +57,11 @@ void kernel float3_to_double2(global float3 *a, global double2 *b) { } // CHECK-LABEL: define dso_local spir_kernel void @char8_to_short3( -// CHECK-SAME: ptr addrspace(1) nocapture noundef writeonly align 8 initializes((0, 6)) [[A:%.*]], ptr addrspace(1) nocapture noundef readonly align 8 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META16:![0-9]+]] !kernel_arg_type_qual [[META7]] { +// CHECK-SAME: ptr addrspace(1) nocapture noundef writeonly align 8 initializes((0, 8)) [[A:%.*]], ptr addrspace(1) nocapture noundef readonly align 8 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META16:![0-9]+]] !kernel_arg_type_qual [[META7]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]] -// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <3 x i32> <i32 0, i32 1, i32 2> -// CHECK-NEXT: store <3 x i16> [[ASTYPE]], ptr addrspace(1) [[A]], align 8, !tbaa [[TBAA8]] +// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> +// CHECK-NEXT: store <4 x i16> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 8, !tbaa [[TBAA8]] // CHECK-NEXT: ret void // void kernel char8_to_short3(global short3 *a, global char8 *b) { @@ -70,8 +71,8 @@ void kernel char8_to_short3(global short3 *a, global char8 *b) { // CHECK-LABEL: define dso_local spir_func void @from_char3( // CHECK-SAME: <3 x i8> noundef [[A:%.*]], ptr addrspace(1) nocapture noundef writeonly initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x i8> [[A]], <3 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> -// CHECK-NEXT: store <4 x i8> [[TMP0]], ptr addrspace(1) [[OUT]], align 4, !tbaa [[TBAA17:![0-9]+]] +// CHECK-NEXT: 
[[EXTRACTVEC:%.*]] = shufflevector <3 x i8> [[A]], <3 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> +// CHECK-NEXT: store <4 x i8> [[EXTRACTVEC]], ptr addrspace(1) [[OUT]], align 4, !tbaa [[TBAA17:![0-9]+]] // CHECK-NEXT: ret void // void from_char3(char3 a, global int *out) { @@ -81,8 +82,8 @@ void from_char3(char3 a, global int *out) { // CHECK-LABEL: define dso_local spir_func void @from_short3( // CHECK-SAME: <3 x i16> noundef [[A:%.*]], ptr addrspace(1) nocapture noundef writeonly initializes((0, 8)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[A]], <3 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> -// CHECK-NEXT: store <4 x i16> [[TMP0]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[TBAA19:![0-9]+]] +// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i16> [[A]], <3 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> +// CHECK-NEXT: store <4 x i16> [[EXTRACTVEC]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[TBAA19:![0-9]+]] // CHECK-NEXT: ret void // void from_short3(short3 a, global long *out) { @@ -90,11 +91,11 @@ void from_short3(short3 a, global long *out) { } // CHECK-LABEL: define dso_local spir_func void @scalar_to_char3( -// CHECK-SAME: i32 noundef [[A:%.*]], ptr addrspace(1) nocapture noundef writeonly initializes((0, 3)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] { +// CHECK-SAME: i32 noundef [[A:%.*]], ptr addrspace(1) nocapture noundef writeonly initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = bitcast i32 [[A]] to <4 x i8> -// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2> -// CHECK-NEXT: store <3 x i8> [[ASTYPE]], ptr addrspace(1) [[OUT]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> +// CHECK-NEXT: store <4 x i8> [[EXTRACTVEC]], ptr addrspace(1) [[OUT]], align 4, !tbaa [[TBAA8]] // CHECK-NEXT: ret void // void scalar_to_char3(int a, global char3 *out) { @@ -102,11 +103,11 @@ void scalar_to_char3(int a, global char3 *out) { } // CHECK-LABEL: define dso_local spir_func void @scalar_to_short3( -// CHECK-SAME: i64 noundef [[A:%.*]], ptr addrspace(1) nocapture noundef writeonly initializes((0, 6)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] { +// CHECK-SAME: i64 noundef [[A:%.*]], ptr addrspace(1) nocapture noundef writeonly initializes((0, 8)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <4 x i16> -// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <3 x i32> <i32 0, i32 1, i32 2> -// CHECK-NEXT: store <3 x i16> [[ASTYPE]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[TBAA8]] +// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> +// CHECK-NEXT: store <4 x i16> [[EXTRACTVEC]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[TBAA8]] // CHECK-NEXT: ret void // void scalar_to_short3(long a, global short3 *out) { diff --git clang/test/CoverageMapping/switch.cpp clang/test/CoverageMapping/switch.cpp index db4cddbc6b94..1d44e7adeee4 100644 --- clang/test/CoverageMapping/switch.cpp +++ clang/test/CoverageMapping/switch.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -mllvm -emptyline-comment-coverage=false -fprofile-instrument=clang 
-fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -std=c++1z -triple %itanium_abi_triple -main-file-name switch.cpp %s | FileCheck %s +// RUN: %clang_cc1 -Wno-error=return-type -mllvm -emptyline-comment-coverage=false -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -std=c++1z -triple %itanium_abi_triple -main-file-name switch.cpp %s | FileCheck %s // CHECK: foo void foo(int i) { // CHECK-NEXT: File 0, [[@LINE]]:17 -> [[@LINE+11]]:2 = #0 diff --git clang/test/Driver/Inputs/multilib/multilib-custom-flags.yaml clang/test/Driver/Inputs/multilib/multilib-custom-flags.yaml new file mode 100644 index 000000000000..153680bce7b8 --- /dev/null +++ clang/test/Driver/Inputs/multilib/multilib-custom-flags.yaml @@ -0,0 +1,13 @@ +MultilibVersion: 1.0 + +Variants: + +Mappings: + +Flags: +- Name: flag + Values: + - Name: foo + - Name: bar + Default: foo +... diff --git clang/test/Driver/cc-log-diagnostics.c clang/test/Driver/cc-log-diagnostics.c index 1e21c6a76022..8cca0e9535c3 100644 --- clang/test/Driver/cc-log-diagnostics.c +++ clang/test/Driver/cc-log-diagnostics.c @@ -4,7 +4,7 @@ // RUN: %clang -Wfoobar --target=x86_64-apple-darwin11 -fsyntax-only %s // RUN: FileCheck %s < %t.log -int f0(void) {} +int; // CHECK: <dict> // CHECK: <key>main-file</key> @@ -27,9 +27,9 @@ int f0(void) {} // CHECK: <key>line</key> // CHECK: <integer>7</integer> // CHECK: <key>column</key> -// CHECK: <integer>15</integer> +// CHECK: <integer>1</integer> // CHECK: <key>message</key> -// CHECK: <string>non-void function does not return a value</string> +// CHECK: <string>declaration does not declare anything</string> // CHECK: </dict> // CHECK: </array> // CHECK: </dict> diff --git clang/test/Driver/clang-offload-bundler-zlib.c clang/test/Driver/clang-offload-bundler-zlib.c index 7e5857296756..b026e2ec9987 100644 --- clang/test/Driver/clang-offload-bundler-zlib.c +++ clang/test/Driver/clang-offload-bundler-zlib.c @@ -42,6 +42,30 @@ // NOHOST-DAG: hip-amdgcn-amd-amdhsa--gfx906 // +// Check compression/decompression of offload bundle using version 3 format. 
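+// For reference, the same interface can be exercised outside of lit with an
+// illustrative invocation such as:
+//   OFFLOAD_BUNDLER_COMPRESS=1 OFFLOAD_BUNDLER_VERBOSE=1 \
+//   COMPRESSED_BUNDLE_FORMAT_VERSION=3 \
+//     clang-offload-bundler -type=bc -targets=<targets> -input=<in> -output=<out>
+// Verbose mode reports the format version, compression method, and level when
+// bundling, and decompression re-checks the content hash, which is what the
+// COMPRESS-V3/DECOMPRESS-V3 prefixes below verify.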
+// +// RUN: env OFFLOAD_BUNDLER_COMPRESS=1 OFFLOAD_BUNDLER_VERBOSE=1 COMPRESSED_BUNDLE_FORMAT_VERSION=3 \ +// RUN: clang-offload-bundler -type=bc -targets=hip-amdgcn-amd-amdhsa--gfx900,hip-amdgcn-amd-amdhsa--gfx906 \ +// RUN: -input=%t.tgt1 -input=%t.tgt2 -output=%t.hip.bundle.bc 2>&1 | \ +// RUN: FileCheck -check-prefix=COMPRESS-V3 %s +// RUN: clang-offload-bundler -type=bc -list -input=%t.hip.bundle.bc | FileCheck -check-prefix=NOHOST-V3 %s +// RUN: env OFFLOAD_BUNDLER_VERBOSE=1 \ +// RUN: clang-offload-bundler -type=bc -targets=hip-amdgcn-amd-amdhsa--gfx900,hip-amdgcn-amd-amdhsa--gfx906 \ +// RUN: -output=%t.res.tgt1 -output=%t.res.tgt2 -input=%t.hip.bundle.bc -unbundle 2>&1 | \ +// RUN: FileCheck -check-prefix=DECOMPRESS-V3 %s +// RUN: diff %t.tgt1 %t.res.tgt1 +// RUN: diff %t.tgt2 %t.res.tgt2 +// +// COMPRESS-V3: Compressed bundle format version: 3 +// COMPRESS-V3: Compression method used: zlib +// COMPRESS-V3: Compression level: 6 +// DECOMPRESS-V3: Compressed bundle format version: 3 +// DECOMPRESS-V3: Decompression method: zlib +// DECOMPRESS-V3: Hashes match: Yes +// NOHOST-V3-NOT: host- +// NOHOST-V3-DAG: hip-amdgcn-amd-amdhsa--gfx900 +// NOHOST-V3-DAG: hip-amdgcn-amd-amdhsa--gfx906 + // Check -compression-level= option // RUN: clang-offload-bundler -type=bc -targets=hip-amdgcn-amd-amdhsa--gfx900,hip-amdgcn-amd-amdhsa--gfx906 \ diff --git clang/test/Driver/freebsd.c clang/test/Driver/freebsd.c index a0787bab4feb..94db63278bfe 100644 --- clang/test/Driver/freebsd.c +++ clang/test/Driver/freebsd.c @@ -79,9 +79,6 @@ // // Check that LoongArch passes the correct linker emulation. // -// RUN: %clang --target=loongarch32-freebsd -### %s %s 2>&1 \ -// RUN: | FileCheck --check-prefix=CHECK-LA32-LD %s -// CHECK-LA32-LD: ld{{.*}}" {{.*}} "-m" "elf32loongarch" // RUN: %clang --target=loongarch64-freebsd -### %s %s 2>&1 \ // RUN: | FileCheck --check-prefix=CHECK-LA64-LD %s // CHECK-LA64-LD: ld{{.*}}" {{.*}} "-m" "elf64loongarch" diff --git clang/test/Driver/print-enabled-extensions/aarch64-ampere1b.c clang/test/Driver/print-enabled-extensions/aarch64-ampere1b.c index d9bee7093bee..444ac4526200 100644 --- clang/test/Driver/print-enabled-extensions/aarch64-ampere1b.c +++ clang/test/Driver/print-enabled-extensions/aarch64-ampere1b.c @@ -51,7 +51,6 @@ // CHECK-NEXT: FEAT_SHA3, FEAT_SHA512 Enable SHA512 and SHA3 support // CHECK-NEXT: FEAT_SM4, FEAT_SM3 Enable SM3 and SM4 support // CHECK-NEXT: FEAT_SPECRES Enable Armv8.5-A execution and data prediction invalidation instructions -// CHECK-NEXT: FEAT_SPEv1p2 Enable extra register in the Statistical Profiling Extension // CHECK-NEXT: FEAT_SSBS, FEAT_SSBS2 Enable Speculative Store Bypass Safe bit // CHECK-NEXT: FEAT_TLBIOS, FEAT_TLBIRANGE Enable Armv8.4-A TLB Range and Maintenance instructions // CHECK-NEXT: FEAT_TRF Enable Armv8.4-A Trace extension diff --git clang/test/Driver/print-enabled-extensions/aarch64-cortex-a520.c clang/test/Driver/print-enabled-extensions/aarch64-cortex-a520.c index b906074ce765..6ddd52a4a708 100644 --- clang/test/Driver/print-enabled-extensions/aarch64-cortex-a520.c +++ clang/test/Driver/print-enabled-extensions/aarch64-cortex-a520.c @@ -46,7 +46,6 @@ // CHECK-NEXT: FEAT_SB Enable Armv8.5-A Speculation Barrier // CHECK-NEXT: FEAT_SEL2 Enable Armv8.4-A Secure Exception Level 2 extension // CHECK-NEXT: FEAT_SPECRES Enable Armv8.5-A execution and data prediction invalidation instructions -// CHECK-NEXT: FEAT_SPEv1p2 Enable extra register in the Statistical Profiling Extension // CHECK-NEXT: FEAT_SSBS, FEAT_SSBS2 Enable 
Speculative Store Bypass Safe bit // CHECK-NEXT: FEAT_SVE Enable Scalable Vector Extension (SVE) instructions // CHECK-NEXT: FEAT_SVE2 Enable Scalable Vector Extension 2 (SVE2) instructions diff --git clang/test/Driver/print-enabled-extensions/aarch64-cortex-a520ae.c clang/test/Driver/print-enabled-extensions/aarch64-cortex-a520ae.c index 2e147732d5c6..35399a3c85c6 100644 --- clang/test/Driver/print-enabled-extensions/aarch64-cortex-a520ae.c +++ clang/test/Driver/print-enabled-extensions/aarch64-cortex-a520ae.c @@ -46,7 +46,6 @@ // CHECK-NEXT: FEAT_SB Enable Armv8.5-A Speculation Barrier // CHECK-NEXT: FEAT_SEL2 Enable Armv8.4-A Secure Exception Level 2 extension // CHECK-NEXT: FEAT_SPECRES Enable Armv8.5-A execution and data prediction invalidation instructions -// CHECK-NEXT: FEAT_SPEv1p2 Enable extra register in the Statistical Profiling Extension // CHECK-NEXT: FEAT_SSBS, FEAT_SSBS2 Enable Speculative Store Bypass Safe bit // CHECK-NEXT: FEAT_SVE Enable Scalable Vector Extension (SVE) instructions // CHECK-NEXT: FEAT_SVE2 Enable Scalable Vector Extension 2 (SVE2) instructions diff --git clang/test/Driver/print-enabled-extensions/aarch64-fujitsu-monaka.c clang/test/Driver/print-enabled-extensions/aarch64-fujitsu-monaka.c index 01a97a00de54..a80d0f5c79ec 100644 --- clang/test/Driver/print-enabled-extensions/aarch64-fujitsu-monaka.c +++ clang/test/Driver/print-enabled-extensions/aarch64-fujitsu-monaka.c @@ -63,7 +63,6 @@ // CHECK-NEXT: FEAT_SM4, FEAT_SM3 Enable SM3 and SM4 support // CHECK-NEXT: FEAT_SPECRES Enable Armv8.5-A execution and data prediction invalidation instructions // CHECK-NEXT: FEAT_SPECRES2 Enable Speculation Restriction Instruction -// CHECK-NEXT: FEAT_SPEv1p2 Enable extra register in the Statistical Profiling Extension // CHECK-NEXT: FEAT_SSBS, FEAT_SSBS2 Enable Speculative Store Bypass Safe bit // CHECK-NEXT: FEAT_SVE Enable Scalable Vector Extension (SVE) instructions // CHECK-NEXT: FEAT_SVE2 Enable Scalable Vector Extension 2 (SVE2) instructions @@ -77,4 +76,4 @@ // CHECK-NEXT: FEAT_UAO Enable Armv8.2-A UAO PState // CHECK-NEXT: FEAT_VHE Enable Armv8.1-A Virtual Host extension // CHECK-NEXT: FEAT_WFxT Enable Armv8.7-A WFET and WFIT instruction -// CHECK-NEXT: FEAT_XS Enable Armv8.7-A limited-TLB-maintenance instruction \ No newline at end of file +// CHECK-NEXT: FEAT_XS Enable Armv8.7-A limited-TLB-maintenance instruction diff --git clang/test/Driver/print-multi-selection-flags.c clang/test/Driver/print-multi-selection-flags.c index cf9522aa0685..5a35ae374f01 100644 --- clang/test/Driver/print-multi-selection-flags.c +++ clang/test/Driver/print-multi-selection-flags.c @@ -91,8 +91,8 @@ // CHECK-RV32E-ORDER: -mabi=ilp32e // CHECK-RV32E-ORDER: -march=rv32e{{[0-9]+p[0-9]+}}_c{{[0-9]+p[0-9]+}}_zicsr{{[0-9]+p[0-9]+}} -// RUN: %clang -print-multi-flags-experimental --target=armv8m.main-none-eabi -fmultilib-flag=foo -fmultilib-flag=bar | FileCheck --check-prefixes=CHECK-MULTILIB-CUSTOM-FLAG,CHECK-ARM-MULTILIB-CUSTOM-FLAG %s -// RUN: %clang -print-multi-flags-experimental --target=aarch64-none-eabi -fmultilib-flag=foo -fmultilib-flag=bar | FileCheck --check-prefixes=CHECK-MULTILIB-CUSTOM-FLAG,CHECK-AARCH64-MULTILIB-CUSTOM-FLAG %s +// RUN: %clang -multi-lib-config=%S/Inputs/multilib/multilib-custom-flags.yaml -print-multi-flags-experimental --target=armv8m.main-none-eabi -fmultilib-flag=foo -fmultilib-flag=bar | FileCheck --check-prefixes=CHECK-MULTILIB-CUSTOM-FLAG,CHECK-ARM-MULTILIB-CUSTOM-FLAG %s +// RUN: %clang 
-multi-lib-config=%S/Inputs/multilib/multilib-custom-flags.yaml -print-multi-flags-experimental --target=aarch64-none-eabi -fmultilib-flag=foo -fmultilib-flag=bar | FileCheck --check-prefixes=CHECK-MULTILIB-CUSTOM-FLAG,CHECK-AARCH64-MULTILIB-CUSTOM-FLAG %s // CHECK-ARM-MULTILIB-CUSTOM-FLAG: --target=thumbv8m.main-unknown-none-eabi // CHECK-AARCH64-MULTILIB-CUSTOM-FLAG: --target=aarch64-unknown-none-eabi // CHECK-MULTILIB-CUSTOM-FLAG-DAG: -fmultilib-flag=foo diff --git clang/test/Driver/systemz-march.c clang/test/Driver/systemz-march.c index 31079435d2c6..93a11c6c9c01 100644 --- clang/test/Driver/systemz-march.c +++ clang/test/Driver/systemz-march.c @@ -15,6 +15,7 @@ // RUN: %clang -target s390x -### -S -emit-llvm -march=arch13 %s 2>&1 | FileCheck --check-prefix=CHECK-ARCH13 %s // RUN: %clang -target s390x -### -S -emit-llvm -march=z16 %s 2>&1 | FileCheck --check-prefix=CHECK-Z16 %s // RUN: %clang -target s390x -### -S -emit-llvm -march=arch14 %s 2>&1 | FileCheck --check-prefix=CHECK-ARCH14 %s +// RUN: %clang -target s390x -### -S -emit-llvm -march=arch15 %s 2>&1 | FileCheck --check-prefix=CHECK-ARCH15 %s // CHECK-Z9: error: unknown target CPU 'z9' // CHECK-Z10: "-target-cpu" "z10" @@ -31,5 +32,6 @@ // CHECK-ARCH13: "-target-cpu" "arch13" // CHECK-Z16: "-target-cpu" "z16" // CHECK-ARCH14: "-target-cpu" "arch14" +// CHECK-ARCH15: "-target-cpu" "arch15" int x; diff --git clang/test/FixIt/fixit-availability-maccatalyst.m clang/test/FixIt/fixit-availability-maccatalyst.m index 1b4cec8a9fe4..c63c9dd09c74 100644 --- clang/test/FixIt/fixit-availability-maccatalyst.m +++ clang/test/FixIt/fixit-availability-maccatalyst.m @@ -5,7 +5,7 @@ int function(void); void anotherFunction(int function); -int use(void) { +void use(void) { function(); // CHECK: fix-it:{{.*}}:{[[@LINE-1]]:3-[[@LINE-1]]:3}:"if (@available(macCatalyst 13.2, *)) {\n " // CHECK-NEXT: fix-it:{{.*}}:{[[@LINE-2]]:14-[[@LINE-2]]:14}:"\n } else {\n // Fallback on earlier versions\n }" diff --git clang/test/FixIt/fixit-c++11.cpp clang/test/FixIt/fixit-c++11.cpp index e635c14e0c21..acbc1b11f88e 100644 --- clang/test/FixIt/fixit-c++11.cpp +++ clang/test/FixIt/fixit-c++11.cpp @@ -59,7 +59,7 @@ void S2::f(int i) { (void)[i, i]{ }; // expected-error{{'i' can appear only once in a capture list}} (void)[&, i, i]{ }; // expected-error{{'i' can appear only once in a capture list}} (void)[] mutable {}; - (void)[]->int{}; + (void)[]->int{ return 0; }; #if __cplusplus <= 202002L // expected-warning@-3{{is a C++23 extension}} // expected-warning@-3{{is a C++23 extension}} diff --git clang/test/Frontend/absolute-paths.c clang/test/Frontend/absolute-paths.c index 8a9687195c36..e06cf262dd8e 100644 --- clang/test/Frontend/absolute-paths.c +++ clang/test/Frontend/absolute-paths.c @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -fsyntax-only -I %S/Inputs/SystemHeaderPrefix/.. %s 2>&1 | FileCheck -DROOT_ABSOLUTE=%s -check-prefix=NORMAL -check-prefix=CHECK %s -// RUN: %clang_cc1 -fsyntax-only -I %S/Inputs/SystemHeaderPrefix/.. -fdiagnostics-absolute-paths %s 2>&1 | FileCheck -DROOT_ABSOLUTE=%s -check-prefix=ABSOLUTE -check-prefix=CHECK %s +// RUN: %clang_cc1 -Wno-error=return-type -fsyntax-only -I %S/Inputs/SystemHeaderPrefix/.. %s 2>&1 | FileCheck -DROOT_ABSOLUTE=%s -check-prefix=NORMAL -check-prefix=CHECK %s +// RUN: %clang_cc1 -Wno-error=return-type -fsyntax-only -I %S/Inputs/SystemHeaderPrefix/.. 
-fdiagnostics-absolute-paths %s 2>&1 | FileCheck -DROOT_ABSOLUTE=%s -check-prefix=ABSOLUTE -check-prefix=CHECK %s #include "absolute-paths-import.h" // NORMAL: In file included from {{.*}}absolute-paths.c:4: diff --git clang/test/Frontend/ast-codegen.c clang/test/Frontend/ast-codegen.c index 53721a463217..2e2e3d360dd1 100644 --- clang/test/Frontend/ast-codegen.c +++ clang/test/Frontend/ast-codegen.c @@ -8,6 +8,6 @@ __asm__("foo"); // CHECK: @g0 = dso_local global i32 0, align 4 int g0; -// CHECK: define dso_local i32 @f0() -int f0(void) { +// CHECK: define dso_local void @f0() +void f0(void) { } diff --git clang/test/Frontend/ast-main.cpp clang/test/Frontend/ast-main.cpp index e6e2825bb332..fe47ce435f06 100644 --- clang/test/Frontend/ast-main.cpp +++ clang/test/Frontend/ast-main.cpp @@ -1,6 +1,6 @@ -// RUN: env SDKROOT="/" %clang -emit-llvm -S -o %t1.ll -x c++ - < %s -// RUN: env SDKROOT="/" %clang -fno-delayed-template-parsing -emit-ast -o %t.ast %s -// RUN: env SDKROOT="/" %clang -emit-llvm -S -o %t2.ll -x ast - < %t.ast +// RUN: env SDKROOT="/" %clang -Wno-error=return-type -emit-llvm -S -o %t1.ll -x c++ - < %s +// RUN: env SDKROOT="/" %clang -Wno-error=return-type -fno-delayed-template-parsing -emit-ast -o %t.ast %s +// RUN: env SDKROOT="/" %clang -Wno-error=return-type -emit-llvm -S -o %t2.ll -x ast - < %t.ast // RUN: diff %t1.ll %t2.ll // http://llvm.org/bugs/show_bug.cgi?id=15377 diff --git clang/test/Misc/serialized-diags-stable.c clang/test/Misc/serialized-diags-stable.c index e8ee83ec7499..f2278a3ed9dc 100644 --- clang/test/Misc/serialized-diags-stable.c +++ clang/test/Misc/serialized-diags-stable.c @@ -1,5 +1,5 @@ // RUN: rm -f %t -// RUN: not %clang -Wall -fsyntax-only %s --serialize-diagnostics %t.dia > /dev/null 2>&1 +// RUN: not %clang -Wno-error=return-type -Wall -fsyntax-only %s --serialize-diagnostics %t.dia > /dev/null 2>&1 // RUN: c-index-test -read-diagnostics %t.dia 2>&1 | FileCheck %s // RUN: c-index-test -read-diagnostics %S/Inputs/serialized-diags-stable.dia 2>&1 | FileCheck %s diff --git clang/test/Misc/target-invalid-cpu-note/nvptx.c clang/test/Misc/target-invalid-cpu-note/nvptx.c index 3ea6c02d6b38..3afcdf8c9fe5 100644 --- clang/test/Misc/target-invalid-cpu-note/nvptx.c +++ clang/test/Misc/target-invalid-cpu-note/nvptx.c @@ -27,6 +27,7 @@ // CHECK-SAME: {{^}}, sm_90 // CHECK-SAME: {{^}}, sm_90a // CHECK-SAME: {{^}}, sm_100 +// CHECK-SAME: {{^}}, sm_100a // CHECK-SAME: {{^}}, gfx600 // CHECK-SAME: {{^}}, gfx601 // CHECK-SAME: {{^}}, gfx602 diff --git clang/test/Misc/target-invalid-cpu-note/systemz.c clang/test/Misc/target-invalid-cpu-note/systemz.c index 22b0208eca90..b70173f5feec 100644 --- clang/test/Misc/target-invalid-cpu-note/systemz.c +++ clang/test/Misc/target-invalid-cpu-note/systemz.c @@ -19,4 +19,5 @@ // CHECK-SAME: {{^}}, z15 // CHECK-SAME: {{^}}, arch14 // CHECK-SAME: {{^}}, z16 +// CHECK-SAME: {{^}}, arch15 // CHECK-SAME: {{$}} diff --git clang/test/Modules/pr61067.cppm clang/test/Modules/pr61067.cppm index 50ab7ba20129..9e33123ee6db 100644 --- clang/test/Modules/pr61067.cppm +++ clang/test/Modules/pr61067.cppm @@ -49,6 +49,7 @@ import a; int c() { (void)(a() == a()); + return 0; } // CHECK: define{{.*}}linkonce_odr{{.*}}@_ZW1aeqS_1aS0_( diff --git clang/test/Modules/redecl-merge.m clang/test/Modules/redecl-merge.m index 746be5ec4e77..d2a249057261 100644 --- clang/test/Modules/redecl-merge.m +++ clang/test/Modules/redecl-merge.m @@ -1,5 +1,5 @@ // RUN: rm -rf %t -// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -Wreturn-type 
-fmodules-cache-path=%t -I %S/Inputs %s -verify -Wno-objc-root-class +// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -Wno-error=return-type -fmodules-cache-path=%t -I %S/Inputs %s -verify -Wno-objc-root-class @class C2; @class C3; diff --git clang/test/PCH/irgen-rdar13114142.mm clang/test/PCH/irgen-rdar13114142.mm index d3687637eefa..2257587290d7 100644 --- clang/test/PCH/irgen-rdar13114142.mm +++ clang/test/PCH/irgen-rdar13114142.mm @@ -20,11 +20,13 @@ class OOPattern { public: OOArray matchAll(const OOString &)const { __attribute__((__blocks__(byref))) OOArray out; + return {}; } }; OOArray operator & (const OOPattern & pattern) { pattern.matchAll(0); + return {}; } OOArray operator & (OOString, OOString); diff --git clang/test/PCH/late-parsed-instantiations.cpp clang/test/PCH/late-parsed-instantiations.cpp index 9ae6b56a09be..080be09f6748 100644 --- clang/test/PCH/late-parsed-instantiations.cpp +++ clang/test/PCH/late-parsed-instantiations.cpp @@ -1,12 +1,12 @@ -// RUN: %clang_cc1 -fdelayed-template-parsing -std=c++14 -emit-pch %s -o %t.pch -verify -// RUN: %clang_cc1 -fdelayed-template-parsing -std=c++14 -include-pch %t.pch %s -verify +// RUN: %clang_cc1 -Wno-error=return-type -fdelayed-template-parsing -std=c++14 -emit-pch %s -o %t.pch -verify +// RUN: %clang_cc1 -Wno-error=return-type -fdelayed-template-parsing -std=c++14 -include-pch %t.pch %s -verify -// RUN: %clang_cc1 -fdelayed-template-parsing -std=c++14 -emit-pch -fpch-instantiate-templates %s -o %t.pch -verify -// RUN: %clang_cc1 -fdelayed-template-parsing -std=c++14 -include-pch %t.pch %s -verify +// RUN: %clang_cc1 -Wno-error=return-type -fdelayed-template-parsing -std=c++14 -emit-pch -fpch-instantiate-templates %s -o %t.pch -verify +// RUN: %clang_cc1 -Wno-error=return-type -fdelayed-template-parsing -std=c++14 -include-pch %t.pch %s -verify // Run this test for i686 as this is the target that modifies default FP options. 
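// (Each pair of RUN lines below follows the usual two-step PCH pattern; an
// illustrative driver-level equivalent, with made-up file names, would be:
//   clang -x c++-header tmpl.h -o tmpl.pch
//   clang -include-pch tmpl.pch use.cpp -fsyntax-only
// The cc1 lines spell the same thing with -emit-pch/-include-pch, now with
// -Wno-error=return-type threaded through, matching the rest of this patch.)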
-// RUN: %clang_cc1 -triple i686-pc-linux-gnu -fdelayed-template-parsing -std=c++14 -emit-pch -fpch-instantiate-templates %s -o %t.pch -verify -// RUN: %clang_cc1 -triple i686-pc-linux-gnu -fdelayed-template-parsing -std=c++14 -include-pch %t.pch %s -verify +// RUN: %clang_cc1 -Wno-error=return-type -triple i686-pc-linux-gnu -fdelayed-template-parsing -std=c++14 -emit-pch -fpch-instantiate-templates %s -o %t.pch -verify +// RUN: %clang_cc1 -Wno-error=return-type -triple i686-pc-linux-gnu -fdelayed-template-parsing -std=c++14 -include-pch %t.pch %s -verify #ifndef HEADER_INCLUDED diff --git clang/test/PCH/pr4489.c clang/test/PCH/pr4489.c index 574e33f18c0d..a8f686ca4457 100644 --- clang/test/PCH/pr4489.c +++ clang/test/PCH/pr4489.c @@ -1,6 +1,6 @@ -// RUN: %clang -x c-header -o %t.pch %s +// RUN: %clang -Wno-error=return-type -x c-header -o %t.pch %s // RUN: echo > %t.empty.c -// RUN: %clang -include %t -x c %t.empty.c -emit-llvm -S -o - +// RUN: %clang -Wno-error=return-type -include %t -x c %t.empty.c -emit-llvm -S -o - // PR 4489: Crash with PCH // PR 4492: Crash with PCH (round two) diff --git clang/test/PCH/va_arg.c clang/test/PCH/va_arg.c index 2bbf3c5ebd75..44a7d2b8ba5b 100644 --- clang/test/PCH/va_arg.c +++ clang/test/PCH/va_arg.c @@ -10,10 +10,12 @@ char *g0(char** argv, int argc) { return argv[argc]; } char *g(char **argv) { f(g0, argv, 1, 2, 3); + return argv[0]; } char *i0(char **argv, int argc) { return argv[argc]; } char *i(char **argv) { h(i0, argv, 1, 2, 3); + return argv[0]; } diff --git clang/test/PCH/va_arg.h clang/test/PCH/va_arg.h index 255c6589a7e8..beb49396f7b7 100644 --- clang/test/PCH/va_arg.h +++ clang/test/PCH/va_arg.h @@ -5,6 +5,7 @@ char *f (char * (*g) (char **, int), char **p, ...) { char *s; va_list v; s = g (p, __builtin_va_arg(v, int)); + return s; } typedef __builtin_ms_va_list __ms_va_list; @@ -12,4 +13,5 @@ char *__attribute__((ms_abi)) h(char *(*i)(char **, int), char **p, ...) 
{ char *s; __ms_va_list v; s = i(p, __builtin_va_arg(v, int)); + return s; } diff --git clang/test/Parser/promote_types_in_proto.c clang/test/Parser/promote_types_in_proto.c index 969ba28120f2..650dae0a5e72 100644 --- clang/test/Parser/promote_types_in_proto.c +++ clang/test/Parser/promote_types_in_proto.c @@ -2,7 +2,7 @@ void functionPromotion(void f(char *const [])); void arrayPromotion(char * const argv[]); -int whatever(int argc, char *argv[]) +void whatever(int argc, char *argv[]) { arrayPromotion(argv); functionPromotion(arrayPromotion); diff --git clang/test/Preprocessor/predefined-arch-macros.c clang/test/Preprocessor/predefined-arch-macros.c index 43f3454ed3c3..f267f1759cdb 100644 --- clang/test/Preprocessor/predefined-arch-macros.c +++ clang/test/Preprocessor/predefined-arch-macros.c @@ -4391,6 +4391,21 @@ // CHECK_SYSTEMZ_ARCH14: #define __s390x__ 1 // CHECK_SYSTEMZ_ARCH14: #define __zarch__ 1 +// RUN: %clang -march=arch15 -E -dM %s -o - 2>&1 \ +// RUN: -target s390x-unknown-linux \ +// RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_SYSTEMZ_ARCH15 +// CHECK_SYSTEMZ_ARCH15: #define __ARCH__ 15 +// CHECK_SYSTEMZ_ARCH15: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1 1 +// CHECK_SYSTEMZ_ARCH15: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2 1 +// CHECK_SYSTEMZ_ARCH15: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 1 +// CHECK_SYSTEMZ_ARCH15: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 1 +// CHECK_SYSTEMZ_ARCH15: #define __HTM__ 1 +// CHECK_SYSTEMZ_ARCH15: #define __LONG_DOUBLE_128__ 1 +// CHECK_SYSTEMZ_ARCH15: #define __VX__ 1 +// CHECK_SYSTEMZ_ARCH15: #define __s390__ 1 +// CHECK_SYSTEMZ_ARCH15: #define __s390x__ 1 +// CHECK_SYSTEMZ_ARCH15: #define __zarch__ 1 + // RUN: %clang -mhtm -E -dM %s -o - 2>&1 \ // RUN: -target s390x-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_SYSTEMZ_HTM @@ -4407,7 +4422,7 @@ // RUN: %clang -mzvector -E -dM %s -o - 2>&1 \ // RUN: -target s390x-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_SYSTEMZ_ZVECTOR -// CHECK_SYSTEMZ_ZVECTOR: #define __VEC__ 10304 +// CHECK_SYSTEMZ_ZVECTOR: #define __VEC__ 10305 // Begin nvptx tests ---------------- diff --git clang/test/Rewriter/rewrite-extern-c.mm clang/test/Rewriter/rewrite-extern-c.mm index 2941504ab78d..daebb07f8fbf 100644 --- clang/test/Rewriter/rewrite-extern-c.mm +++ clang/test/Rewriter/rewrite-extern-c.mm @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -x objective-c++ -fblocks -rewrite-objc -fobjc-runtime=macosx-fragile-10.5 -o - %s extern "C" { - short foo() { } + short foo() { return 0; } } typedef unsigned char Boolean; diff --git clang/test/Sema/freemain.c clang/test/Sema/freemain.c index 83d7a4d56eac..7bd08c4ed790 100644 --- clang/test/Sema/freemain.c +++ clang/test/Sema/freemain.c @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple i686-pc-openbsd -fsyntax-only -verify -ffreestanding %s +// RUN: %clang_cc1 -Werror=return-type -triple i686-pc-openbsd -fsyntax-only -verify -ffreestanding %s // Tests that -ffreestanding disables all special treatment of main(). 
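// (With -ffreestanding, main is an ordinary function: there is no implicit
// 'return 0', so falling off the end is diagnosed like any other non-void
// function, and the hunk below upgrades the expected diagnostic from a
// warning to an error to match the -Werror=return-type default. A hosted,
// illustrative counterpart such as
//   int main(void) { }
// would still compile cleanly.)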
@@ -6,4 +6,4 @@ void* allocate(long size); void* main(void* context, long size) { if (context) return allocate(size); -} // expected-warning {{non-void function does not return a value in all control paths}} +} // expected-error {{non-void function does not return a value in all control paths}} diff --git clang/test/Sema/return-type-mismatch.c clang/test/Sema/return-type-mismatch.c index 79a625d7df1f..d3d0e37c06db 100644 --- clang/test/Sema/return-type-mismatch.c +++ clang/test/Sema/return-type-mismatch.c @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -Wreturn-type -Wno-return-mismatch -fsyntax-only -verify=return-type %s +// RUN: %clang_cc1 -Wno-error=return-type -Wno-return-mismatch -fsyntax-only -verify=return-type %s // RUN: %clang_cc1 -Wno-return-type -Wreturn-mismatch -fsyntax-only -verify=return-mismatch %s int foo(void) __attribute__((noreturn)); diff --git clang/test/Sema/warn-infinity-nan-disabled-lnx.cpp clang/test/Sema/warn-infinity-nan-disabled-lnx.cpp index 357c9e5b6410..4f46b777c887 100644 --- clang/test/Sema/warn-infinity-nan-disabled-lnx.cpp +++ clang/test/Sema/warn-infinity-nan-disabled-lnx.cpp @@ -45,24 +45,48 @@ namespace std __attribute__((__visibility__("default"))) { isnan(double __x); bool isnan(long double __x); -bool + bool isfinite(float __x); bool isfinite(double __x); bool isfinte(long double __x); - bool + bool isunordered(float __x, float __y); bool isunordered(double __x, double __y); bool isunordered(long double __x, long double __y); + +template <class _Ty> +class numeric_limits { +public: + [[nodiscard]] static constexpr _Ty infinity() noexcept { + return _Ty(); + } +}; } // namespace ) } #define NAN (__builtin_nanf("")) #define INFINITY (__builtin_inff()) +template <> +class std::numeric_limits<float> { +public: + [[nodiscard]] static constexpr float infinity() noexcept { + return __builtin_huge_val(); + } +}; + +template <> +class std::numeric_limits<double> { +public: + [[nodiscard]] static constexpr double infinity() noexcept { + return __builtin_huge_val(); + } +}; + template <class _Ty> class numeric_limits { public: @@ -78,6 +102,7 @@ public: return __builtin_huge_val(); } }; + template <> class numeric_limits<double> { public: @@ -86,6 +111,8 @@ public: } }; +double infinity() { return 0; } + int compareit(float a, float b) { volatile int i, j, k, l, m, n, o, p; // no-inf-no-nan-warning@+4 {{use of infinity is undefined behavior due to the currently enabled floating-point options}} @@ -225,11 +252,18 @@ int compareit(float a, float b) { // no-inf-no-nan-warning@+2 {{use of infinity is undefined behavior due to the currently enabled floating-point options}} // no-inf-warning@+1 {{use of infinity is undefined behavior due to the currently enabled floating-point options}} - double y = i * numeric_limits<double>::infinity(); + double y = i * std::numeric_limits<double>::infinity(); + + y = i * numeric_limits<double>::infinity(); // expected-no-diagnostics // no-inf-no-nan-warning@+2 {{use of infinity is undefined behavior due to the currently enabled floating-point options}} // no-inf-warning@+1 {{use of infinity is undefined behavior due to the currently enabled floating-point options}} - j = numeric_limits<float>::infinity(); + j = std::numeric_limits<float>::infinity(); + + j = numeric_limits<float>::infinity(); // expected-no-diagnostics + + y = infinity(); // expected-no-diagnostics + return 0; } diff --git clang/test/Sema/warn-infinity-nan-disabled-win.cpp clang/test/Sema/warn-infinity-nan-disabled-win.cpp index ee4eb33a16e4..655024f5909b 100644 --- 
clang/test/Sema/warn-infinity-nan-disabled-win.cpp +++ clang/test/Sema/warn-infinity-nan-disabled-win.cpp @@ -48,24 +48,49 @@ namespace std __attribute__((__visibility__("default"))) { isnan(double __x); bool isnan(long double __x); -bool + bool isfinite(float __x); bool isfinite(double __x); bool isfinte(long double __x); - bool + bool isunordered(float __x, float __y); bool isunordered(double __x, double __y); bool isunordered(long double __x, long double __y); + +template <class _Ty> +class numeric_limits { +public: + [[nodiscard]] static constexpr _Ty infinity() noexcept { + return _Ty(); + } +}; + } // namespace ) } #define INFINITY ((float)(1e+300 * 1e+300)) #define NAN (-(float)(INFINITY * 0.0F)) +template <> +class std::numeric_limits<float> { +public: + [[nodiscard]] static constexpr float infinity() noexcept { + return __builtin_huge_val(); + } +}; + +template <> +class std::numeric_limits<double> { +public: + [[nodiscard]] static constexpr double infinity() noexcept { + return __builtin_huge_val(); + } +}; + template <class _Ty> class numeric_limits { public: @@ -81,6 +106,7 @@ public: return __builtin_huge_val(); } }; + template <> class numeric_limits<double> { public: @@ -89,6 +115,8 @@ public: } }; +double infinity() { return 0; } + int compareit(float a, float b) { volatile int i, j, k, l, m, n, o, p; // no-inf-no-nan-warning@+2 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}} @@ -216,11 +244,18 @@ int compareit(float a, float b) { // no-inf-no-nan-warning@+2 {{use of infinity is undefined behavior due to the currently enabled floating-point options}} // no-inf-warning@+1 {{use of infinity is undefined behavior due to the currently enabled floating-point options}} - double y = i * numeric_limits<double>::infinity(); + double y = i * std::numeric_limits<double>::infinity(); + + y = i * numeric_limits<double>::infinity(); // expected-no-diagnostics // no-inf-no-nan-warning@+2 {{use of infinity is undefined behavior due to the currently enabled floating-point options}} // no-inf-warning@+1 {{use of infinity is undefined behavior due to the currently enabled floating-point options}} - j = numeric_limits<float>::infinity(); + j = std::numeric_limits<float>::infinity(); + + j = numeric_limits<float>::infinity(); // expected-no-diagnostics + + y = infinity(); // expected-no-diagnostics + return 0; } diff --git clang/test/Sema/zvector.c clang/test/Sema/zvector.c index 900c39adc2a3..e1e4ab532426 100644 --- clang/test/Sema/zvector.c +++ clang/test/Sema/zvector.c @@ -18,6 +18,10 @@ vector signed long long sl, sl2; vector unsigned long long ul, ul2; vector bool long long bl, bl2; +vector signed __int128 slll, slll2; +vector unsigned __int128 ulll, ulll2; +vector bool __int128 blll, blll2; + vector double fd, fd2; vector long ll; // expected-error {{cannot use 'long' with '__vector'}} @@ -39,6 +43,9 @@ unsigned int ui_scalar; signed long sl_scalar; unsigned long ul_scalar; +signed __int128 slll_scalar; +unsigned __int128 ulll_scalar; + double fd_scalar; // Verify that __vector is also recognized @@ -54,6 +61,9 @@ __vector bool int bi3; __vector signed long long sl3; __vector unsigned long long ul3; __vector bool long long bl3; +__vector signed __int128 slll3; +__vector unsigned __int128 ulll3; +__vector bool __int128 blll3; __vector double fd3; __vector long ll3; // expected-error {{cannot use 'long' with '__vector'}} __vector float ff3; // expected-error {{cannot use 'float' with '__vector'}} @@ -85,6 +95,9 @@ int 
res_bi[vec_step(bi) == 4 ? 1 : -1]; int res_sl[vec_step(sl) == 2 ? 1 : -1]; int res_ul[vec_step(ul) == 2 ? 1 : -1]; int res_bl[vec_step(bl) == 2 ? 1 : -1]; +int res_slll[vec_step(slll) == 1 ? 1 : -1]; +int res_ulll[vec_step(ulll) == 1 ? 1 : -1]; +int res_blll[vec_step(blll) == 1 ? 1 : -1]; int res_fd[vec_step(fd) == 2 ? 1 : -1]; @@ -111,6 +124,10 @@ void foo(void) bl = bl2; fd = fd2; + slll = slll2; + ulll = ulll2; + blll = blll2; + sc = uc2; // expected-error {{incompatible type}} sc = bc2; // expected-error {{incompatible type}} uc = sc2; // expected-error {{incompatible type}} @@ -129,31 +146,37 @@ void foo(void) sc = si2; // expected-error {{incompatible type}} sc = sl2; // expected-error {{incompatible type}} sc = fd2; // expected-error {{incompatible type}} + sc = slll2; // expected-error {{incompatible type}} ss = sc2; // expected-error {{incompatible type}} si = sc2; // expected-error {{incompatible type}} sl = sc2; // expected-error {{incompatible type}} fd = sc2; // expected-error {{incompatible type}} + slll = sc2; // expected-error {{incompatible type}} uc = us2; // expected-error {{incompatible type}} uc = ui2; // expected-error {{incompatible type}} uc = ul2; // expected-error {{incompatible type}} uc = fd2; // expected-error {{incompatible type}} + uc = ulll2; // expected-error {{incompatible type}} us = uc2; // expected-error {{incompatible type}} ui = uc2; // expected-error {{incompatible type}} ul = uc2; // expected-error {{incompatible type}} fd = uc2; // expected-error {{incompatible type}} + ulll = uc2; // expected-error {{incompatible type}} bc = us2; // expected-error {{incompatible type}} bc = ui2; // expected-error {{incompatible type}} bc = ul2; // expected-error {{incompatible type}} bc = fd2; // expected-error {{incompatible type}} + bc = ulll2; // expected-error {{incompatible type}} bs = bc2; // expected-error {{incompatible type}} bi = bc2; // expected-error {{incompatible type}} bl = bc2; // expected-error {{incompatible type}} fd = bc2; // expected-error {{incompatible type}} + blll = bc2; // expected-error {{incompatible type}} // ------------------------------------------------------------------------- // Test casts to same element width. @@ -176,6 +199,10 @@ void foo(void) ul = (vector unsigned long long)fd2; fd = (vector double)sl2; + slll = (vector signed __int128)blll2; + blll = (vector bool __int128)ulll2; + ulll = (vector unsigned __int128)slll2; + // ------------------------------------------------------------------------- // Test casts to different element width. // ------------------------------------------------------------------------- @@ -197,6 +224,10 @@ void foo(void) ul = (vector unsigned long long)sc2; fd = (vector double)sc2; + slll = (vector signed __int128)bi2; + blll = (vector bool __int128)ui2; + ulll = (vector unsigned __int128)si2; + // ------------------------------------------------------------------------- // Test ++. 
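The additions above wire the new 128-bit element types through the existing zvector checks: a 16-byte vector holds exactly one __int128 lane (hence the vec_step(...) == 1 assertions), explicit same-width casts between the signed, unsigned, and bool flavors are allowed, and mixing element types without a cast stays an error. A sketch in the same C dialect (hypothetical, assuming -mzvector on a target with the 128-bit vector support):

    vector signed __int128 a, b;              // one 128-bit lane per vector
    vector unsigned __int128 u;
    vector signed char c;
    int one_lane[vec_step(a) == 1 ? 1 : -1];  // compiles: vec_step is 1

    void sketch(void) {
      a = b;                          // same type: OK
      a = (vector signed __int128)u;  // explicit same-width cast: OK
      // a = c;                       // error: incompatible type
    }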
// ------------------------------------------------------------------------- @@ -217,6 +248,10 @@ void foo(void) ++ul2; ++bl2; // expected-error {{cannot increment}} + ++slll2; + ++ulll2; + ++blll2; // expected-error {{cannot increment}} + ++fd2; sc++; @@ -235,6 +270,10 @@ void foo(void) ul++; bl++; // expected-error {{cannot increment}} + slll++; + ulll++; + blll++; // expected-error {{cannot increment}} + fd++; // ------------------------------------------------------------------------- @@ -257,6 +296,10 @@ void foo(void) --ul2; --bl2; // expected-error {{cannot decrement}} + --slll2; + --ulll2; + --blll2; // expected-error {{cannot decrement}} + --fd2; sc--; @@ -275,6 +318,10 @@ void foo(void) ul--; bl--; // expected-error {{cannot decrement}} + slll--; + ulll--; + blll--; // expected-error {{cannot decrement}} + fd--; // ------------------------------------------------------------------------- @@ -297,6 +344,10 @@ void foo(void) ul = +ul2; bl = +bl2; // expected-error {{invalid argument type}} + slll = +slll2; + ulll = +ulll2; + blll = +blll2; // expected-error {{invalid argument type}} + fd = +fd2; sc = +si2; // expected-error {{assigning to}} @@ -323,6 +374,10 @@ void foo(void) ul = -ul2; bl = -bl2; // expected-error {{invalid argument type}} + slll = -slll2; + ulll = -ulll2; + blll = -blll2; // expected-error {{invalid argument type}} + fd = -fd2; sc = -si2; // expected-error {{assigning to}} @@ -349,6 +404,10 @@ void foo(void) ul = ~ul2; bl = ~bl2; + slll = ~slll2; + ulll = ~ulll2; + blll = ~blll2; + fd = ~fd2; // expected-error {{invalid argument}} sc = ~si2; // expected-error {{assigning to}} @@ -398,6 +457,10 @@ void foo(void) ul = ul + ul2; bl = bl + bl2; // expected-error {{invalid operands}} + slll = slll + slll2; + ulll = ulll + ulll2; + blll = blll + blll2; // expected-error {{invalid operands}} + fd = fd + fd2; fd = fd + ul2; // expected-error {{cannot convert}} fd = sl + fd2; // expected-error {{cannot convert}} @@ -418,6 +481,7 @@ void foo(void) sc += si2; // expected-error {{cannot convert}} sc += sl2; // expected-error {{cannot convert}} sc += fd2; // expected-error {{cannot convert}} + sc += slll2; // expected-error {{cannot convert}} sc += sc_scalar; sc += uc_scalar; // expected-error {{cannot convert between scalar type 'unsigned char' and vector type '__vector signed char' (vector of 16 'signed char' values) as implicit conversion would cause truncation}} @@ -436,6 +500,10 @@ void foo(void) ul += ul2; bl += bl2; // expected-error {{invalid operands}} + slll += slll2; + ulll += ulll2; + blll += blll2; // expected-error {{invalid operands}} + fd += fd2; // ------------------------------------------------------------------------- @@ -470,6 +538,10 @@ void foo(void) ul -= ul2; bl -= bl2; // expected-error {{invalid operands}} + slll -= slll2; + ulll -= ulll2; + blll -= blll2; // expected-error {{invalid operands}} + fd -= fd2; // ------------------------------------------------------------------------- @@ -505,6 +577,11 @@ void foo(void) ul *= ul2; bl *= bl2; // expected-error {{invalid operands}} + slll *= slll2; + ulll *= ulll2; + blll *= blll2; // expected-error {{invalid operands}} + + fd *= fd2; // ------------------------------------------------------------------------- @@ -539,6 +616,10 @@ void foo(void) ul /= ul2; bl /= bl2; // expected-error {{invalid operands}} + slll /= slll2; + ulll /= ulll2; + blll /= blll2; // expected-error {{invalid operands}} + fd /= fd2; // ------------------------------------------------------------------------- @@ -573,6 +654,10 @@ 
void foo(void) ul %= ul2; bl %= bl2; // expected-error {{invalid operands}} + slll %= slll2; + ulll %= ulll2; + blll %= blll2; // expected-error {{invalid operands}} + fd %= fd2; // expected-error {{invalid operands}} // ------------------------------------------------------------------------- @@ -637,6 +722,10 @@ void foo(void) ul &= ul2; bl &= bl2; + slll &= slll2; + ulll &= ulll2; + blll &= blll2; + // ------------------------------------------------------------------------- // Test that & rules apply to | too. // ------------------------------------------------------------------------- @@ -668,6 +757,10 @@ void foo(void) ul |= ul2; bl |= bl2; + slll |= slll2; + ulll |= ulll2; + blll |= blll2; + fd |= bl2; // expected-error {{invalid operands}} fd |= fd2; // expected-error {{invalid operands}} @@ -702,6 +795,10 @@ void foo(void) ul ^= ul2; bl ^= bl2; + slll ^= slll2; + ulll ^= ulll2; + blll ^= blll2; + fd ^= bl2; // expected-error {{invalid operands}} fd ^= fd2; // expected-error {{invalid operands}} @@ -762,6 +859,12 @@ void foo(void) ul = ul << ul_scalar; bl = bl << bl2; // expected-error {{invalid operands}} + slll = slll << slll2; + slll = slll << slll_scalar; + ulll = ulll << ulll2; + ulll = ulll << ulll_scalar; + blll = blll << blll2; // expected-error {{invalid operands}} + fd = fd << fd2; // expected-error {{integer is required}} fd = fd << ul2; // expected-error {{integer is required}} fd = sl << fd2; // expected-error {{integer is required}} @@ -803,6 +906,12 @@ void foo(void) ul <<= ul_scalar; bl <<= bl2; // expected-error {{invalid operands}} + slll <<= slll2; + slll <<= slll_scalar; + ulll <<= ulll2; + ulll <<= ulll_scalar; + blll <<= blll2; // expected-error {{invalid operands}} + fd <<= fd2; // expected-error {{integer is required}} // ------------------------------------------------------------------------- @@ -862,6 +971,12 @@ void foo(void) ul = ul >> ul_scalar; bl = bl >> bl2; // expected-error {{invalid operands}} + slll = slll >> slll2; + slll = slll >> slll_scalar; + ulll = ulll >> ulll2; + ulll = ulll >> ulll_scalar; + blll = blll >> blll2; // expected-error {{invalid operands}} + fd = fd >> fd2; // expected-error {{integer is required}} fd = fd >> ul2; // expected-error {{integer is required}} fd = sl >> fd2; // expected-error {{integer is required}} @@ -903,6 +1018,12 @@ void foo(void) ul >>= ul_scalar; bl >>= bl2; // expected-error {{invalid operands}} + slll >>= slll2; + slll >>= slll_scalar; + ulll >>= ulll2; + ulll >>= ulll_scalar; + blll >>= blll2; // expected-error {{invalid operands}} + fd >>= fd2; // expected-error {{integer is required}} // ------------------------------------------------------------------------- @@ -935,6 +1056,10 @@ void foo(void) (void)(bl == bl2); (void)(fd == fd2); + (void)(slll == slll2); + (void)(ulll == ulll2); + (void)(blll == blll2); + (void)(fd == ul); // expected-error {{cannot convert}} (void)(ul == fd); // expected-error {{cannot convert}} @@ -962,6 +1087,10 @@ void foo(void) (void)(bl != bl2); (void)(fd != fd2); + (void)(slll != slll2); + (void)(ulll != ulll2); + (void)(blll != blll2); + // ------------------------------------------------------------------------- // Test that == rules apply to <= too. 
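As with the narrower element types, the compound-assignment and shift hunks check that signed and unsigned __int128 vectors behave like ordinary integer vectors while the bool flavor admits only the bitwise operators, and that a scalar shift amount is accepted alongside a vector one. A hypothetical sketch in the test's dialect:

    vector unsigned __int128 u, u2;
    vector bool __int128 m, m2;
    unsigned __int128 n;

    void ops(void) {
      u %= u2;      // arithmetic compound assignment: OK
      u = u << u2;  // vector shift amount: OK
      u <<= n;      // scalar shift amount: OK
      m &= m2;      // bitwise ops stay valid on bool vectors
      // m += m2;   // error: invalid operands
      // m <<= m2;  // error: invalid operands
    }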
// ------------------------------------------------------------------------- @@ -986,6 +1115,10 @@ void foo(void) (void)(bl <= bl2); (void)(fd <= fd2); + (void)(slll <= slll2); + (void)(ulll <= ulll2); + (void)(blll <= blll2); + // ------------------------------------------------------------------------- // Test that == rules apply to >= too. // ------------------------------------------------------------------------- @@ -1010,6 +1143,10 @@ void foo(void) (void)(bl >= bl2); (void)(fd >= fd2); + (void)(slll >= slll2); + (void)(ulll >= ulll2); + (void)(blll >= blll2); + // ------------------------------------------------------------------------- // Test that == rules apply to < too. // ------------------------------------------------------------------------- @@ -1034,6 +1171,10 @@ void foo(void) (void)(bl < bl2); (void)(fd < fd2); + (void)(slll < slll2); + (void)(ulll < ulll2); + (void)(blll < blll2); + // ------------------------------------------------------------------------- // Test that == rules apply to > too. // ------------------------------------------------------------------------- @@ -1057,4 +1198,8 @@ void foo(void) (void)(ul > ul2); (void)(bl > bl2); (void)(fd > fd2); + + (void)(slll > slll2); + (void)(ulll > ulll2); + (void)(blll > blll2); } diff --git clang/test/SemaCXX/attr-noreturn.cpp clang/test/SemaCXX/attr-noreturn.cpp index e6de5ad45d34..f2d4964f8ebd 100644 --- clang/test/SemaCXX/attr-noreturn.cpp +++ clang/test/SemaCXX/attr-noreturn.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsyntax-only -verify %s +// RUN: %clang_cc1 -Wno-error=return-type -fsyntax-only -verify %s // Reachability tests have to come first because they get suppressed // if any errors have occurred. diff --git clang/test/SemaCXX/builtin-assume-aligned.cpp clang/test/SemaCXX/builtin-assume-aligned.cpp index 85a7faee9161..48bd8414fc50 100644 --- clang/test/SemaCXX/builtin-assume-aligned.cpp +++ clang/test/SemaCXX/builtin-assume-aligned.cpp @@ -1,5 +1,4 @@ // RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 -triple x86_64-linux-gnu %s -// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 -triple x86_64-linux-gnu %s -fexperimental-new-constant-interpreter int n; constexpr int *p = 0; diff --git clang/test/SemaCXX/constant-expression-cxx14.cpp clang/test/SemaCXX/constant-expression-cxx14.cpp index 579883ae52cc..e16a69df3830 100644 --- clang/test/SemaCXX/constant-expression-cxx14.cpp +++ clang/test/SemaCXX/constant-expression-cxx14.cpp @@ -1,6 +1,6 @@ -// RUN: %clang_cc1 -std=c++23 -fsyntax-only -verify=expected,cxx20_23,cxx23 %s -fcxx-exceptions -triple=x86_64-linux-gnu -// RUN: %clang_cc1 -std=c++20 -fsyntax-only -verify=expected,cxx14_20,cxx20_23,cxx20 %s -fcxx-exceptions -triple=x86_64-linux-gnu -// RUN: %clang_cc1 -std=c++14 -fsyntax-only -verify=expected,cxx14_20,cxx14 %s -fcxx-exceptions -triple=x86_64-linux-gnu +// RUN: %clang_cc1 -std=c++23 -Werror=return-type -fsyntax-only -verify=expected,cxx20_23,cxx23 %s -fcxx-exceptions -triple=x86_64-linux-gnu +// RUN: %clang_cc1 -std=c++20 -Werror=return-type -fsyntax-only -verify=expected,cxx14_20,cxx20_23,cxx20 %s -fcxx-exceptions -triple=x86_64-linux-gnu +// RUN: %clang_cc1 -std=c++14 -Werror=return-type -fsyntax-only -verify=expected,cxx14_20,cxx14 %s -fcxx-exceptions -triple=x86_64-linux-gnu struct S { // dummy ctor to make this a literal type @@ -68,7 +68,7 @@ constexpr int j(int k) { } } } // expected-note 2{{control reached end of constexpr function}} - // cxx23-warning@-1 {{does not return a value in all control paths}} + // cxx23-error@-1 {{does not 
return a value in all control paths}} static_assert(j(0) == -3, ""); static_assert(j(1) == 5, ""); static_assert(j(2), ""); // expected-error {{constant expression}} expected-note {{in call to 'j(2)'}} diff --git clang/test/SemaCXX/constexpr-return-non-void-cxx2b.cpp clang/test/SemaCXX/constexpr-return-non-void-cxx2b.cpp index e3228dddef5f..713ba90a0034 100644 --- clang/test/SemaCXX/constexpr-return-non-void-cxx2b.cpp +++ clang/test/SemaCXX/constexpr-return-non-void-cxx2b.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -std=c++23 -fsyntax-only -Wimplicit-fallthrough -Wconsumed -verify %s +// RUN: %clang_cc1 -Wno-error=return-type -std=c++23 -fsyntax-only -Wimplicit-fallthrough -Wconsumed -verify %s constexpr int f() { } // expected-warning {{non-void function does not return a value}} static_assert(__is_same(decltype([] constexpr -> int { }( )), int)); // expected-warning {{non-void lambda does not return a value}} diff --git clang/test/SemaCXX/err-missing-noreturn-1.cpp clang/test/SemaCXX/err-missing-noreturn-1.cpp new file mode 100644 index 000000000000..29627a7bef1d --- /dev/null +++ clang/test/SemaCXX/err-missing-noreturn-1.cpp @@ -0,0 +1,9 @@ +// RUN: %clang_cc1 -fsyntax-only -verify %s -Wmissing-noreturn -Wreturn-type -Werror=return-type + +struct rdar8875247 { + ~rdar8875247 (); +}; + +int rdar8875247_test() { + rdar8875247 f; +} // expected-error{{non-void function does not return a value}} diff --git clang/test/SemaCXX/err-missing-noreturn-2.cpp clang/test/SemaCXX/err-missing-noreturn-2.cpp new file mode 100644 index 000000000000..b1888fbb33cc --- /dev/null +++ clang/test/SemaCXX/err-missing-noreturn-2.cpp @@ -0,0 +1,15 @@ +// RUN: %clang_cc1 -fsyntax-only -verify %s -Wmissing-noreturn -Wreturn-type +// expected-no-diagnostics + +namespace GH63009 { +struct S1 { + [[noreturn]] S1(); +}; + +int foo(); + +int test_1() { + S1 s1; + foo(); +} +} diff --git clang/test/SemaCXX/fold_lambda_with_variadics.cpp clang/test/SemaCXX/fold_lambda_with_variadics.cpp index 2257a4c2d975..69572bea3664 100644 --- clang/test/SemaCXX/fold_lambda_with_variadics.cpp +++ clang/test/SemaCXX/fold_lambda_with_variadics.cpp @@ -7,6 +7,8 @@ struct identity { using type = T; }; +template <class> using ElementType = int; + template <class = void> void f() { static_assert([]<class... Is>(Is... x) { @@ -47,6 +49,10 @@ template <class = void> void f() { }(), ...); }(1, 2); + []<class... Is>(Is...) 
{ + ([] { using T = ElementType<Is>; }(), ...); + }(1); + [](auto ...y) { ([y] { }(), ...); }(); diff --git clang/test/SemaCXX/reinterpret-cast.cpp clang/test/SemaCXX/reinterpret-cast.cpp index 45332fd15b5d..bfb808773b90 100644 --- clang/test/SemaCXX/reinterpret-cast.cpp +++ clang/test/SemaCXX/reinterpret-cast.cpp @@ -302,3 +302,77 @@ void reinterpret_cast_allowlist () { (void)reinterpret_cast<unsigned char&>(b); (void)*reinterpret_cast<unsigned char*>(&b); } + +namespace templated { +template <typename TARGETTYPE, typename UATYPE> +void cast_uninstantiated() { + const UATYPE* data; + (void)*reinterpret_cast<const TARGETTYPE*>(data); // no warning +} + + +template <typename TARGETTYPE, typename UATYPE> +void cast_instantiated_badly() { + const UATYPE* data; + (void)*reinterpret_cast<const TARGETTYPE*>(data); // expected-warning {{dereference of type 'const int *' that was reinterpret_cast from type 'const float *' has undefined behavior}} +} + +template <typename TARGETTYPE, typename UATYPE> +void cast_instantiated_well() { + const UATYPE* data; + (void)*reinterpret_cast<const TARGETTYPE*>(data); // no warning +} + +template <typename TARGETTYPE> +void cast_one_tmpl_arg_uninstantiated() { + const int* data; + (void)*reinterpret_cast<const TARGETTYPE*>(data); // no warning +} + +template <typename TARGETTYPE> +void cast_one_tmpl_arg_instantiated_badly() { + const float* data; + (void)*reinterpret_cast<const TARGETTYPE*>(data); // expected-warning {{dereference of type 'const int *' that was reinterpret_cast from type 'const float *' has undefined behavior}} +} + +template <typename TARGETTYPE> +void cast_one_tmpl_arg_instantiated_well() { + const float* data; + (void)*reinterpret_cast<const TARGETTYPE*>(data); // no warning +} + +template <int size> +void cast_nontype_template_true_positive_noninstantiated() { + const float *data; + const int arr[size]; + (void)*reinterpret_cast<const int*>(data); // expected-warning {{dereference of type 'const int *' that was reinterpret_cast from type 'const float *' has undefined behavior}} +} + +template <int size> +void cast_nontype_template_true_negative_noninstantiated() { + const int data[size]; + (void)*reinterpret_cast<const int*>(data); // no warning +} + +void top() { + cast_instantiated_badly<int, float>(); + // expected-note@-1 {{in instantiation of function template specialization 'templated::cast_instantiated_badly<int, float>' requested here}} + cast_instantiated_well<int, int>(); + cast_one_tmpl_arg_instantiated_badly<int>(); + // expected-note@-1 {{in instantiation of function template specialization 'templated::cast_one_tmpl_arg_instantiated_badly<int>' requested here}} + cast_one_tmpl_arg_instantiated_well<float>(); +} + +template<typename T, typename U> +void cast_template_dependent_type_noninstantiated(T** x) +{ + (void)*reinterpret_cast<U**>(x); +} + +template<typename T, typename U> +void cast_template_dependent_member_type_noninstantiated(typename T::X x) +{ + (void)*reinterpret_cast<typename U::Y>(x); +} + +} // namespace templated diff --git clang/test/SemaCXX/return-noreturn.cpp clang/test/SemaCXX/return-noreturn.cpp index b88e5a519d1b..873e4c7e12f2 100644 --- clang/test/SemaCXX/return-noreturn.cpp +++ clang/test/SemaCXX/return-noreturn.cpp @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 %s -fsyntax-only -fcxx-exceptions -verify -Wreturn-type -Wmissing-noreturn -Wno-unreachable-code -Wno-covered-switch-default -// RUN: %clang_cc1 %s -fsyntax-only -fcxx-exceptions -std=c++11 -verify -Wreturn-type -Wmissing-noreturn -Wno-unreachable-code 
-Wno-covered-switch-default +// RUN: %clang_cc1 %s -fsyntax-only -fcxx-exceptions -verify -Wreturn-type -Wno-error=return-type -Wmissing-noreturn -Wno-unreachable-code -Wno-covered-switch-default +// RUN: %clang_cc1 %s -fsyntax-only -fcxx-exceptions -std=c++11 -verify -Wreturn-type -Wno-error=return-type -Wmissing-noreturn -Wno-unreachable-code -Wno-covered-switch-default // A destructor may be marked noreturn and should still influence the CFG. void pr6884_abort() __attribute__((noreturn)); diff --git clang/test/SemaCXX/warn-missing-noreturn.cpp clang/test/SemaCXX/warn-missing-noreturn.cpp index 32b49e0a325f..208a0fae68cd 100644 --- clang/test/SemaCXX/warn-missing-noreturn.cpp +++ clang/test/SemaCXX/warn-missing-noreturn.cpp @@ -88,10 +88,6 @@ struct rdar8875247 { }; void rdar8875247_aux(); -int rdar8875247_test() { - rdar8875247 f; -} // expected-warning{{non-void function does not return a value}} - struct rdar8875247_B { rdar8875247_B(); ~rdar8875247_B(); @@ -124,21 +120,12 @@ namespace PR10801 { } namespace GH63009 { -struct S1 { - [[noreturn]] S1(); -}; - struct S2 { [[noreturn]] ~S2(); }; int foo(); -int test_1() { - S1 s1; - foo(); -} - int test_2() { S2 s2; foo(); diff --git clang/test/SemaCXX/warn-unsafe-buffer-usage-array.cpp clang/test/SemaCXX/warn-unsafe-buffer-usage-array.cpp index 7dd6c83dbba2..e80b54b7c696 100644 --- clang/test/SemaCXX/warn-unsafe-buffer-usage-array.cpp +++ clang/test/SemaCXX/warn-unsafe-buffer-usage-array.cpp @@ -92,3 +92,35 @@ char access_strings() { c = array_string[5]; return c; } + +struct T { + int array[10]; +}; + +const int index = 1; + +constexpr int get_const(int x) { + if(x < 3) + return ++x; + else + return x + 5; +}; + +void array_indexed_const_expr(unsigned idx) { + // expected-note@+2 {{change type of 'arr' to 'std::array' to label it for hardening}} + // expected-warning@+1{{'arr' is an unsafe buffer that does not perform bounds checks}} + int arr[10]; + arr[sizeof(int)] = 5; + + int array[sizeof(T)]; + array[sizeof(int)] = 5; + array[sizeof(T) -1 ] = 3; + + int k = arr[6 & 5]; + k = arr[2 << index]; + k = arr[8 << index]; // expected-note {{used in buffer access here}} + k = arr[16 >> 1]; + k = arr[get_const(index)]; + k = arr[get_const(5)]; // expected-note {{used in buffer access here}} + k = arr[get_const(4)]; +} diff --git clang/test/SemaCXX/warn-unsafe-buffer-usage-fixits-add-assign.cpp clang/test/SemaCXX/warn-unsafe-buffer-usage-fixits-add-assign.cpp index 1484f7e9d36c..007b3a43a3d9 100644 --- clang/test/SemaCXX/warn-unsafe-buffer-usage-fixits-add-assign.cpp +++ clang/test/SemaCXX/warn-unsafe-buffer-usage-fixits-add-assign.cpp @@ -46,7 +46,7 @@ void add_assign_test(unsigned int n, int *a, int y) { // CHECK-NOT: fix-it:"{{.*}}":{[[@LINE-1]]:5-[[@LINE-1]]:9}:"p = p.subspan(" } -int expr_test(unsigned x, int *q, int y) { +void expr_test(unsigned x, int *q, int y) { char *p = new char[8]; // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:9}:"std::span<char> " // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:13-[[@LINE-2]]:13}:"{" diff --git clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl index 56c8b32cc14e..c77a07602b39 100644 --- clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl +++ clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl @@ -20,16 +20,38 @@ float2 test_lerp_no_second_arg(float2 p0) { // expected-error@-1 {{no matching function for call to 'lerp'}} } -float2 test_lerp_vector_size_mismatch(float3 p0, float2 p1) { +float2 test_lerp_vector_trunc_warn1(float3 p0) { + return lerp(p0, p0, p0); + // 
expected-warning@-1 {{implicit conversion truncates vector: 'float3' (aka 'vector<float, 3>') to 'vector<float, 2>' (vector of 2 'float' values)}} +} + +float2 test_lerp_vector_trunc_warn2(float3 p0, float2 p1) { return lerp(p0, p0, p1); // expected-warning@-1 {{implicit conversion truncates vector: 'float3' (aka 'vector<float, 3>') to 'vector<float, 2>' (vector of 2 'float' values)}} + // expected-warning@-2 {{implicit conversion truncates vector: 'float3' (aka 'vector<float, 3>') to 'vector<float, 2>' (vector of 2 'float' values)}} +} + +float2 test_lerp_vector_trunc_warn3(float3 p0, float2 p1) { + return lerp(p0, p1, p0); + // expected-warning@-1 {{implicit conversion truncates vector: 'float3' (aka 'vector<float, 3>') to 'vector<float, 2>' (vector of 2 'float' values)}} + // expected-warning@-2 {{implicit conversion truncates vector: 'float3' (aka 'vector<float, 3>') to 'vector<float, 2>' (vector of 2 'float' values)}} } -float2 test_lerp_builtin_vector_size_mismatch(float3 p0, float2 p1) { +float2 test_lerp_builtin_vector_size_mismatch_Arg1(float3 p0, float2 p1) { return __builtin_hlsl_lerp(p0, p1, p1); // expected-error@-1 {{all arguments to '__builtin_hlsl_lerp' must have the same type}} } +float2 test_lerp_builtin_vector_size_mismatch_Arg2(float3 p0, float2 p1) { + return __builtin_hlsl_lerp(p1, p0, p1); + // expected-error@-1 {{all arguments to '__builtin_hlsl_lerp' must have the same type}} +} + +float2 test_lerp_builtin_vector_size_mismatch_Arg3(float3 p0, float2 p1) { + return __builtin_hlsl_lerp(p1, p1, p0); + // expected-error@-1 {{all arguments to '__builtin_hlsl_lerp' must have the same type}} +} + float test_lerp_scalar_mismatch(float p0, half p1) { return lerp(p1, p0, p1); // expected-error@-1 {{call to 'lerp' is ambiguous}} @@ -45,6 +67,16 @@ float2 test_builtin_lerp_float2_splat(float p0, float2 p1) { // expected-error@-1 {{all arguments to '__builtin_hlsl_lerp' must be vectors}} } +float2 test_builtin_lerp_float2_splat2(double p0, double2 p1) { + return __builtin_hlsl_lerp(p1, p0, p1); + // expected-error@-1 {{all arguments to '__builtin_hlsl_lerp' must be vectors}} +} + +float2 test_builtin_lerp_float2_splat3(double p0, double2 p1) { + return __builtin_hlsl_lerp(p1, p1, p0); + // expected-error@-1 {{all arguments to '__builtin_hlsl_lerp' must be vectors}} +} + float3 test_builtin_lerp_float3_splat(float p0, float3 p1) { return __builtin_hlsl_lerp(p0, p1, p1); // expected-error@-1 {{all arguments to '__builtin_hlsl_lerp' must be vectors}} diff --git clang/test/SemaObjC/return-noreturn.m clang/test/SemaObjC/return-noreturn.m index c7735ca211e6..c7b611559d86 100644 --- clang/test/SemaObjC/return-noreturn.m +++ clang/test/SemaObjC/return-noreturn.m @@ -1,7 +1,7 @@ -// RUN: %clang_cc1 %s -fsyntax-only -fobjc-exceptions -verify -Wreturn-type -Wmissing-noreturn +// RUN: %clang_cc1 %s -fsyntax-only -fobjc-exceptions -verify -Wreturn-type -Wmissing-noreturn -Werror=return-type id f(id self) { -} // expected-warning {{non-void function does not return a value}} +} // expected-error {{non-void function does not return a value}} id f2(id self) { @try { diff --git clang/test/SemaObjC/try-catch.m clang/test/SemaObjC/try-catch.m index 0bea7a5150a4..1216b540a8d2 100644 --- clang/test/SemaObjC/try-catch.m +++ clang/test/SemaObjC/try-catch.m @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsyntax-only -verify -fobjc-exceptions %s +// RUN: %clang_cc1 -fsyntax-only -verify -fobjc-exceptions -Werror=return-type %s typedef signed char BOOL; typedef struct _NSZone NSZone; @@ -34,7 +34,7 @@ typedef 
struct _NSZone NSZone; @try {} // the exception name is optional (weird) @catch (NSException *) {} -} // expected-warning {{non-void function does not return a value}} +} // expected-error {{non-void function does not return a value}} - (NSDictionary *)anotherFunction { @try {} diff --git clang/test/SemaTemplate/concepts-lambda.cpp clang/test/SemaTemplate/concepts-lambda.cpp index 829a71bc703f..306f86cfcb28 100644 --- clang/test/SemaTemplate/concepts-lambda.cpp +++ clang/test/SemaTemplate/concepts-lambda.cpp @@ -294,3 +294,16 @@ void foo() { } } // namespace GH110721 + +namespace GH123441 { + +void test() { + auto L = [](auto... x) { + return [](decltype(x)... y) + requires true + {}; + }; + L(0, 1)(1, 2); +} + +} diff --git clang/test/SemaTemplate/late-parsing-eager-instantiation.cpp clang/test/SemaTemplate/late-parsing-eager-instantiation.cpp index 0e654768787a..90f4d3261c5a 100644 --- clang/test/SemaTemplate/late-parsing-eager-instantiation.cpp +++ clang/test/SemaTemplate/late-parsing-eager-instantiation.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -std=c++14 -verify %s +// RUN: %clang_cc1 -Wno-error=return-type -std=c++14 -verify %s // pr33561 class ArrayBuffer; diff --git clang/tools/scan-build/bin/scan-build clang/tools/scan-build/bin/scan-build index 37241c6d85c5..b90e635d3175 100755 --- clang/tools/scan-build/bin/scan-build +++ clang/tools/scan-build/bin/scan-build @@ -820,7 +820,8 @@ ENDTEXT } # Emit the "View" link. - print OUT "<td><a href=\"$ReportFile#EndPath\">View Report</a></td>"; + my $EncodedReport = URLEscape($ReportFile); + print OUT "<td><a href=\"$EncodedReport#EndPath\">View Report</a></td>"; # Emit REPORTBUG markers. print OUT "\n<!-- REPORTBUG id=\"$ReportFile\" -->\n"; @@ -1465,6 +1466,16 @@ sub HtmlEscape { return $tmp; } +##----------------------------------------------------------------------------## +# URLEscape - encode characters that are special in URLs +##----------------------------------------------------------------------------## + +sub URLEscape { + my $arg = shift || ''; + $arg =~ s/\+/%2B/g; + return $arg; +} + ##----------------------------------------------------------------------------## # ShellEscape - backslash escape characters that are special to the shell ##----------------------------------------------------------------------------## diff --git clang/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp clang/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp index 92ec79d12657..5e1c12ba26d8 100644 --- clang/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp +++ clang/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp @@ -4329,7 +4329,7 @@ TEST_P(ASTMatchersTest, hasOperator) { TEST_P(ASTMatchersTest, IsMain) { EXPECT_TRUE(matches("int main() {}", functionDecl(isMain()))); - EXPECT_TRUE(notMatches("int main2() {}", functionDecl(isMain()))); + EXPECT_TRUE(notMatches("int main2() { return 0; }", functionDecl(isMain()))); } TEST_P(ASTMatchersTest, OMPExecutableDirective_IsStandaloneDirective) { diff --git clang/unittests/ASTMatchers/ASTMatchersTraversalTest.cpp clang/unittests/ASTMatchers/ASTMatchersTraversalTest.cpp index 75d6ca5ba17f..068cf6677102 100644 --- clang/unittests/ASTMatchers/ASTMatchersTraversalTest.cpp +++ clang/unittests/ASTMatchers/ASTMatchersTraversalTest.cpp @@ -665,6 +665,7 @@ void check_match_co_return() { #include <coro_header> void check_match_co_await() { co_await a; + co_return 1; } )cpp"; EXPECT_TRUE(matchesConditionally(CoAwaitCode, @@ -674,6 +675,7 @@ void check_match_co_await() { #include <coro_header> void 
check_match_co_yield() { co_yield 1.0; + co_return 1; } )cpp"; EXPECT_TRUE(matchesConditionally(CoYieldCode, @@ -714,7 +716,7 @@ void coro() { void coro() try { int thevar; co_return 1; -} catch (...) {} +} catch (...) { co_return 1; } )cpp"; EXPECT_TRUE(matchesConditionally( CoroWithTryCatchDeclCode, diff --git clang/unittests/Analysis/CFGBuildResult.h clang/unittests/Analysis/CFGBuildResult.h index 72ad1cc7ce40..0d5539005840 100644 --- clang/unittests/Analysis/CFGBuildResult.h +++ clang/unittests/Analysis/CFGBuildResult.h @@ -65,8 +65,8 @@ public: template <typename FuncMatcherT = ast_matchers::internal::TrueMatcher> BuildResult BuildCFG(const char *Code, CFG::BuildOptions Options = {}, FuncMatcherT FuncMatcher = ast_matchers::anything()) { - std::vector<std::string> Args = {"-std=c++11", - "-fno-delayed-template-parsing"}; + const std::vector<std::string> Args = { + "-std=c++11", "-fno-delayed-template-parsing", "-Wno-everything"}; std::unique_ptr<ASTUnit> AST = tooling::buildASTFromCodeWithArgs(Code, Args); if (!AST) return BuildResult::ToolFailed; diff --git clang/unittests/Analysis/CFGTest.cpp clang/unittests/Analysis/CFGTest.cpp index 2b27da008142..46a6751391cf 100644 --- clang/unittests/Analysis/CFGTest.cpp +++ clang/unittests/Analysis/CFGTest.cpp @@ -195,7 +195,6 @@ TEST(CFG, ElementRefIterator) { // Reverse, non-const version Index = MainBlockSize; for (CFGBlock::CFGElementRef ElementRef : MainBlock->rrefs()) { - llvm::errs() << Index << '\n'; EXPECT_EQ(ElementRef.getParent(), MainBlock); EXPECT_EQ(ElementRef.getIndexInBlock(), Index); EXPECT_TRUE(ElementRef->getAs<CFGStmt>()); diff --git clang/unittests/Format/ConfigParseTest.cpp clang/unittests/Format/ConfigParseTest.cpp index 1f0beafaad7f..9746aa354784 100644 --- clang/unittests/Format/ConfigParseTest.cpp +++ clang/unittests/Format/ConfigParseTest.cpp @@ -176,6 +176,7 @@ TEST(ConfigParseTest, ParsesConfigurationBools) { CHECK_PARSE_BOOL(IndentAccessModifiers); CHECK_PARSE_BOOL(IndentCaseBlocks); CHECK_PARSE_BOOL(IndentCaseLabels); + CHECK_PARSE_BOOL(IndentExportBlock); CHECK_PARSE_BOOL(IndentGotoLabels); CHECK_PARSE_BOOL(IndentRequiresClause); CHECK_PARSE_BOOL_FIELD(IndentRequiresClause, "IndentRequires"); diff --git clang/unittests/Format/FormatTest.cpp clang/unittests/Format/FormatTest.cpp index 4d48bcacddea..61aa140dfdc9 100644 --- clang/unittests/Format/FormatTest.cpp +++ clang/unittests/Format/FormatTest.cpp @@ -5732,23 +5732,12 @@ TEST_F(FormatTest, HashInMacroDefinition) { verifyFormat("#define A void # ## #", getLLVMStyleWithColumns(22)); -#if 0 - // FIXME: The correct format is: verifyFormat("{\n" " {\n" "#define GEN_ID(_x) char *_x{#_x}\n" " GEN_ID(one);\n" " }\n" "}"); -#endif - verifyFormat("{\n" - " {\n" - "#define GEN_ID(_x) \\\n" - " char *_x { #_x }\n" - " GEN_ID(one);\n" - " }\n" - "}", - getGoogleStyle()); } TEST_F(FormatTest, RespectWhitespaceInMacroDefinitions) { @@ -9070,6 +9059,121 @@ TEST_F(FormatTest, AdaptiveOnePerLineFormatting) { Style); } +TEST_F(FormatTest, IndentExportBlock) { + FormatStyle Style = getLLVMStyleWithColumns(80); + Style.IndentExportBlock = true; + verifyFormat("export {\n" + " int x;\n" + " int y;\n" + "}", + "export {\n" + "int x;\n" + "int y;\n" + "}", + Style); + + Style.IndentExportBlock = false; + verifyFormat("export {\n" + "int x;\n" + "int y;\n" + "}", + "export {\n" + " int x;\n" + " int y;\n" + "}", + Style); +} + +TEST_F(FormatTest, ShortExportBlocks) { + FormatStyle Style = getLLVMStyleWithColumns(80); + Style.IndentExportBlock = false; + + 
Style.AllowShortBlocksOnASingleLine = FormatStyle::SBS_Never; + verifyFormat("export {\n" + "}", + Style); + + verifyFormat("export {\n" + "int x;\n" + "}", + Style); + + verifyFormat("export {\n" + "int x;\n" + "}", + "export\n" + "{\n" + "int x;\n" + "}", + Style); + + verifyFormat("export {\n" + "}", + "export {}", Style); + + verifyFormat("export {\n" + "int x;\n" + "}", + "export { int x; }", Style); + + Style.AllowShortBlocksOnASingleLine = FormatStyle::SBS_Always; + verifyFormat("export {}", + "export {\n" + "}", + Style); + + verifyFormat("export { int x; }", + "export {\n" + "int x;\n" + "}", + Style); + + verifyFormat("export { int x; }", + "export\n" + "{\n" + "int x;\n" + "}", + Style); + + verifyFormat("export {}", + "export {\n" + "}", + Style); + + verifyFormat("export { int x; }", + "export {\n" + "int x;\n" + "}", + Style); + + Style.AllowShortBlocksOnASingleLine = FormatStyle::SBS_Empty; + verifyFormat("export {}", + "export {\n" + "}", + Style); + + verifyFormat("export {\n" + "int x;\n" + "}", + Style); + + verifyFormat("export {\n" + "int x;\n" + "}", + "export\n" + "{\n" + "int x;\n" + "}", + Style); + + verifyFormat("export {}", Style); + + verifyFormat("export {\n" + "int x;\n" + "}", + "export { int x; }", Style); +} + TEST_F(FormatTest, FormatsBuilderPattern) { verifyFormat("return llvm::StringSwitch<Reference::Kind>(name)\n" " .StartsWith(\".eh_frame_hdr\", ORDER_EH_FRAMEHDR)\n" @@ -27987,6 +28091,11 @@ TEST_F(FormatTest, BreakBinaryOperations) { " operand1 + operand2 - (operand3 + operand4);", Style); + // Check operator>> special case. + verifyFormat("std::cin >> longOperand_1 >> longOperand_2 >>\n" + " longOperand_3_;", + Style); + Style.BreakBinaryOperations = FormatStyle::BBO_OnePerLine; // Logical operations @@ -28065,6 +28174,13 @@ TEST_F(FormatTest, BreakBinaryOperations) { " operand6->member;", Style); + // Check operator>> special case. + verifyFormat("std::cin >>\n" + " longOperand_1 >>\n" + " longOperand_2 >>\n" + " longOperand_3_;", + Style); + Style.BreakBinaryOperations = FormatStyle::BBO_RespectPrecedence; verifyFormat("result = op1 + op2 * op3 - op4;", Style); @@ -28090,6 +28206,13 @@ TEST_F(FormatTest, BreakBinaryOperations) { " byte_buffer[3] << 24;", Style); + // Check operator>> special case. + verifyFormat("std::cin >>\n" + " longOperand_1 >>\n" + " longOperand_2 >>\n" + " longOperand_3_;", + Style); + Style.BreakBinaryOperations = FormatStyle::BBO_OnePerLine; Style.BreakBeforeBinaryOperators = FormatStyle::BOS_NonAssignment; @@ -28164,6 +28287,13 @@ TEST_F(FormatTest, BreakBinaryOperations) { " << 24;", Style); + // Check operator>> special case. + verifyFormat("std::cin\n" + " >> longOperand_1\n" + " >> longOperand_2\n" + " >> longOperand_3_;", + Style); + Style.BreakBinaryOperations = FormatStyle::BBO_RespectPrecedence; verifyFormat("result = op1 + op2 * op3 - op4;", Style); @@ -28188,6 +28318,13 @@ TEST_F(FormatTest, BreakBinaryOperations) { " | byte_buffer[2] << 16\n" " | byte_buffer[3] << 24;", Style); + + // Check operator>> special case. 
+ verifyFormat("std::cin\n" + " >> longOperand_1\n" + " >> longOperand_2\n" + " >> longOperand_3_;", + Style); } TEST_F(FormatTest, RemoveEmptyLinesInUnwrappedLines) { diff --git clang/unittests/Format/TokenAnnotatorTest.cpp clang/unittests/Format/TokenAnnotatorTest.cpp index 399502db52cb..9ac60ce73750 100644 --- clang/unittests/Format/TokenAnnotatorTest.cpp +++ clang/unittests/Format/TokenAnnotatorTest.cpp @@ -3413,14 +3413,27 @@ TEST_F(TokenAnnotatorTest, BraceKind) { EXPECT_BRACE_KIND(Tokens[0], BK_Block); EXPECT_TOKEN(Tokens[1], tok::l_brace, TT_BlockLBrace); EXPECT_BRACE_KIND(Tokens[1], BK_Block); -#if 0 - // FIXME: EXPECT_BRACE_KIND(Tokens[11], BK_BracedInit); EXPECT_BRACE_KIND(Tokens[14], BK_BracedInit); -#endif EXPECT_BRACE_KIND(Tokens[20], BK_Block); EXPECT_BRACE_KIND(Tokens[21], BK_Block); + Tokens = annotate("{\n" + "#define FOO \\\n" + " { \\\n" + " case bar: { \\\n" + " break; \\\n" + " } \\\n" + " }\n" + "}"); + ASSERT_EQ(Tokens.size(), 15u) << Tokens; + EXPECT_TOKEN(Tokens[4], tok::l_brace, TT_BlockLBrace); + EXPECT_BRACE_KIND(Tokens[4], BK_Block); + EXPECT_TOKEN(Tokens[7], tok::colon, TT_CaseLabelColon); + EXPECT_BRACE_KIND(Tokens[8], BK_Block); + EXPECT_BRACE_KIND(Tokens[11], BK_Block); + EXPECT_BRACE_KIND(Tokens[12], BK_Block); + Tokens = annotate("a = class extends goog.a {};", getGoogleStyle(FormatStyle::LK_JavaScript)); ASSERT_EQ(Tokens.size(), 11u) << Tokens; diff --git clang/unittests/Sema/CMakeLists.txt clang/unittests/Sema/CMakeLists.txt index 7ded562e8edf..17d39408000a 100644 --- clang/unittests/Sema/CMakeLists.txt +++ clang/unittests/Sema/CMakeLists.txt @@ -6,6 +6,7 @@ set(LLVM_LINK_COMPONENTS add_clang_unittest(SemaTests ExternalSemaSourceTest.cpp CodeCompleteTest.cpp + HeuristicResolverTest.cpp GslOwnerPointerInference.cpp SemaLookupTest.cpp SemaNoloadLookupTest.cpp diff --git clang-tools-extra/clangd/unittests/HeuristicResolverTests.cpp clang/unittests/Sema/HeuristicResolverTest.cpp similarity index 96% rename from clang-tools-extra/clangd/unittests/HeuristicResolverTests.cpp rename to clang/unittests/Sema/HeuristicResolverTest.cpp index e4b3822fc7eb..2cd5486b3227 100644 --- clang-tools-extra/clangd/unittests/HeuristicResolverTests.cpp +++ clang/unittests/Sema/HeuristicResolverTest.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -#include "HeuristicResolver.h" +#include "clang/Sema/HeuristicResolver.h" #include "clang/ASTMatchers/ASTMatchFinder.h" #include "clang/ASTMatchers/ASTMatchers.h" #include "clang/Tooling/Tooling.h" @@ -13,7 +13,6 @@ #include "gtest/gtest.h" using namespace clang::ast_matchers; -using clang::clangd::HeuristicResolver; using testing::ElementsAre; namespace clang { @@ -136,6 +135,26 @@ TEST(HeuristicResolver, MemberExpr_SmartPointer) { cxxMethodDecl(hasName("foo")).bind("output")); } +TEST(HeuristicResolver, MemberExpr_SmartPointer_Qualified) { + std::string Code = R"cpp( + template <typename> struct Waldo { + void find(); + void find() const; + }; + template <typename T> struct unique_ptr { + T* operator->(); + }; + template <typename T> + void test(unique_ptr<const Waldo<T>>& w) { + w->find(); + } + )cpp"; + expectResolution( + Code, &HeuristicResolver::resolveMemberExpr, + cxxDependentScopeMemberExpr(hasMemberName("find")).bind("input"), + cxxMethodDecl(hasName("find"), isConst()).bind("output")); +} + TEST(HeuristicResolver, MemberExpr_Chained) { std::string Code = R"cpp( struct A { void foo() {} }; diff --git 
clang/unittests/Tooling/ASTSelectionTest.cpp clang/unittests/Tooling/ASTSelectionTest.cpp index 1897bc15196e..0f60749725cd 100644 --- clang/unittests/Tooling/ASTSelectionTest.cpp +++ clang/unittests/Tooling/ASTSelectionTest.cpp @@ -384,7 +384,7 @@ TEST(ASTSelectionFinder, SelectionInFunctionInObjCImplementation) { @end @implementation I -int notSelected() { } +int notSelected() { return 0; } int selected(int x) { return x; diff --git clang/unittests/Tooling/LexicallyOrderedRecursiveASTVisitorTest.cpp clang/unittests/Tooling/LexicallyOrderedRecursiveASTVisitorTest.cpp index b167eb4b8117..c0833dc4a640 100644 --- clang/unittests/Tooling/LexicallyOrderedRecursiveASTVisitorTest.cpp +++ clang/unittests/Tooling/LexicallyOrderedRecursiveASTVisitorTest.cpp @@ -107,7 +107,7 @@ TEST(LexicallyOrderedRecursiveASTVisitor, VisitDeclsInImplementation) { @end @implementation I -int nestedFunction() { } +void nestedFunction() { } - (void) method{ } diff --git clang/unittests/Tooling/Syntax/BuildTreeTest.cpp clang/unittests/Tooling/Syntax/BuildTreeTest.cpp index a551f83ff3f9..d58e190923a1 100644 --- clang/unittests/Tooling/Syntax/BuildTreeTest.cpp +++ clang/unittests/Tooling/Syntax/BuildTreeTest.cpp @@ -4604,7 +4604,7 @@ TEST_P(BuildSyntaxTreeTest, ConstructorCall_DefaultArguments) { struct X { X(int i = 1, char c = '2'); }; -X test() { +void test() { auto x0 = [[X()]]; auto x1 = [[X(1)]]; auto x2 = [[X(1, '2')]]; diff --git clang/utils/TableGen/SveEmitter.cpp clang/utils/TableGen/SveEmitter.cpp index 35477cfc3cf4..0ecbf7cede1d 100644 --- clang/utils/TableGen/SveEmitter.cpp +++ clang/utils/TableGen/SveEmitter.cpp @@ -1050,7 +1050,7 @@ std::string Intrinsic::replaceTemplatedArgs(std::string Name, TypeSpec TS, else if (T.isBFloat()) TypeCode = "bf"; else if (T.isMFloat()) - TypeCode = "mfp"; + TypeCode = "mf"; else TypeCode = 'f'; Ret.replace(Pos, NumChars, TypeCode + utostr(T.getElementSizeInBits())); diff --git compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake index ab5d55a9a35c..ee5be276f3df 100644 --- compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake +++ compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake @@ -110,7 +110,7 @@ set(ALL_SHADOWCALLSTACK_SUPPORTED_ARCH ${ARM64}) if (UNIX) if (OS_NAME MATCHES "Linux") - set(ALL_ORC_SUPPORTED_ARCH ${X86_64} ${ARM64} ${ARM32} ${PPC64}) + set(ALL_ORC_SUPPORTED_ARCH ${X86_64} ${ARM64} ${ARM32} ${PPC64} ${LOONGARCH64}) else() set(ALL_ORC_SUPPORTED_ARCH ${X86_64} ${ARM64} ${ARM32}) endif() diff --git compiler-rt/lib/interception/interception_win.cpp compiler-rt/lib/interception/interception_win.cpp index 7a1a47a78dbc..0841161ee5c4 100644 --- compiler-rt/lib/interception/interception_win.cpp +++ compiler-rt/lib/interception/interception_win.cpp @@ -651,6 +651,10 @@ static size_t GetInstructionSize(uptr address, size_t* rel_offset = nullptr) { case 0xD284: // 84 D2 : test dl,dl return 2; + case 0x3980: // 80 39 XX : cmp BYTE PTR [rcx], XX + case 0x4D8B: // 8B 4D XX : mov XX(%ebp), ecx + case 0x558B: // 8B 55 XX : mov XX(%ebp), edx + case 0x758B: // 8B 75 XX : mov XX(%ebp), esp case 0xE483: // 83 E4 XX : and esp, XX case 0xEC83: // 83 EC XX : sub esp, XX case 0xC1F6: // F6 C1 XX : test cl, XX @@ -757,6 +761,9 @@ static size_t GetInstructionSize(uptr address, size_t* rel_offset = nullptr) { case 0xc1ff48: // 48 ff c1 : inc rcx case 0xc1ff49: // 49 ff c1 : inc r9 case 0xc28b41: // 41 8b c2 : mov eax, r10d + case 0x01b60f: // 0f b6 01 : movzx eax, BYTE PTR [rcx] + case 0x09b60f: // 0f b6 09 : movzx ecx, BYTE PTR 
[rcx] + case 0x11b60f: // 0f b6 11 : movzx edx, BYTE PTR [rcx] case 0xc2b60f: // 0f b6 c2 : movzx eax, dl case 0xc2ff48: // 48 ff c2 : inc rdx case 0xc2ff49: // 49 ff c2 : inc r10 @@ -775,6 +782,7 @@ static size_t GetInstructionSize(uptr address, size_t* rel_offset = nullptr) { case 0xc98548: // 48 85 c9 : test rcx, rcx case 0xc9854d: // 4d 85 c9 : test r9, r9 case 0xc98b4c: // 4c 8b c9 : mov r9, rcx + case 0xd12948: // 48 29 d1 : sub rcx, rdx case 0xca2b48: // 48 2b ca : sub rcx, rdx case 0xca3b48: // 48 3b ca : cmp rcx, rdx case 0xd12b48: // 48 2b d1 : sub rdx, rcx @@ -784,16 +792,33 @@ static size_t GetInstructionSize(uptr address, size_t* rel_offset = nullptr) { case 0xd2854d: // 4d 85 d2 : test r10, r10 case 0xd28b4c: // 4c 8b d2 : mov r10, rdx case 0xd2b60f: // 0f b6 d2 : movzx edx, dl + case 0xd2be0f: // 0f be d2 : movsx edx, dl case 0xd98b4c: // 4c 8b d9 : mov r11, rcx case 0xd9f748: // 48 f7 d9 : neg rcx + case 0xc03145: // 45 31 c0 : xor r8d,r8d + case 0xc93145: // 45 31 c9 : xor r9d,r9d case 0xdb3345: // 45 33 db : xor r11d, r11d + case 0xc08445: // 45 84 c0 : test r8b,r8b + case 0xd28445: // 45 84 d2 : test r10b,r10b case 0xdb8548: // 48 85 db : test rbx, rbx case 0xdb854d: // 4d 85 db : test r11, r11 case 0xdc8b4c: // 4c 8b dc : mov r11, rsp case 0xe48548: // 48 85 e4 : test rsp, rsp case 0xe4854d: // 4d 85 e4 : test r12, r12 + case 0xc88948: // 48 89 c8 : mov rax,rcx + case 0xcb8948: // 48 89 cb : mov rbx,rcx + case 0xd08948: // 48 89 d0 : mov rax,rdx + case 0xd18948: // 48 89 d1 : mov rcx,rdx + case 0xd38948: // 48 89 d3 : mov rbx,rdx case 0xe58948: // 48 89 e5 : mov rbp, rsp case 0xed8548: // 48 85 ed : test rbp, rbp + case 0xc88949: // 49 89 c8 : mov r8, rcx + case 0xc98949: // 49 89 c9 : mov r9, rcx + case 0xca8949: // 49 89 ca : mov r10,rcx + case 0xd08949: // 49 89 d0 : mov r8, rdx + case 0xd18949: // 49 89 d1 : mov r9, rdx + case 0xd28949: // 49 89 d2 : mov r10, rdx + case 0xd38949: // 49 89 d3 : mov r11, rdx case 0xed854d: // 4d 85 ed : test r13, r13 case 0xf6854d: // 4d 85 f6 : test r14, r14 case 0xff854d: // 4d 85 ff : test r15, r15 diff --git compiler-rt/lib/interception/tests/interception_win_test.cpp compiler-rt/lib/interception/tests/interception_win_test.cpp index e0258a3d0bd5..9d8dbfcb4285 100644 --- compiler-rt/lib/interception/tests/interception_win_test.cpp +++ compiler-rt/lib/interception/tests/interception_win_test.cpp @@ -857,8 +857,12 @@ const struct InstructionSizeData { { 2, {0x8B, 0xC1}, 0, "8B C1 : mov eax, ecx"}, { 2, {0x8B, 0xEC}, 0, "8B EC : mov ebp, esp"}, { 2, {0x8B, 0xFF}, 0, "8B FF : mov edi, edi"}, + { 3, {0x80, 0x39, 0x72}, 0, "80 39 XX : cmp BYTE PTR [rcx], XX"}, { 3, {0x83, 0xE4, 0x72}, 0, "83 E4 XX : and esp, XX"}, { 3, {0x83, 0xEC, 0x72}, 0, "83 EC XX : sub esp, XX"}, + { 3, {0x8B, 0x4D, 0x72}, 0, "8B 4D XX : mov XX(%ebp), ecx"}, + { 3, {0x8B, 0x55, 0x72}, 0, "8B 55 XX : mov XX(%ebp), edx"}, + { 3, {0x8B, 0x75, 0x72}, 0, "8B 75 XX : mov XX(%ebp), esp"}, { 3, {0xc2, 0x71, 0x72}, 0, "C2 XX XX : ret XX (needed for registering weak functions)"}, { 5, {0x68, 0x71, 0x72, 0x73, 0x74}, 0, "68 XX XX XX XX : push imm32"}, { 5, {0xb8, 0x71, 0x72, 0x73, 0x74}, 0, "b8 XX XX XX XX : mov eax, XX XX XX XX"}, @@ -881,17 +885,26 @@ const struct InstructionSizeData { { 2, {0x66, 0x90}, 0, "66 90 : Two-byte NOP"}, { 2, {0x84, 0xc0}, 0, "84 c0 : test al, al"}, { 2, {0x8a, 0x01}, 0, "8a 01 : mov al, byte ptr [rcx]"}, + { 3, {0x0f, 0xb6, 0x01}, 0, "0f b6 01 : movzx eax, BYTE PTR [rcx]"}, + { 3, {0x0f, 0xb6, 0x09}, 0, "0f b6 09 : movzx ecx, BYTE PTR [rcx]"}, + 
{ 3, {0x0f, 0xb6, 0x11}, 0, "0f b6 11 : movzx edx, BYTE PTR [rcx]"}, { 3, {0x0f, 0xb6, 0xc2}, 0, "0f b6 c2 : movzx eax, dl"}, { 3, {0x0f, 0xb6, 0xd2}, 0, "0f b6 d2 : movzx edx, dl"}, { 3, {0x0f, 0xb7, 0x10}, 0, "0f b7 10 : movzx edx, WORD PTR [rax]"}, + { 3, {0x0f, 0xbe, 0xd2}, 0, "0f be d2 : movsx edx, dl"}, { 3, {0x41, 0x8b, 0xc0}, 0, "41 8b c0 : mov eax, r8d"}, { 3, {0x41, 0x8b, 0xc1}, 0, "41 8b c1 : mov eax, r9d"}, { 3, {0x41, 0x8b, 0xc2}, 0, "41 8b c2 : mov eax, r10d"}, { 3, {0x41, 0x8b, 0xc3}, 0, "41 8b c3 : mov eax, r11d"}, { 3, {0x41, 0x8b, 0xc4}, 0, "41 8b c4 : mov eax, r12d"}, + { 3, {0x45, 0x31, 0xc0}, 0, "45 31 c0 : xor r8d,r8d"}, + { 3, {0x45, 0x31, 0xc9}, 0, "45 31 c9 : xor r9d,r9d"}, { 3, {0x45, 0x33, 0xc0}, 0, "45 33 c0 : xor r8d, r8d"}, { 3, {0x45, 0x33, 0xc9}, 0, "45 33 c9 : xor r9d, r9d"}, { 3, {0x45, 0x33, 0xdb}, 0, "45 33 db : xor r11d, r11d"}, + { 3, {0x45, 0x84, 0xc0}, 0, "45 84 c0 : test r8b,r8b"}, + { 3, {0x45, 0x84, 0xd2}, 0, "45 84 d2 : test r10b,r10b"}, + { 3, {0x48, 0x29, 0xd1}, 0, "48 29 d1 : sub rcx, rdx"}, { 3, {0x48, 0x2b, 0xca}, 0, "48 2b ca : sub rcx, rdx"}, { 3, {0x48, 0x2b, 0xd1}, 0, "48 2b d1 : sub rdx, rcx"}, { 3, {0x48, 0x3b, 0xca}, 0, "48 3b ca : cmp rcx, rdx"}, @@ -901,6 +914,11 @@ const struct InstructionSizeData { { 3, {0x48, 0x85, 0xdb}, 0, "48 85 db : test rbx, rbx"}, { 3, {0x48, 0x85, 0xe4}, 0, "48 85 e4 : test rsp, rsp"}, { 3, {0x48, 0x85, 0xed}, 0, "48 85 ed : test rbp, rbp"}, + { 3, {0x48, 0x89, 0xc8}, 0, "48 89 c8 : mov rax,rcx"}, + { 3, {0x48, 0x89, 0xcb}, 0, "48 89 cb : mov rbx,rcx"}, + { 3, {0x48, 0x89, 0xd0}, 0, "48 89 d0 : mov rax,rdx"}, + { 3, {0x48, 0x89, 0xd1}, 0, "48 89 d1 : mov rcx,rdx"}, + { 3, {0x48, 0x89, 0xd3}, 0, "48 89 d3 : mov rbx,rdx"}, { 3, {0x48, 0x89, 0xe5}, 0, "48 89 e5 : mov rbp, rsp"}, { 3, {0x48, 0x8b, 0xc1}, 0, "48 8b c1 : mov rax, rcx"}, { 3, {0x48, 0x8b, 0xc4}, 0, "48 8b c4 : mov rax, rsp"}, @@ -912,6 +930,13 @@ const struct InstructionSizeData { { 3, {0x48, 0xff, 0xc3}, 0, "48 ff c3 : inc rbx"}, { 3, {0x48, 0xff, 0xc6}, 0, "48 ff c6 : inc rsi"}, { 3, {0x48, 0xff, 0xc7}, 0, "48 ff c7 : inc rdi"}, + { 3, {0x49, 0x89, 0xc8}, 0, "49 89 c8 : mov r8, rcx"}, + { 3, {0x49, 0x89, 0xc9}, 0, "49 89 c9 : mov r9, rcx"}, + { 3, {0x49, 0x89, 0xca}, 0, "49 89 ca : mov r10,rcx"}, + { 3, {0x49, 0x89, 0xd0}, 0, "49 89 d0 : mov r8, rdx"}, + { 3, {0x49, 0x89, 0xd1}, 0, "49 89 d1 : mov r9, rdx"}, + { 3, {0x49, 0x89, 0xd2}, 0, "49 89 d2 : mov r10, rdx"}, + { 3, {0x49, 0x89, 0xd3}, 0, "49 89 d3 : mov r11, rdx"}, { 3, {0x49, 0xff, 0xc0}, 0, "49 ff c0 : inc r8"}, { 3, {0x49, 0xff, 0xc1}, 0, "49 ff c1 : inc r9"}, { 3, {0x49, 0xff, 0xc2}, 0, "49 ff c2 : inc r10"}, diff --git compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp index e3f3d12d7e52..34c2d4cb37fd 100644 --- compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp +++ compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp @@ -302,6 +302,9 @@ INTERCEPTOR(int, fpurge, FILE *stream) { __rtsan_notify_intercepted_call("fpurge"); return REAL(fpurge)(stream); } +#define RTSAN_MAYBE_INTERCEPT_FPURGE INTERCEPT_FUNCTION(fpurge) +#else +#define RTSAN_MAYBE_INTERCEPT_FPURGE #endif INTERCEPTOR(FILE *, fdopen, int fd, const char *mode) { @@ -829,6 +832,28 @@ INTERCEPTOR(int, getnameinfo, const struct sockaddr *sa, socklen_t salen, return REAL(getnameinfo)(sa, salen, host, hostlen, serv, servlen, flags); } +#if SANITIZER_INTERCEPT_GETSOCKNAME +INTERCEPTOR(int, getsockname, int socket, struct sockaddr *sa, + socklen_t *salen) { + 
__rtsan_notify_intercepted_call("getsockname"); + return REAL(getsockname)(socket, sa, salen); +} +#define RTSAN_MAYBE_INTERCEPT_GETSOCKNAME INTERCEPT_FUNCTION(getsockname) +#else +#define RTSAN_MAYBE_INTERCEPT_GETSOCKNAME +#endif + +#if SANITIZER_INTERCEPT_GETPEERNAME +INTERCEPTOR(int, getpeername, int socket, struct sockaddr *sa, + socklen_t *salen) { + __rtsan_notify_intercepted_call("getpeername"); + return REAL(getpeername)(socket, sa, salen); +} +#define RTSAN_MAYBE_INTERCEPT_GETPEERNAME INTERCEPT_FUNCTION(getpeername) +#else +#define RTSAN_MAYBE_INTERCEPT_GETPEERNAME +#endif + INTERCEPTOR(int, bind, int socket, const struct sockaddr *address, socklen_t address_len) { __rtsan_notify_intercepted_call("bind"); @@ -868,6 +893,17 @@ INTERCEPTOR(ssize_t, sendmsg, int socket, const struct msghdr *message, return REAL(sendmsg)(socket, message, flags); } +#if SANITIZER_INTERCEPT_SENDMMSG +INTERCEPTOR(int, sendmmsg, int socket, struct mmsghdr *message, + unsigned int len, int flags) { + __rtsan_notify_intercepted_call("sendmmsg"); + return REAL(sendmmsg)(socket, message, len, flags); +} +#define RTSAN_MAYBE_INTERCEPT_SENDMMSG INTERCEPT_FUNCTION(sendmmsg) +#else +#define RTSAN_MAYBE_INTERCEPT_SENDMMSG +#endif + INTERCEPTOR(ssize_t, sendto, int socket, const void *buffer, size_t length, int flags, const struct sockaddr *dest_addr, socklen_t dest_len) { __rtsan_notify_intercepted_call("sendto"); @@ -890,6 +926,17 @@ INTERCEPTOR(ssize_t, recvmsg, int socket, struct msghdr *message, int flags) { return REAL(recvmsg)(socket, message, flags); } +#if SANITIZER_INTERCEPT_RECVMMSG +INTERCEPTOR(int, recvmmsg, int socket, struct mmsghdr *message, + unsigned int len, int flags, struct timespec *timeout) { + __rtsan_notify_intercepted_call("recvmmsg"); + return REAL(recvmmsg)(socket, message, len, flags, timeout); +} +#define RTSAN_MAYBE_INTERCEPT_RECVMMSG INTERCEPT_FUNCTION(recvmmsg) +#else +#define RTSAN_MAYBE_INTERCEPT_RECVMMSG +#endif + INTERCEPTOR(int, shutdown, int socket, int how) { __rtsan_notify_intercepted_call("shutdown"); return REAL(shutdown)(socket, how); @@ -1020,6 +1067,16 @@ INTERCEPTOR(int, pipe, int pipefd[2]) { return REAL(pipe)(pipefd); } +#if !SANITIZER_APPLE +INTERCEPTOR(int, pipe2, int pipefd[2], int flags) { + __rtsan_notify_intercepted_call("pipe2"); + return REAL(pipe2)(pipefd, flags); +} +#define RTSAN_MAYBE_INTERCEPT_PIPE2 INTERCEPT_FUNCTION(pipe2) +#else +#define RTSAN_MAYBE_INTERCEPT_PIPE2 +#endif + INTERCEPTOR(int, mkfifo, const char *pathname, mode_t mode) { __rtsan_notify_intercepted_call("mkfifo"); return REAL(mkfifo)(pathname, mode); @@ -1122,6 +1179,8 @@ void __rtsan::InitializeInterceptors() { INTERCEPT_FUNCTION(puts); INTERCEPT_FUNCTION(fputs); INTERCEPT_FUNCTION(fflush); + RTSAN_MAYBE_INTERCEPT_FPURGE; + RTSAN_MAYBE_INTERCEPT_PIPE2; INTERCEPT_FUNCTION(fdopen); INTERCEPT_FUNCTION(freopen); RTSAN_MAYBE_INTERCEPT_FOPENCOOKIE; @@ -1183,12 +1242,16 @@ void __rtsan::InitializeInterceptors() { INTERCEPT_FUNCTION(recv); INTERCEPT_FUNCTION(recvfrom); INTERCEPT_FUNCTION(recvmsg); + RTSAN_MAYBE_INTERCEPT_RECVMMSG; INTERCEPT_FUNCTION(send); INTERCEPT_FUNCTION(sendmsg); + RTSAN_MAYBE_INTERCEPT_SENDMMSG; INTERCEPT_FUNCTION(sendto); INTERCEPT_FUNCTION(shutdown); INTERCEPT_FUNCTION(socket); RTSAN_MAYBE_INTERCEPT_ACCEPT4; + RTSAN_MAYBE_INTERCEPT_GETSOCKNAME; + RTSAN_MAYBE_INTERCEPT_GETPEERNAME; RTSAN_MAYBE_INTERCEPT_SELECT; INTERCEPT_FUNCTION(pselect); diff --git compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp 
compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp index c26643c6a2d6..c858a5a771fe 100644 --- compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp +++ compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp @@ -1118,6 +1118,15 @@ TEST(TestRtsanInterceptors, SendmsgToASocketDiesWhenRealtime) { ExpectNonRealtimeSurvival(Func); } +#if SANITIZER_INTERCEPT_SENDMMSG +TEST(TestRtsanInterceptors, SendmmsgOnASocketDiesWhenRealtime) { + mmsghdr msg{}; + auto Func = [&]() { sendmmsg(0, &msg, 0, 0); }; + ExpectRealtimeDeath(Func, "sendmmsg"); + ExpectNonRealtimeSurvival(Func); +} +#endif + TEST(TestRtsanInterceptors, SendtoToASocketDiesWhenRealtime) { sockaddr addr{}; socklen_t len{}; @@ -1147,12 +1156,41 @@ TEST(TestRtsanInterceptors, RecvmsgOnASocketDiesWhenRealtime) { ExpectNonRealtimeSurvival(Func); } +#if SANITIZER_INTERCEPT_RECVMMSG +TEST(TestRtsanInterceptors, RecvmmsgOnASocketDiesWhenRealtime) { + mmsghdr msg{}; + auto Func = [&]() { recvmmsg(0, &msg, 0, 0, nullptr); }; + ExpectRealtimeDeath(Func, "recvmmsg"); + ExpectNonRealtimeSurvival(Func); +} +#endif + TEST(TestRtsanInterceptors, ShutdownOnASocketDiesWhenRealtime) { auto Func = [&]() { shutdown(0, 0); }; ExpectRealtimeDeath(Func, "shutdown"); ExpectNonRealtimeSurvival(Func); } +#if SANITIZER_INTERCEPT_GETSOCKNAME +TEST(TestRtsanInterceptors, GetsocknameOnASocketDiesWhenRealtime) { + sockaddr addr{}; + socklen_t len{}; + auto Func = [&]() { getsockname(0, &addr, &len); }; + ExpectRealtimeDeath(Func, "getsockname"); + ExpectNonRealtimeSurvival(Func); +} +#endif + +#if SANITIZER_INTERCEPT_GETPEERNAME +TEST(TestRtsanInterceptors, GetpeernameOnASocketDiesWhenRealtime) { + sockaddr addr{}; + socklen_t len{}; + auto Func = [&]() { getpeername(0, &addr, &len); }; + ExpectRealtimeDeath(Func, "getpeername"); + ExpectNonRealtimeSurvival(Func); +} +#endif + /* I/O Multiplexing */ @@ -1339,6 +1377,15 @@ TEST(TestRtsanInterceptors, PipeDiesWhenRealtime) { ExpectNonRealtimeSurvival(Func); } +#if !SANITIZER_APPLE +TEST(TestRtsanInterceptors, Pipe2DiesWhenRealtime) { + int fds[2]; + auto Func = [&fds]() { pipe2(fds, O_CLOEXEC); }; + ExpectRealtimeDeath(Func, "pipe2"); + ExpectNonRealtimeSurvival(Func); +} +#endif + #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wdeprecated-declarations" TEST(TestRtsanInterceptors, SyscallDiesWhenRealtime) { diff --git compiler-rt/test/orc/TestCases/Linux/loongarch64/ehframe-default.cpp compiler-rt/test/orc/TestCases/Linux/loongarch64/ehframe-default.cpp new file mode 100644 index 000000000000..0f7dcec4b5a5 --- /dev/null +++ compiler-rt/test/orc/TestCases/Linux/loongarch64/ehframe-default.cpp @@ -0,0 +1,14 @@ +// RUN: %clangxx -fexceptions -fPIC -c -o %t %s +// RUN: %llvm_jitlink %t + +extern "C" void llvm_jitlink_setTestResultOverride(long Value); + +int main(int argc, char *argv[]) { + llvm_jitlink_setTestResultOverride(1); + try { + throw 0; + } catch (int X) { + llvm_jitlink_setTestResultOverride(X); + } + return 0; +} diff --git compiler-rt/test/orc/TestCases/Linux/loongarch64/ehframe-libunwind.cpp compiler-rt/test/orc/TestCases/Linux/loongarch64/ehframe-libunwind.cpp new file mode 100644 index 000000000000..f56aa8fba950 --- /dev/null +++ compiler-rt/test/orc/TestCases/Linux/loongarch64/ehframe-libunwind.cpp @@ -0,0 +1,15 @@ +// REQUIRES: libunwind-available +// RUN: %clangxx -fexceptions -fPIC -c -o %t %s +// RUN: env LD_PRELOAD=%shared_libunwind %llvm_jitlink %t + +extern "C" void llvm_jitlink_setTestResultOverride(long Value); + +int main(int argc, char *argv[]) { + 
llvm_jitlink_setTestResultOverride(1); + try { + throw 0; + } catch (int X) { + llvm_jitlink_setTestResultOverride(X); + } + return 0; +} diff --git compiler-rt/test/orc/TestCases/Linux/loongarch64/lit.local.cfg.py compiler-rt/test/orc/TestCases/Linux/loongarch64/lit.local.cfg.py new file mode 100644 index 000000000000..b8a5a418ec25 --- /dev/null +++ compiler-rt/test/orc/TestCases/Linux/loongarch64/lit.local.cfg.py @@ -0,0 +1,5 @@ +if config.root.host_arch != "loongarch64": + config.unsupported = True + +if config.target_arch != "loongarch64": + config.unsupported = True diff --git compiler-rt/test/orc/TestCases/Linux/loongarch64/lljit-ehframe.cpp compiler-rt/test/orc/TestCases/Linux/loongarch64/lljit-ehframe.cpp new file mode 100644 index 000000000000..b73ec2387028 --- /dev/null +++ compiler-rt/test/orc/TestCases/Linux/loongarch64/lljit-ehframe.cpp @@ -0,0 +1,15 @@ +// RUN: %clangxx -fPIC -emit-llvm -c -o %t %s +// RUN: %lli_orc_jitlink -relocation-model=pic %t | FileCheck %s + +// CHECK: catch + +#include <stdio.h> + +int main(int argc, char *argv[]) { + try { + throw 0; + } catch (int X) { + puts("catch"); + } + return 0; +} diff --git compiler-rt/test/orc/TestCases/Linux/loongarch64/lljit-initialize-deinitialize.ll compiler-rt/test/orc/TestCases/Linux/loongarch64/lljit-initialize-deinitialize.ll new file mode 100644 index 000000000000..34bfc10b9d89 --- /dev/null +++ compiler-rt/test/orc/TestCases/Linux/loongarch64/lljit-initialize-deinitialize.ll @@ -0,0 +1,32 @@ +; RUN: %lli_orc_jitlink %s | FileCheck %s + +; CHECK: constructor +; CHECK-NEXT: main +; CHECK-NEXT: destructor + +@__dso_handle = external hidden global i8 +@.str = private unnamed_addr constant [5 x i8] c"main\00", align 1 +@.str.1 = private unnamed_addr constant [12 x i8] c"constructor\00", align 1 +@.str.2 = private unnamed_addr constant [11 x i8] c"destructor\00", align 1 +@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @constructor, i8* null }] + +define dso_local void @destructor(i8* %0) { + %2 = tail call i32 @puts(i8* nonnull dereferenceable(1) getelementptr inbounds ([11 x i8], [11 x i8]* @.str.2, i64 0, i64 0)) + ret void +} + +declare i32 @__cxa_atexit(void (i8*)*, i8*, i8*) + +; Function Attrs: nofree norecurse nounwind uwtable +define dso_local i32 @main(i32 %0, i8** nocapture readnone %1) local_unnamed_addr #2 { + %3 = tail call i32 @puts(i8* nonnull dereferenceable(1) getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0)) + ret i32 0 +} + +declare i32 @puts(i8* nocapture readonly) + +define internal void @constructor() { + %1 = tail call i32 @puts(i8* nonnull dereferenceable(1) getelementptr inbounds ([12 x i8], [12 x i8]* @.str.1, i64 0, i64 0)) #5 + %2 = tail call i32 @__cxa_atexit(void (i8*)* @destructor, i8* null, i8* nonnull @__dso_handle) #5 + ret void +} diff --git compiler-rt/test/orc/TestCases/Linux/loongarch64/priority-static-initializer.S compiler-rt/test/orc/TestCases/Linux/loongarch64/priority-static-initializer.S new file mode 100644 index 000000000000..9e2ce7a20a1f --- /dev/null +++ compiler-rt/test/orc/TestCases/Linux/loongarch64/priority-static-initializer.S @@ -0,0 +1,126 @@ +// Test that ELF static initializers with different constructor priorities work +// and are executed in the proper order. 
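For orientation, a minimal C++ sketch of the program this assembly encodes follows; the names mirror the assembly and CHECK lines, and the sketch is not part of the test itself. Note that constructor priorities 0 to 100 are reserved for the implementation, so compilers may warn about the first attribute; 100 is kept only to match the test's .init_array.100 section.

#include <cstdio>

// The lowest priority number runs first among the .init_array.N sections.
__attribute__((constructor(100))) static void constructor100() {
  std::puts("constructor 100");
}
__attribute__((constructor(200))) static void constructor200() {
  std::puts("constructor 200");
}
// No explicit priority means the default 65535, which runs last.
__attribute__((constructor)) static void constructor65535() {
  std::puts("constructor 65535");
}
// The assembly registers this via __cxa_atexit from the default-priority
// constructor; a destructor attribute gives the same observable order.
__attribute__((destructor)) static void destructor() {
  std::puts("destructor");
}

int main() {
  std::puts("main");
  return 0;
}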
+// +// RUN: %clang -c -o %t %s +// RUN: %llvm_jitlink %t | FileCheck %s + +// CHECK: constructor 100 +// CHECK-NEXT: constructor 200 +// CHECK-NEXT: constructor 65535 +// CHECK-NEXT: main +// CHECK-NEXT: destructor + + .text + .globl destructor + .p2align 2 + .type destructor,@function +destructor: +.Ldestructor$local: + + pcalau12i $a0, %pc_hi20(.L.str.2) + addi.d $a0, $a0, %pc_lo12(.L.str.2) + b %plt(puts) + + .globl main + .p2align 2 + .type main,@function +main: + + addi.d $sp, $sp, -16 + st.d $ra, $sp, 8 # 8-byte Folded Spill + pcalau12i $a0, %pc_hi20(.L.str) + addi.d $a0, $a0, %pc_lo12(.L.str) + bl %plt(puts) + move $a0, $zero + ld.d $ra, $sp, 8 # 8-byte Folded Reload + addi.d $sp, $sp, 16 + ret + + .p2align 2 + .type constructor.65535,@function +constructor.65535: + + addi.d $sp, $sp, -16 + st.d $ra, $sp, 8 # 8-byte Folded Spill + pcalau12i $a0, %pc_hi20(.L.str.65535) + addi.d $a0, $a0, %pc_lo12(.L.str.65535) + bl %plt(puts) + pcalau12i $a0, %got_pc_hi20(__dso_handle) + ld.d $a0, $a0, %got_pc_lo12(__dso_handle) + ld.d $a2, $a0, 0 + pcalau12i $a0, %pc_hi20(.Ldestructor$local) + addi.d $a0, $a0, %pc_lo12(.Ldestructor$local) + move $a1, $zero + ld.d $ra, $sp, 8 # 8-byte Folded Reload + addi.d $sp, $sp, 16 + b %plt(__cxa_atexit) + + .p2align 2 + .type constructor.100,@function +constructor.100: + + addi.d $sp, $sp, -16 + st.d $ra, $sp, 8 # 8-byte Folded Spill + st.d $fp, $sp, 0 # 8-byte Folded Spill + addi.d $fp, $sp, 16 + pcalau12i $a0, %pc_hi20(.L.str.100) + addi.d $a0, $a0, %pc_lo12(.L.str.100) + bl %plt(puts) + ld.d $fp, $sp, 0 # 8-byte Folded Reload + ld.d $ra, $sp, 8 # 8-byte Folded Reload + addi.d $sp, $sp, 16 + ret + + .p2align 2 + .type constructor.200,@function +constructor.200: + + addi.d $sp, $sp, -16 + st.d $ra, $sp, 8 # 8-byte Folded Spill + st.d $fp, $sp, 0 # 8-byte Folded Spill + addi.d $fp, $sp, 16 + pcalau12i $a0, %pc_hi20(.L.str.200) + addi.d $a0, $a0, %pc_lo12(.L.str.200) + bl %plt(puts) + ld.d $fp, $sp, 0 # 8-byte Folded Reload + ld.d $ra, $sp, 8 # 8-byte Folded Reload + addi.d $sp, $sp, 16 + ret + + .hidden __dso_handle + .type .L.str,@object + .section .rodata.str1.1,"aMS",@progbits,1 +.L.str: + .asciz "main" + .size .L.str, 5 + + .type .L.str.100,@object +.L.str.100: + .asciz "constructor 100" + .size .L.str.100, 16 + + .type .L.str.200,@object +.L.str.200: + .asciz "constructor 200" + .size .L.str.200, 16 + + .type .L.str.65535,@object +.L.str.65535: + .asciz "constructor 65535" + .size .L.str.65535, 18 + + + .type .L.str.2,@object +.L.str.2: + .asciz "destructor" + .size .L.str.2, 11 + + .section .init_array.100,"aw",@init_array + .p2align 3 + .dword constructor.100 + .section .init_array.200,"aw",@init_array + .p2align 3 + .dword constructor.200 + .section .init_array,"aw",@init_array + .p2align 3 + .dword constructor.65535 diff --git compiler-rt/test/orc/TestCases/Linux/loongarch64/trivial-atexit.S compiler-rt/test/orc/TestCases/Linux/loongarch64/trivial-atexit.S new file mode 100644 index 000000000000..38a388b19ba4 --- /dev/null +++ compiler-rt/test/orc/TestCases/Linux/loongarch64/trivial-atexit.S @@ -0,0 +1,42 @@ +// Test that the runtime correctly interposes atexit. +// +// REQUIRES: disabled +// This test is disabled until a proper atexit interpose can be implemented: +// the current one assumes that atexit is defined in the dylib that calls it, +// which is not true in general. 
See +// https://github.com/llvm/llvm-project/issues/74641 +// +// RUN: %clang -c -o %t %s +// RUN: %llvm_jitlink %t + + .text +// OnExit destructor resets the test result override to zero. + .globl on_exit + .p2align 2 + .type on_exit,@function +on_exit: + + move $a0, $zero + b %plt(llvm_jitlink_setTestResultOverride) +.Lfunc_end0: + .size on_exit, .Lfunc_end0-on_exit + +// main registers the atexit handler and sets the test result to one. + .globl main + .p2align 2 + .type main,@function +main: + + addi.d $sp, $sp, -16 + st.d $ra, $sp, 8 # 8-byte Folded Spill + pcalau12i $a0, %pc_hi20(on_exit) + addi.d $a0, $a0, %pc_lo12(on_exit) + bl %plt(atexit) + ori $a0, $zero, 1 + bl %plt(llvm_jitlink_setTestResultOverride) + move $a0, $zero + ld.d $ra, $sp, 8 # 8-byte Folded Reload + addi.d $sp, $sp, 16 + ret +.Lfunc_end1: + .size main, .Lfunc_end1-main diff --git compiler-rt/test/orc/TestCases/Linux/loongarch64/trivial-cxa-atexit.S compiler-rt/test/orc/TestCases/Linux/loongarch64/trivial-cxa-atexit.S new file mode 100644 index 000000000000..f2a6a7624cdb --- /dev/null +++ compiler-rt/test/orc/TestCases/Linux/loongarch64/trivial-cxa-atexit.S @@ -0,0 +1,40 @@ +// Test that the runtime correctly interposes __cxa_atexit. +// +// RUN: %clang -c -o %t %s +// RUN: %llvm_jitlink %t + + .text +// Destructor resets the test result override to zero. + .globl on_exit + .p2align 2 + .type on_exit,@function +on_exit: + + move $a0, $zero + b %plt(llvm_jitlink_setTestResultOverride) +.Lfunc_end0: + .size on_exit, .Lfunc_end0-on_exit + +// main registers the destructor via __cxa_atexit and sets the test result to one. + .globl main + .p2align 2 + .type main,@function +main: + + addi.d $sp, $sp, -16 + st.d $ra, $sp, 8 # 8-byte Folded Spill + pcalau12i $a0, %got_pc_hi20(__dso_handle) + ld.d $a0, $a0, %got_pc_lo12(__dso_handle) + ld.d $a2, $a0, 0 + pcalau12i $a0, %pc_hi20(on_exit) + addi.d $a0, $a0, %pc_lo12(on_exit) + move $a1, $zero + bl %plt(__cxa_atexit) + ori $a0, $zero, 1 + bl %plt(llvm_jitlink_setTestResultOverride) + move $a0, $zero + ld.d $ra, $sp, 8 # 8-byte Folded Reload + addi.d $sp, $sp, 16 + ret +.Lfunc_end1: + .size main, .Lfunc_end1-main diff --git compiler-rt/test/orc/TestCases/Linux/loongarch64/trivial-static-initializer.S compiler-rt/test/orc/TestCases/Linux/loongarch64/trivial-static-initializer.S new file mode 100644 index 000000000000..20fa0cc2bcd4 --- /dev/null +++ compiler-rt/test/orc/TestCases/Linux/loongarch64/trivial-static-initializer.S @@ -0,0 +1,45 @@ +// Test that basic ELF static initializers work. The main function in this +// test returns the value of 'x', which is initially 1 in the data section, +// and is reset to 0 if the static_init function is run. If the static initializer +// does not run then main will return 1, causing the test to be treated as a +// failure. +// +// RUN: %clang -c -o %t %s +// RUN: %llvm_jitlink %t + + .text + .globl main + .p2align 2 + .type main,@function +main: + + pcalau12i $a0, %pc_hi20(x) + ld.w $a0, $a0, %pc_lo12(x) + ret +.Lfunc_end0: + .size main, .Lfunc_end0-main + +// The static initializer sets the value of 'x' to zero.
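In C++ terms, the whole test, including the initializer implemented in the section that follows, reduces to roughly the sketch below; it is an illustration only (names mirror the assembly symbols), not the test source:

// 'x' starts as 1 in .data; the static initializer zeroes it before main runs.
static int x = 1;

__attribute__((constructor)) static void static_init() { x = 0; }

// The JIT'd main's return value is the test result: 0 means success, so the
// test fails exactly when the initializer did not run.
int main() { return x; }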
+ + .section .text.startup,"ax",@progbits + .p2align 2 + .type static_init,@function +static_init: + + pcalau12i $a0, %pc_hi20(x) + st.w $zero, $a0, %pc_lo12(x) + ret +.Lfunc_end1: + .size static_init, .Lfunc_end1-static_init + + .type x,@object + .data + .globl x + .p2align 2 +x: + .word 1 + .size x, 4 + + .section .init_array,"aw",@init_array + .p2align 3 + .dword static_init diff --git flang/include/flang/Lower/LoweringOptions.def flang/include/flang/Lower/LoweringOptions.def index 5a6debfdffe0..396c91948be3 100644 --- flang/include/flang/Lower/LoweringOptions.def +++ flang/include/flang/Lower/LoweringOptions.def @@ -44,5 +44,8 @@ ENUM_LOWERINGOPT(IntegerWrapAround, unsigned, 1, 0) /// If false, assume that the shapes/types/allocation-status match. ENUM_LOWERINGOPT(ReallocateLHS, unsigned, 1, 1) +/// If true, initialize globals that lack an explicit initializer to zero. +/// On by default. +ENUM_LOWERINGOPT(InitGlobalZero, unsigned, 1, 1) #undef LOWERINGOPT #undef ENUM_LOWERINGOPT diff --git flang/include/flang/Optimizer/Builder/HLFIRTools.h flang/include/flang/Optimizer/Builder/HLFIRTools.h index 6e85b8f4ddf8..0684ad0f926e 100644 --- flang/include/flang/Optimizer/Builder/HLFIRTools.h +++ flang/include/flang/Optimizer/Builder/HLFIRTools.h @@ -513,6 +513,12 @@ genTypeAndKindConvert(mlir::Location loc, fir::FirOpBuilder &builder, Entity loadElementAt(mlir::Location loc, fir::FirOpBuilder &builder, Entity entity, mlir::ValueRange oneBasedIndices); +/// Return a vector of extents for the given entity. +/// The function creates new operations, but tries to clean up +/// after itself. +llvm::SmallVector<mlir::Value, Fortran::common::maxRank> +genExtentsVector(mlir::Location loc, fir::FirOpBuilder &builder, Entity entity); + } // namespace hlfir #endif // FORTRAN_OPTIMIZER_BUILDER_HLFIRTOOLS_H diff --git flang/include/flang/Optimizer/Dialect/FIRType.h flang/include/flang/Optimizer/Dialect/FIRType.h index 78257ab70308..e19fcde8d0e6 100644 --- flang/include/flang/Optimizer/Dialect/FIRType.h +++ flang/include/flang/Optimizer/Dialect/FIRType.h @@ -139,6 +139,13 @@ inline bool isa_builtin_cptr_type(mlir::Type t) { return false; } +/// Is `t` type(c_devptr)? +inline bool isa_builtin_c_devptr_type(mlir::Type t) { + if (auto recTy = mlir::dyn_cast_or_null<fir::RecordType>(t)) + return recTy.getName().ends_with("T__builtin_c_devptr"); + return false; +} + /// Is `t` type(c_devptr)? inline bool isa_builtin_cdevptr_type(mlir::Type t) { if (auto recTy = mlir::dyn_cast_or_null<fir::RecordType>(t)) diff --git flang/lib/Frontend/CompilerInvocation.cpp flang/lib/Frontend/CompilerInvocation.cpp index 15b1e1e0a248..3c6da4687f65 100644 --- flang/lib/Frontend/CompilerInvocation.cpp +++ flang/lib/Frontend/CompilerInvocation.cpp @@ -1377,6 +1377,14 @@ bool CompilerInvocation::createFromArgs( invoc.loweringOpts.setNoPPCNativeVecElemOrder(true); } + // -f[no-]init-global-zero + if (args.hasFlag(clang::driver::options::OPT_finit_global_zero, clang::driver::options::OPT_fno_init_global_zero, /*default=*/true)) + invoc.loweringOpts.setInitGlobalZero(true); + else + invoc.loweringOpts.setInitGlobalZero(false); + // Preserve all the remark options requested, i.e. -Rpass, -Rpass-missed or // -Rpass-analysis. This will be used later when processing and outputting the // remarks generated by LLVM in ExecuteCompilerInvocation.cpp.
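The hasFlag call above resolves a positive/negative flag pair with last-one-wins semantics. A standalone sketch of that behavior for this specific flag pair follows; initGlobalZero is a hypothetical helper written for illustration, not the llvm::opt implementation:

#include <cassert>
#include <string>
#include <vector>

// Effective value of -f[no-]init-global-zero: the last occurrence on the
// command line wins; when neither spelling is given, the default is true.
static bool initGlobalZero(const std::vector<std::string> &args) {
  bool value = true; // default
  for (const std::string &arg : args) {
    if (arg == "-finit-global-zero")
      value = true;
    else if (arg == "-fno-init-global-zero")
      value = false;
  }
  return value;
}

int main() {
  assert(initGlobalZero({}));
  assert(!initGlobalZero({"-fno-init-global-zero"}));
  assert(initGlobalZero({"-fno-init-global-zero", "-finit-global-zero"}));
  return 0;
}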
diff --git flang/lib/Lower/ConvertVariable.cpp flang/lib/Lower/ConvertVariable.cpp index 9ee42d5cd880..87236dc293eb 100644 --- flang/lib/Lower/ConvertVariable.cpp +++ flang/lib/Lower/ConvertVariable.cpp @@ -635,7 +635,11 @@ static fir::GlobalOp defineGlobal(Fortran::lower::AbstractConverter &converter, global.setLinkName(builder.createCommonLinkage()); Fortran::lower::createGlobalInitialization( builder, global, [&](fir::FirOpBuilder &builder) { - mlir::Value initValue = builder.create<fir::ZeroOp>(loc, symTy); + mlir::Value initValue; + if (converter.getLoweringOptions().getInitGlobalZero()) + initValue = builder.create<fir::ZeroOp>(loc, symTy); + else + initValue = builder.create<fir::UndefOp>(loc, symTy); builder.create<fir::HasValueOp>(loc, initValue); }); } diff --git flang/lib/Lower/OpenMP/OpenMP.cpp flang/lib/Lower/OpenMP/OpenMP.cpp index 52541bb91481..39b4de919c8b 100644 --- flang/lib/Lower/OpenMP/OpenMP.cpp +++ flang/lib/Lower/OpenMP/OpenMP.cpp @@ -3341,6 +3341,7 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, !std::holds_alternative<clause::UseDevicePtr>(clause.u) && !std::holds_alternative<clause::InReduction>(clause.u) && !std::holds_alternative<clause::Mergeable>(clause.u) && + !std::holds_alternative<clause::Untied>(clause.u) && !std::holds_alternative<clause::TaskReduction>(clause.u) && !std::holds_alternative<clause::Detach>(clause.u)) { std::string name = diff --git flang/lib/Optimizer/Builder/FIRBuilder.cpp flang/lib/Optimizer/Builder/FIRBuilder.cpp index ad1244ef99b4..64c540cfb95a 100644 --- flang/lib/Optimizer/Builder/FIRBuilder.cpp +++ flang/lib/Optimizer/Builder/FIRBuilder.cpp @@ -1401,6 +1401,10 @@ static void genComponentByComponentAssignment(fir::FirOpBuilder &builder, /// Can the assignment of this record type be implemented with a simple memory /// copy (it requires no deep copy or user-defined assignment of components)? static bool recordTypeCanBeMemCopied(fir::RecordType recordType) { + // The c_devptr type is a special case. It has a nested c_ptr field, but we know it + // can be copied directly.
+ if (fir::isa_builtin_c_devptr_type(recordType)) + return true; if (fir::hasDynamicSize(recordType)) return false; for (auto [_, fieldType] : recordType.getTypeList()) { diff --git flang/lib/Optimizer/Builder/HLFIRTools.cpp flang/lib/Optimizer/Builder/HLFIRTools.cpp index 5e5d0bbd6813..f71adf123511 100644 --- flang/lib/Optimizer/Builder/HLFIRTools.cpp +++ flang/lib/Optimizer/Builder/HLFIRTools.cpp @@ -1421,3 +1421,15 @@ hlfir::Entity hlfir::loadElementAt(mlir::Location loc, return loadTrivialScalar(loc, builder, getElementAt(loc, builder, entity, oneBasedIndices)); } + +llvm::SmallVector<mlir::Value, Fortran::common::maxRank> +hlfir::genExtentsVector(mlir::Location loc, fir::FirOpBuilder &builder, + hlfir::Entity entity) { + entity = hlfir::derefPointersAndAllocatables(loc, builder, entity); + mlir::Value shape = hlfir::genShape(loc, builder, entity); + llvm::SmallVector<mlir::Value, Fortran::common::maxRank> extents = + hlfir::getExplicitExtentsFromShape(shape, builder); + if (shape.getUses().empty()) + shape.getDefiningOp()->erase(); + return extents; +} diff --git flang/lib/Optimizer/Dialect/FIRType.cpp flang/lib/Optimizer/Dialect/FIRType.cpp index d8ce231d1b5a..0b57a10a6c49 100644 --- flang/lib/Optimizer/Dialect/FIRType.cpp +++ flang/lib/Optimizer/Dialect/FIRType.cpp @@ -210,6 +210,7 @@ mlir::Type getDerivedType(mlir::Type ty) { return seq.getEleTy(); return p.getEleTy(); }) + .Case<fir::BoxType>([](auto p) { return getDerivedType(p.getEleTy()); }) .Default([](mlir::Type t) { return t; }); } diff --git flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp index 0fe3620b7f1a..fe7ae0eeed3c 100644 --- flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp +++ flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp @@ -37,6 +37,79 @@ static llvm::cl::opt<bool> forceMatmulAsElemental( namespace { +// Helper class to generate operations related to computing +// product of values. +class ProductFactory { +public: + ProductFactory(mlir::Location loc, fir::FirOpBuilder &builder) + : loc(loc), builder(builder) {} + + // Generate an update of the inner product value: + // acc += v1 * v2, OR + // acc += CONJ(v1) * v2, OR + // acc ||= v1 && v2 + // + // CONJ parameter specifies whether the first complex product argument + // needs to be conjugated. 
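In scalar terms, the update that the member function below emits behaves like this self-contained sketch; accumulateProduct is an illustrative name, std::complex stands in for the MLIR complex operations, and the conjugation mirrors Fortran's DOT_PRODUCT, which conjugates its first argument for complex operands:

#include <cassert>
#include <complex>

// acc += v1 * v2, or acc += conj(v1) * v2 when CONJ is set.
template <bool CONJ>
static std::complex<double> accumulateProduct(std::complex<double> acc,
                                              std::complex<double> v1,
                                              std::complex<double> v2) {
  return acc + (CONJ ? std::conj(v1) : v1) * v2;
}

int main() {
  const std::complex<double> i{0.0, 1.0};
  // conj(i) * i == (-i) * i == 1, so DOT_PRODUCT([i], [i]) is 1 + 0i.
  assert(accumulateProduct<true>({0.0, 0.0}, i, i) ==
         std::complex<double>(1.0, 0.0));
  return 0;
}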
+ template <bool CONJ = false> + mlir::Value genAccumulateProduct(mlir::Value acc, mlir::Value v1, + mlir::Value v2) { + mlir::Type resultType = acc.getType(); + acc = castToProductType(acc, resultType); + v1 = castToProductType(v1, resultType); + v2 = castToProductType(v2, resultType); + mlir::Value result; + if (mlir::isa<mlir::FloatType>(resultType)) { + result = builder.create<mlir::arith::AddFOp>( + loc, acc, builder.create<mlir::arith::MulFOp>(loc, v1, v2)); + } else if (mlir::isa<mlir::ComplexType>(resultType)) { + if constexpr (CONJ) + result = fir::IntrinsicLibrary{builder, loc}.genConjg(resultType, v1); + else + result = v1; + + result = builder.create<fir::AddcOp>( + loc, acc, builder.create<fir::MulcOp>(loc, result, v2)); + } else if (mlir::isa<mlir::IntegerType>(resultType)) { + result = builder.create<mlir::arith::AddIOp>( + loc, acc, builder.create<mlir::arith::MulIOp>(loc, v1, v2)); + } else if (mlir::isa<fir::LogicalType>(resultType)) { + result = builder.create<mlir::arith::OrIOp>( + loc, acc, builder.create<mlir::arith::AndIOp>(loc, v1, v2)); + } else { + llvm_unreachable("unsupported type"); + } + + return builder.createConvert(loc, resultType, result); + } + +private: + mlir::Location loc; + fir::FirOpBuilder &builder; + + mlir::Value castToProductType(mlir::Value value, mlir::Type type) { + if (mlir::isa<fir::LogicalType>(type)) + return builder.createConvert(loc, builder.getIntegerType(1), value); + + // TODO: the multiplications/additions by/of zero resulting from + // complex * real are optimized by LLVM under -fno-signed-zeros + // -fno-honor-nans. + // We can make them disappear by default if we: + // * either expand the complex multiplication into real + // operations, OR + // * set nnan nsz fast-math flags to the complex operations. + if (fir::isa_complex(type) && !fir::isa_complex(value.getType())) { + mlir::Value zeroCmplx = fir::factory::createZeroValue(builder, loc, type); + fir::factory::Complex helper(builder, loc); + mlir::Type partType = helper.getComplexPartType(type); + return helper.insertComplexPart(zeroCmplx, + castToProductType(value, partType), + /*isImagPart=*/false); + } + return builder.createConvert(loc, type, value); + } +}; + class TransposeAsElementalConversion : public mlir::OpRewritePattern<hlfir::TransposeOp> { public: @@ -90,11 +163,8 @@ private: static mlir::Value genResultShape(mlir::Location loc, fir::FirOpBuilder &builder, hlfir::Entity array) { - mlir::Value inShape = hlfir::genShape(loc, builder, array); - llvm::SmallVector<mlir::Value> inExtents = - hlfir::getExplicitExtentsFromShape(inShape, builder); - if (inShape.getUses().empty()) - inShape.getDefiningOp()->erase(); + llvm::SmallVector<mlir::Value, 2> inExtents = + hlfir::genExtentsVector(loc, builder, array); // transpose indices assert(inExtents.size() == 2 && "checked in TransposeOp::validate"); @@ -137,7 +207,7 @@ public: mlir::Value resultShape, dimExtent; llvm::SmallVector<mlir::Value> arrayExtents; if (isTotalReduction) - arrayExtents = genArrayExtents(loc, builder, array); + arrayExtents = hlfir::genExtentsVector(loc, builder, array); else std::tie(resultShape, dimExtent) = genResultShapeForPartialReduction(loc, builder, array, dimVal); @@ -163,7 +233,8 @@ public: // If DIM is not present, do total reduction. // Initial value for the reduction. 
- mlir::Value reductionInitValue = genInitValue(loc, builder, elementType); + mlir::Value reductionInitValue = + fir::factory::createZeroValue(builder, loc, elementType); // The reduction loop may be unordered if FastMathFlags::reassoc // transformations are allowed. The integer reduction is always @@ -264,17 +335,6 @@ public: } private: - static llvm::SmallVector<mlir::Value> - genArrayExtents(mlir::Location loc, fir::FirOpBuilder &builder, - hlfir::Entity array) { - mlir::Value inShape = hlfir::genShape(loc, builder, array); - llvm::SmallVector<mlir::Value> inExtents = - hlfir::getExplicitExtentsFromShape(inShape, builder); - if (inShape.getUses().empty()) - inShape.getDefiningOp()->erase(); - return inExtents; - } - // Return fir.shape specifying the shape of the result // of a SUM reduction with DIM=dimVal. The second return value // is the extent of the DIM dimension. @@ -283,7 +343,7 @@ private: fir::FirOpBuilder &builder, hlfir::Entity array, int64_t dimVal) { llvm::SmallVector<mlir::Value> inExtents = - genArrayExtents(loc, builder, array); + hlfir::genExtentsVector(loc, builder, array); assert(dimVal > 0 && dimVal <= static_cast<int64_t>(inExtents.size()) && "DIM must be present and a positive constant not exceeding " "the array's rank"); @@ -293,26 +353,6 @@ private: return {builder.create<fir::ShapeOp>(loc, inExtents), dimExtent}; } - // Generate the initial value for a SUM reduction with the given - // data type. - static mlir::Value genInitValue(mlir::Location loc, - fir::FirOpBuilder &builder, - mlir::Type elementType) { - if (auto ty = mlir::dyn_cast<mlir::FloatType>(elementType)) { - const llvm::fltSemantics &sem = ty.getFloatSemantics(); - return builder.createRealConstant(loc, elementType, - llvm::APFloat::getZero(sem)); - } else if (auto ty = mlir::dyn_cast<mlir::ComplexType>(elementType)) { - mlir::Value initValue = genInitValue(loc, builder, ty.getElementType()); - return fir::factory::Complex{builder, loc}.createComplex(ty, initValue, - initValue); - } else if (mlir::isa<mlir::IntegerType>(elementType)) { - return builder.createIntegerConstant(loc, elementType, 0); - } - - llvm_unreachable("unsupported SUM reduction type"); - } - // Generate scalar addition of the two values (of the same data type). 
static mlir::Value genScalarAdd(mlir::Location loc, fir::FirOpBuilder &builder, @@ -570,16 +610,10 @@ private: static std::tuple<mlir::Value, mlir::Value> genResultShape(mlir::Location loc, fir::FirOpBuilder &builder, hlfir::Entity input1, hlfir::Entity input2) { - mlir::Value input1Shape = hlfir::genShape(loc, builder, input1); - llvm::SmallVector<mlir::Value> input1Extents = - hlfir::getExplicitExtentsFromShape(input1Shape, builder); - if (input1Shape.getUses().empty()) - input1Shape.getDefiningOp()->erase(); - mlir::Value input2Shape = hlfir::genShape(loc, builder, input2); - llvm::SmallVector<mlir::Value> input2Extents = - hlfir::getExplicitExtentsFromShape(input2Shape, builder); - if (input2Shape.getUses().empty()) - input2Shape.getDefiningOp()->erase(); + llvm::SmallVector<mlir::Value, 2> input1Extents = + hlfir::genExtentsVector(loc, builder, input1); + llvm::SmallVector<mlir::Value, 2> input2Extents = + hlfir::genExtentsVector(loc, builder, input2); llvm::SmallVector<mlir::Value, 2> newExtents; mlir::Value innerProduct1Extent, innerProduct2Extent; @@ -627,60 +661,6 @@ private: innerProductExtent[0]}; } - static mlir::Value castToProductType(mlir::Location loc, - fir::FirOpBuilder &builder, - mlir::Value value, mlir::Type type) { - if (mlir::isa<fir::LogicalType>(type)) - return builder.createConvert(loc, builder.getIntegerType(1), value); - - // TODO: the multiplications/additions by/of zero resulting from - // complex * real are optimized by LLVM under -fno-signed-zeros - // -fno-honor-nans. - // We can make them disappear by default if we: - // * either expand the complex multiplication into real - // operations, OR - // * set nnan nsz fast-math flags to the complex operations. - if (fir::isa_complex(type) && !fir::isa_complex(value.getType())) { - mlir::Value zeroCmplx = fir::factory::createZeroValue(builder, loc, type); - fir::factory::Complex helper(builder, loc); - mlir::Type partType = helper.getComplexPartType(type); - return helper.insertComplexPart( - zeroCmplx, castToProductType(loc, builder, value, partType), - /*isImagPart=*/false); - } - return builder.createConvert(loc, type, value); - } - - // Generate an update of the inner product value: - // acc += v1 * v2, OR - // acc ||= v1 && v2 - static mlir::Value genAccumulateProduct(mlir::Location loc, - fir::FirOpBuilder &builder, - mlir::Type resultType, - mlir::Value acc, mlir::Value v1, - mlir::Value v2) { - acc = castToProductType(loc, builder, acc, resultType); - v1 = castToProductType(loc, builder, v1, resultType); - v2 = castToProductType(loc, builder, v2, resultType); - mlir::Value result; - if (mlir::isa<mlir::FloatType>(resultType)) - result = builder.create<mlir::arith::AddFOp>( - loc, acc, builder.create<mlir::arith::MulFOp>(loc, v1, v2)); - else if (mlir::isa<mlir::ComplexType>(resultType)) - result = builder.create<fir::AddcOp>( - loc, acc, builder.create<fir::MulcOp>(loc, v1, v2)); - else if (mlir::isa<mlir::IntegerType>(resultType)) - result = builder.create<mlir::arith::AddIOp>( - loc, acc, builder.create<mlir::arith::MulIOp>(loc, v1, v2)); - else if (mlir::isa<fir::LogicalType>(resultType)) - result = builder.create<mlir::arith::OrIOp>( - loc, acc, builder.create<mlir::arith::AndIOp>(loc, v1, v2)); - else - llvm_unreachable("unsupported type"); - - return builder.createConvert(loc, resultType, result); - } - static mlir::LogicalResult genContiguousMatmul(mlir::Location loc, fir::FirOpBuilder &builder, hlfir::Entity result, mlir::Value resultShape, @@ -748,9 +728,9 @@ private: hlfir::loadElementAt(loc, 
builder, lhs, {I, K}); hlfir::Entity rhsElementValue = hlfir::loadElementAt(loc, builder, rhs, {K, J}); - mlir::Value productValue = genAccumulateProduct( - loc, builder, resultElementType, resultElementValue, - lhsElementValue, rhsElementValue); + mlir::Value productValue = + ProductFactory{loc, builder}.genAccumulateProduct( + resultElementValue, lhsElementValue, rhsElementValue); builder.create<hlfir::AssignOp>(loc, productValue, resultElement); return {}; }; @@ -785,9 +765,9 @@ private: hlfir::loadElementAt(loc, builder, lhs, {J, K}); hlfir::Entity rhsElementValue = hlfir::loadElementAt(loc, builder, rhs, {K}); - mlir::Value productValue = genAccumulateProduct( - loc, builder, resultElementType, resultElementValue, - lhsElementValue, rhsElementValue); + mlir::Value productValue = + ProductFactory{loc, builder}.genAccumulateProduct( + resultElementValue, lhsElementValue, rhsElementValue); builder.create<hlfir::AssignOp>(loc, productValue, resultElement); return {}; }; @@ -817,9 +797,9 @@ private: hlfir::loadElementAt(loc, builder, lhs, {K}); hlfir::Entity rhsElementValue = hlfir::loadElementAt(loc, builder, rhs, {K, J}); - mlir::Value productValue = genAccumulateProduct( - loc, builder, resultElementType, resultElementValue, - lhsElementValue, rhsElementValue); + mlir::Value productValue = + ProductFactory{loc, builder}.genAccumulateProduct( + resultElementValue, lhsElementValue, rhsElementValue); builder.create<hlfir::AssignOp>(loc, productValue, resultElement); return {}; }; @@ -885,9 +865,9 @@ private: hlfir::loadElementAt(loc, builder, lhs, lhsIndices); hlfir::Entity rhsElementValue = hlfir::loadElementAt(loc, builder, rhs, rhsIndices); - mlir::Value productValue = genAccumulateProduct( - loc, builder, resultElementType, reductionArgs[0], lhsElementValue, - rhsElementValue); + mlir::Value productValue = + ProductFactory{loc, builder}.genAccumulateProduct( + reductionArgs[0], lhsElementValue, rhsElementValue); return {productValue}; }; llvm::SmallVector<mlir::Value, 1> innerProductValue = @@ -904,6 +884,73 @@ private: } }; +class DotProductConversion + : public mlir::OpRewritePattern<hlfir::DotProductOp> { +public: + using mlir::OpRewritePattern<hlfir::DotProductOp>::OpRewritePattern; + + llvm::LogicalResult + matchAndRewrite(hlfir::DotProductOp product, + mlir::PatternRewriter &rewriter) const override { + hlfir::Entity op = hlfir::Entity{product}; + if (!op.isScalar()) + return rewriter.notifyMatchFailure(product, "produces non-scalar result"); + + mlir::Location loc = product.getLoc(); + fir::FirOpBuilder builder{rewriter, product.getOperation()}; + hlfir::Entity lhs = hlfir::Entity{product.getLhs()}; + hlfir::Entity rhs = hlfir::Entity{product.getRhs()}; + mlir::Type resultElementType = product.getType(); + bool isUnordered = mlir::isa<mlir::IntegerType>(resultElementType) || + mlir::isa<fir::LogicalType>(resultElementType) || + static_cast<bool>(builder.getFastMathFlags() & + mlir::arith::FastMathFlags::reassoc); + + mlir::Value extent = genProductExtent(loc, builder, lhs, rhs); + + auto genBody = [&](mlir::Location loc, fir::FirOpBuilder &builder, + mlir::ValueRange oneBasedIndices, + mlir::ValueRange reductionArgs) + -> llvm::SmallVector<mlir::Value, 1> { + hlfir::Entity lhsElementValue = + hlfir::loadElementAt(loc, builder, lhs, oneBasedIndices); + hlfir::Entity rhsElementValue = + hlfir::loadElementAt(loc, builder, rhs, oneBasedIndices); + mlir::Value productValue = + ProductFactory{loc, builder}.genAccumulateProduct</*CONJ=*/true>( + reductionArgs[0], lhsElementValue, 
rhsElementValue); + return {productValue}; + }; + + mlir::Value initValue = + fir::factory::createZeroValue(builder, loc, resultElementType); + + llvm::SmallVector<mlir::Value, 1> result = hlfir::genLoopNestWithReductions( + loc, builder, {extent}, + /*reductionInits=*/{initValue}, genBody, isUnordered); + + rewriter.replaceOp(product, result[0]); + return mlir::success(); + } + +private: + static mlir::Value genProductExtent(mlir::Location loc, + fir::FirOpBuilder &builder, + hlfir::Entity input1, + hlfir::Entity input2) { + llvm::SmallVector<mlir::Value, 1> input1Extents = + hlfir::genExtentsVector(loc, builder, input1); + llvm::SmallVector<mlir::Value, 1> input2Extents = + hlfir::genExtentsVector(loc, builder, input2); + + assert(input1Extents.size() == 1 && input2Extents.size() == 1 && + "hlfir.dot_product arguments must be vectors"); + llvm::SmallVector<mlir::Value, 1> extent = + fir::factory::deduceOptimalExtents(input1Extents, input2Extents); + return extent[0]; + } +}; + class SimplifyHLFIRIntrinsics : public hlfir::impl::SimplifyHLFIRIntrinsicsBase<SimplifyHLFIRIntrinsics> { public: @@ -939,6 +986,8 @@ public: if (forceMatmulAsElemental || this->allowNewSideEffects) patterns.insert<MatmulConversion<hlfir::MatmulOp>>(context); + patterns.insert<DotProductConversion>(context); + if (mlir::failed(mlir::applyPatternsGreedily( getOperation(), std::move(patterns), config))) { mlir::emitError(getOperation()->getLoc(), diff --git flang/lib/Optimizer/OpenMP/GenericLoopConversion.cpp flang/lib/Optimizer/OpenMP/GenericLoopConversion.cpp index c3c1f3b2848b..555601c5e92d 100644 --- flang/lib/Optimizer/OpenMP/GenericLoopConversion.cpp +++ flang/lib/Optimizer/OpenMP/GenericLoopConversion.cpp @@ -30,19 +30,39 @@ class GenericLoopConversionPattern : public mlir::OpConversionPattern<mlir::omp::LoopOp> { public: enum class GenericLoopCombinedInfo { - None, + Standalone, TargetTeamsLoop, TargetParallelLoop }; using mlir::OpConversionPattern<mlir::omp::LoopOp>::OpConversionPattern; + explicit GenericLoopConversionPattern(mlir::MLIRContext *ctx) + : mlir::OpConversionPattern<mlir::omp::LoopOp>{ctx} { + // Enable rewrite recursion to make sure nested `loop` directives are + // handled. 
+ this->setHasBoundedRewriteRecursion(true); + } + mlir::LogicalResult matchAndRewrite(mlir::omp::LoopOp loopOp, OpAdaptor adaptor, mlir::ConversionPatternRewriter &rewriter) const override { assert(mlir::succeeded(checkLoopConversionSupportStatus(loopOp))); - rewriteToDistributeParallelDo(loopOp, rewriter); + GenericLoopCombinedInfo combinedInfo = findGenericLoopCombineInfo(loopOp); + + switch (combinedInfo) { + case GenericLoopCombinedInfo::Standalone: + rewriteToSimdLoop(loopOp, rewriter); + break; + case GenericLoopCombinedInfo::TargetParallelLoop: + llvm_unreachable("not yet implemented: `parallel loop` directive"); + break; + case GenericLoopCombinedInfo::TargetTeamsLoop: + rewriteToDistributeParallelDo(loopOp, rewriter); + break; + } + rewriter.eraseOp(loopOp); return mlir::success(); } @@ -52,9 +72,8 @@ public: GenericLoopCombinedInfo combinedInfo = findGenericLoopCombineInfo(loopOp); switch (combinedInfo) { - case GenericLoopCombinedInfo::None: - return loopOp.emitError( - "not yet implemented: Standalone `omp loop` directive"); + case GenericLoopCombinedInfo::Standalone: + break; case GenericLoopCombinedInfo::TargetParallelLoop: return loopOp.emitError( "not yet implemented: Combined `omp target parallel loop` directive"); @@ -86,7 +105,7 @@ private: static GenericLoopCombinedInfo findGenericLoopCombineInfo(mlir::omp::LoopOp loopOp) { mlir::Operation *parentOp = loopOp->getParentOp(); - GenericLoopCombinedInfo result = GenericLoopCombinedInfo::None; + GenericLoopCombinedInfo result = GenericLoopCombinedInfo::Standalone; if (auto teamsOp = mlir::dyn_cast_if_present<mlir::omp::TeamsOp>(parentOp)) if (mlir::isa_and_present<mlir::omp::TargetOp>(teamsOp->getParentOp())) @@ -100,6 +119,62 @@ private: return result; } + /// Rewrites standalone `loop` directives to equivalent `simd` constructs. + /// The reasoning behind this decision is that according to the spec (version + /// 5.2, section 11.7.1): + /// + /// "If the bind clause is not specified on a construct for which it may be + /// specified and the construct is closely nested inside a teams or parallel + /// construct, the effect is as if binding is teams or parallel. If none of + /// those conditions hold, the binding region is not defined." + /// + /// which means that standalone `loop` directives have an undefined binding + /// region. Moreover, the spec says (in the next paragraph): + /// + /// "The specified binding region determines the binding thread set. + /// Specifically, if the binding region is a teams region, then the binding + /// thread set is the set of initial threads that are executing that region + /// while if the binding region is a parallel region, then the binding thread + /// set is the team of threads that are executing that region. If the binding + /// region is not defined, then the binding thread set is the encountering + /// thread." + /// + /// which means that the binding thread set for a standalone `loop` directive + /// is only the encountering thread. + /// + /// Since the encountering thread is the binding thread (set) for a + /// standalone `loop` directive, the best we can do in such a case is to "simd" + /// the directive.
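At the source level, the rewrite implemented just below amounts to treating a standalone loop construct as simd. A C++ OpenMP sketch for illustration (flang handles the Fortran `!$omp loop`, but the C/C++ spelling is equivalent; saxpy is an illustrative name):

#include <cstddef>

// A standalone `#pragma omp loop` here (no bind clause, not nested inside
// teams or parallel) binds only to the encountering thread, so the pass
// lowers it as if the user had written `#pragma omp simd`:
void saxpy(float *y, const float *x, float a, std::size_t n) {
#pragma omp simd
  for (std::size_t i = 0; i < n; ++i)
    y[i] += a * x[i];
}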
+ void rewriteToSimdLoop(mlir::omp::LoopOp loopOp, + mlir::ConversionPatternRewriter &rewriter) const { + loopOp.emitWarning("Detected standalone OpenMP `loop` directive; the " + "associated loop will be rewritten to `simd`."); + mlir::omp::SimdOperands simdClauseOps; + simdClauseOps.privateVars = loopOp.getPrivateVars(); + + auto privateSyms = loopOp.getPrivateSyms(); + if (privateSyms) + simdClauseOps.privateSyms.assign(privateSyms->begin(), + privateSyms->end()); + + Fortran::common::openmp::EntryBlockArgs simdArgs; + simdArgs.priv.vars = simdClauseOps.privateVars; + + auto simdOp = + rewriter.create<mlir::omp::SimdOp>(loopOp.getLoc(), simdClauseOps); + mlir::Block *simdBlock = + genEntryBlock(rewriter, simdArgs, simdOp.getRegion()); + + mlir::IRMapping mapper; + mlir::Block &loopBlock = *loopOp.getRegion().begin(); + + for (auto [loopOpArg, simdOpArg] : + llvm::zip_equal(loopBlock.getArguments(), simdBlock->getArguments())) + mapper.map(loopOpArg, simdOpArg); + + rewriter.clone(*loopOp.begin(), mapper); + } + void rewriteToDistributeParallelDo( mlir::omp::LoopOp loopOp, mlir::ConversionPatternRewriter &rewriter) const { diff --git flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp index 8ae3d313d881..555f354521c9 100644 --- flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp +++ flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp @@ -258,70 +258,6 @@ mlir::LLVM::DITypeAttr DebugTypeGenerator::convertBoxedSequenceType( dataLocation, /*rank=*/nullptr, allocated, associated); } -// If the type is a pointer or array type then gets its underlying type. -static mlir::LLVM::DITypeAttr getUnderlyingType(mlir::LLVM::DITypeAttr Ty) { - if (auto ptrTy = - mlir::dyn_cast_if_present<mlir::LLVM::DIDerivedTypeAttr>(Ty)) { - if (ptrTy.getTag() == llvm::dwarf::DW_TAG_pointer_type) - Ty = getUnderlyingType(ptrTy.getBaseType()); - } - if (auto comTy = - mlir::dyn_cast_if_present<mlir::LLVM::DICompositeTypeAttr>(Ty)) { - if (comTy.getTag() == llvm::dwarf::DW_TAG_array_type) - Ty = getUnderlyingType(comTy.getBaseType()); - } - return Ty; -} - -// Currently, the handling of recursive debug type in mlir has some limitations. -// Those limitations were discussed at the end of the thread for following PR. -// https://github.com/llvm/llvm-project/pull/106571 -// -// Problem could be explained with the following example code: -// type t2 -// type(t1), pointer :: p1 -// end type -// type t1 -// type(t2), pointer :: p2 -// end type -// In the description below, type_self means a temporary type that is generated -// as a place holder while the members of that type are being processed. -// -// If we process t1 first then we will have the following structure after it has -// been processed. -// t1 -> t2 -> t1_self -// This is because when we started processing t2, we did not have the complete -// t1 but its place holder t1_self. -// Now if some entity requires t2, we will already have that in cache and will -// return it. But this t2 refers to t1_self and not to t1. In mlir handling, -// only those types are allowed to have _self reference which are wrapped by -// entity whose reference it is. So t1 -> t2 -> t1_self is ok because the -// t1_self reference can be resolved by the outer t1. But standalone t2 is not -// because there will be no way to resolve it. Until this is fixed in mlir, we -// avoid caching such types. Please see DebugTranslation::translateRecursive for -// details on how mlir handles recursive types.
-static bool canCacheThisType(mlir::LLVM::DICompositeTypeAttr comTy) { - for (auto el : comTy.getElements()) { - if (auto mem = - mlir::dyn_cast_if_present<mlir::LLVM::DIDerivedTypeAttr>(el)) { - mlir::LLVM::DITypeAttr memTy = getUnderlyingType(mem.getBaseType()); - if (auto baseTy = - mlir::dyn_cast_if_present<mlir::LLVM::DICompositeTypeAttr>( - memTy)) { - // We will not cache a type if one of its member meets the following - // conditions: - // 1. It is a structure type - // 2. It is a place holder type (getIsRecSelf() is true) - // 3. It is not a self reference. It is ok to have t1_self in t1. - if (baseTy.getTag() == llvm::dwarf::DW_TAG_structure_type && - baseTy.getIsRecSelf() && (comTy.getRecId() != baseTy.getRecId())) - return false; - } - } - } - return true; -} - std::pair<std::uint64_t, unsigned short> DebugTypeGenerator::getFieldSizeAndAlign(mlir::Type fieldTy) { mlir::Type llvmTy; @@ -343,6 +279,7 @@ mlir::LLVM::DITypeAttr DebugTypeGenerator::convertRecordType( if (iter != typeCache.end()) return iter->second; + bool canCacheThisType = true; llvm::SmallVector<mlir::LLVM::DINodeAttr> elements; mlir::MLIRContext *context = module.getContext(); auto recId = mlir::DistinctAttr::create(mlir::UnitAttr::get(context)); @@ -406,6 +343,62 @@ mlir::LLVM::DITypeAttr DebugTypeGenerator::convertRecordType( /*extra data=*/nullptr); elements.push_back(tyAttr); offset += llvm::alignTo(byteSize, byteAlign); + + // Currently, the handling of recursive debug types in mlir has some + // limitations that were discussed at the end of the thread for the following + // PR. + // https://github.com/llvm/llvm-project/pull/106571 + // + // The problem can be explained with the following example code: + // type t2 + // type(t1), pointer :: p1 + // end type + // type t1 + // type(t2), pointer :: p2 + // end type + // In the description below, type_self means a temporary type that is + // generated as a placeholder while the members of that type are being + // processed. + // + // If we process t1 first, then we will have the following structure after + // it has been processed. + // t1 -> t2 -> t1_self + // This is because when we started processing t2, we did not have the + // complete t1 but its placeholder t1_self. + // Now if some entity requires t2, we will already have that in the cache and + // will return it. But this t2 refers to t1_self and not to t1. In mlir + // handling, only those types are allowed to have a _self reference that are + // wrapped by the entity the reference points to. So t1 -> t2 -> t1_self is ok + // because the t1_self reference can be resolved by the outer t1. But + // standalone t2 is not because there will be no way to resolve it. Until + // this is fixed in mlir, we avoid caching such types. Please see + // DebugTranslation::translateRecursive for details on how mlir handles + // recursive types. + // The code below checks for situations where it would be unsafe to cache + // a type, to avoid this problem. We do that in 2 situations. + // 1. If a member is a record type, then its type would have been processed + // before reaching here. If it is not in the cache, it means that it was + // found to be unsafe to cache. So any type containing it will also not + // be cached. + // 2. The type of the member is found in the cache but it is a placeholder. + // In this case, its recID should match the recID of the type we are + // processing. This helps us to cache the following type.
+ // type t + // type(t), allocatable :: p + // end type + mlir::Type baseTy = getDerivedType(fieldTy); + if (auto recTy = mlir::dyn_cast<fir::RecordType>(baseTy)) { + auto iter = typeCache.find(recTy); + if (iter == typeCache.end()) + canCacheThisType = false; + else { + if (auto tyAttr = + mlir::dyn_cast<mlir::LLVM::DICompositeTypeAttr>(iter->second)) { + if (tyAttr.getIsRecSelf() && tyAttr.getRecId() != recId) + canCacheThisType = false; + } + } + } } auto finalAttr = mlir::LLVM::DICompositeTypeAttr::get( @@ -414,7 +407,7 @@ mlir::LLVM::DITypeAttr DebugTypeGenerator::convertRecordType( /*baseType=*/nullptr, mlir::LLVM::DIFlags::Zero, offset * 8, /*alignInBits=*/0, elements, /*dataLocation=*/nullptr, /*rank=*/nullptr, /*allocated=*/nullptr, /*associated=*/nullptr); - if (canCacheThisType(finalAttr)) { + if (canCacheThisType) { typeCache[Ty] = finalAttr; } else { auto iter = typeCache.find(Ty); diff --git flang/lib/Semantics/check-omp-structure.cpp flang/lib/Semantics/check-omp-structure.cpp index 6db43cf6f04b..d3f2d3fd2f9d 100644 --- flang/lib/Semantics/check-omp-structure.cpp +++ flang/lib/Semantics/check-omp-structure.cpp @@ -213,6 +213,30 @@ private: std::map<std::string, std::int64_t> constructNamesAndLevels_; }; +// `OmpUntiedTaskDesignatorChecker` is used to check if a designator +// can appear within an UNTIED TASK construct +class OmpUntiedTaskDesignatorChecker { +public: + OmpUntiedTaskDesignatorChecker(SemanticsContext &context) + : context_{context} {} + + template <typename T> bool Pre(const T &) { return true; } + template <typename T> void Post(const T &) {} + + bool Pre(const parser::Name &name) { + if (name.symbol && name.symbol->test(Symbol::Flag::OmpThreadprivate)) { + // OpenMP 5.2, section 5.2: threadprivate directive restriction + context_.Say(name.source, + "A THREADPRIVATE variable `%s` cannot appear in an UNTIED TASK region"_err_en_US, + name.source); + } + return true; + } + +private: + SemanticsContext &context_; +}; + bool OmpStructureChecker::CheckAllowedClause(llvmOmpClause clause) { unsigned version{context_.langOptions().OpenMPVersion}; DirectiveContext &dirCtx = GetContext(); @@ -1172,6 +1196,16 @@ void OmpStructureChecker::Enter(const parser::OpenMPBlockConstruct &x) { HasInvalidWorksharingNesting( beginDir.source, llvm::omp::nestedWorkshareErrSet); break; + case llvm::omp::Directive::OMPD_task: { + const auto &clauses{std::get<parser::OmpClauseList>(beginBlockDir.t)}; + for (const auto &clause : clauses.v) { + if (std::get_if<parser::OmpClause::Untied>(&clause.u)) { + OmpUntiedTaskDesignatorChecker check{context_}; + parser::Walk(block, check); + } + } + break; + } default: break; } diff --git flang/lib/Semantics/resolve-directives.cpp flang/lib/Semantics/resolve-directives.cpp index 39478b58a907..ea102371334a 100644 --- flang/lib/Semantics/resolve-directives.cpp +++ flang/lib/Semantics/resolve-directives.cpp @@ -1777,7 +1777,6 @@ void OmpAttributeVisitor::ResolveSeqLoopIndexInParallelOrTaskConstruct( // Use of DO CONCURRENT inside OpenMP construct is unspecified behavior // till OpenMP-5.0 standard. // In above both cases we skip the privatization of iteration variables.
-// [OpenMP 5.1] DO CONCURRENT indices are private bool OmpAttributeVisitor::Pre(const parser::DoConstruct &x) { if (!dirContext_.empty() && GetContext().withinConstruct) { llvm::SmallVector<const parser::Name *> ivs; @@ -1785,20 +1784,6 @@ bool OmpAttributeVisitor::Pre(const parser::DoConstruct &x) { const parser::Name *iv{GetLoopIndex(x)}; if (iv && iv->symbol) ivs.push_back(iv); - } else if (x.IsDoConcurrent()) { - const Fortran::parser::LoopControl *loopControl = &*x.GetLoopControl(); - const Fortran::parser::LoopControl::Concurrent &concurrent = - std::get<Fortran::parser::LoopControl::Concurrent>(loopControl->u); - const Fortran::parser::ConcurrentHeader &concurrentHeader = - std::get<Fortran::parser::ConcurrentHeader>(concurrent.t); - const std::list<Fortran::parser::ConcurrentControl> &controls = - std::get<std::list<Fortran::parser::ConcurrentControl>>( - concurrentHeader.t); - for (const auto &control : controls) { - const parser::Name *iv{&std::get<0>(control.t)}; - if (iv && iv->symbol) - ivs.push_back(iv); - } } ordCollapseLevel--; for (auto iv : ivs) { @@ -1810,9 +1795,6 @@ bool OmpAttributeVisitor::Pre(const parser::DoConstruct &x) { if (ordCollapseLevel) { if (const auto *details{iv->symbol->detailsIf<HostAssocDetails>()}) { const Symbol *tpSymbol = &details->symbol(); - // TODO: DoConcurrent won't capture the following check because a new - // symbol is declared in ResolveIndexName(), which will not have the - // OmpThreadprivate flag. if (tpSymbol->test(Symbol::Flag::OmpThreadprivate)) { context_.Say(iv->source, "Loop iteration variable %s is not allowed in THREADPRIVATE."_err_en_US, @@ -2119,6 +2101,7 @@ static bool IsPrivatizable(const Symbol *sym) { *sym) && /* OpenMP 5.2, 5.1.1: Assumed-size arrays are shared*/ !sym->owner().IsDerivedType() && sym->owner().kind() != Scope::Kind::ImpliedDos && + sym->owner().kind() != Scope::Kind::Forall && !sym->detailsIf<semantics::AssocEntityDetails>() && !sym->detailsIf<semantics::NamelistDetails>() && (!misc || diff --git flang/lib/Semantics/resolve-names.cpp flang/lib/Semantics/resolve-names.cpp index f3c2a5bf094d..9857ae61939d 100644 --- flang/lib/Semantics/resolve-names.cpp +++ flang/lib/Semantics/resolve-names.cpp @@ -736,6 +736,8 @@ protected: std::vector<const std::list<parser::EquivalenceObject> *> equivalenceSets; // Names of all common block objects in the scope std::set<SourceName> commonBlockObjects; + // Names of all entities that appear in a declare target declaration + std::set<SourceName> declareTargetNames; // Info about SAVE statements and attributes in current scope struct { std::optional<SourceName> saveAll; // "SAVE" without entity list @@ -1223,6 +1225,7 @@ private: const parser::Name *FindComponent(const parser::Name *, const parser::Name &); void Initialization(const parser::Name &, const parser::Initialization &, bool inComponentDecl); + bool FindAndMarkDeclareTargetSymbol(const parser::Name &); bool PassesLocalityChecks( const parser::Name &name, Symbol &symbol, Symbol::Flag flag); bool CheckForHostAssociatedImplicit(const parser::Name &); @@ -1524,7 +1527,47 @@ public: return true; } void Post(const parser::OpenMPThreadprivate &) { SkipImplicitTyping(false); } - bool Pre(const parser::OpenMPDeclareTargetConstruct &) { + bool Pre(const parser::OpenMPDeclareTargetConstruct &x) { + const auto &spec{std::get<parser::OmpDeclareTargetSpecifier>(x.t)}; + auto populateDeclareTargetNames{ + [this](const parser::OmpObjectList &objectList) { + for (const auto &ompObject : objectList.v) { + common::visit( +
common::visitors{ + [&](const parser::Designator &designator) { + if (const auto *name{ + semantics::getDesignatorNameIfDataRef( + designator)}) { + specPartState_.declareTargetNames.insert(name->source); + } + }, + [&](const parser::Name &name) { + specPartState_.declareTargetNames.insert(name.source); + }, + }, + ompObject.u); + } + }}; + + if (const auto *objectList{parser::Unwrap<parser::OmpObjectList>(spec.u)}) { + populateDeclareTargetNames(*objectList); + } else if (const auto *clauseList{ + parser::Unwrap<parser::OmpClauseList>(spec.u)}) { + for (const auto &clause : clauseList->v) { + if (const auto *toClause{ + std::get_if<parser::OmpClause::To>(&clause.u)}) { + populateDeclareTargetNames( + std::get<parser::OmpObjectList>(toClause->v.t)); + } else if (const auto *linkClause{ + std::get_if<parser::OmpClause::Link>(&clause.u)}) { + populateDeclareTargetNames(linkClause->v); + } else if (const auto *enterClause{ + std::get_if<parser::OmpClause::Enter>(&clause.u)}) { + populateDeclareTargetNames(enterClause->v); + } + } + } + SkipImplicitTyping(true); return true; } @@ -8126,7 +8169,12 @@ const parser::Name *DeclarationVisitor::ResolveDataRef( // If implicit types are allowed, ensure name is in the symbol table. // Otherwise, report an error if it hasn't been declared. const parser::Name *DeclarationVisitor::ResolveName(const parser::Name &name) { - FindSymbol(name); + if (!FindSymbol(name)) { + if (FindAndMarkDeclareTargetSymbol(name)) { + return &name; + } + } + if (CheckForHostAssociatedImplicit(name)) { NotePossibleBadForwardRef(name); return &name; } @@ -8313,6 +8361,48 @@ const parser::Name *DeclarationVisitor::FindComponent( return nullptr; } +bool DeclarationVisitor::FindAndMarkDeclareTargetSymbol( + const parser::Name &name) { + if (!specPartState_.declareTargetNames.empty()) { + if (specPartState_.declareTargetNames.count(name.source)) { + if (!currScope().IsTopLevel()) { + // Search preceding scopes until we find a matching symbol or run out + // of scopes to search; we skip the current scope as it's already been + // designated as implicit here. + Symbol *symbol = nullptr; + for (auto *scope = &currScope().parent();; scope = &scope->parent()) { + if (Symbol * symbol{scope->FindSymbol(name.source)}) { + if (symbol->test(Symbol::Flag::Subroutine) || + symbol->test(Symbol::Flag::Function)) { + const auto [sym, success]{currScope().try_emplace( + symbol->name(), Attrs{}, HostAssocDetails{*symbol})}; + assert(success && + "FindAndMarkDeclareTargetSymbol could not emplace new " + "subroutine/function symbol"); + name.symbol = &*sym->second; + symbol->test(Symbol::Flag::Subroutine) + ? name.symbol->set(Symbol::Flag::Subroutine) + : name.symbol->set(Symbol::Flag::Function); + return true; + } + // If we find a symbol that is not a function or subroutine, we + // currently escape without doing anything. + break; + } + + // This is our loop exit condition, as parent() has a built-in assert + // if you call it on a top-level scope, rather than returning a null + // value. + if (scope->IsTopLevel()) { + return false; + } + } + } + } + } + return false; +} + void DeclarationVisitor::Initialization(const parser::Name &name, const parser::Initialization &init, bool inComponentDecl) { // Traversal of the initializer was deferred to here so that the diff --git flang/test/Driver/fno-zero-init.f90 flang/test/Driver/fno-zero-init.f90 new file mode 100644 index 000000000000..2ffa10dd040d --- /dev/null +++ flang/test/Driver/fno-zero-init.f90 @@ -0,0 +1,9 @@ +!
Check that the driver passes through -f[no-]init-global-zero: +! RUN: %flang -### -S -finit-global-zero %s -o - 2>&1 | FileCheck --check-prefix=CHECK-POS %s +! RUN: %flang -### -S -fno-init-global-zero %s -o - 2>&1 | FileCheck --check-prefix=CHECK-NEG %s +! Check that the compiler accepts -f[no-]init-global-zero: +! RUN: %flang_fc1 -emit-hlfir -finit-global-zero %s -o - +! RUN: %flang_fc1 -emit-hlfir -fno-init-global-zero %s -o - + +! CHECK-POS: "-fc1"{{.*}}"-finit-global-zero" +! CHECK-NEG: "-fc1"{{.*}}"-fno-init-global-zero" diff --git flang/test/HLFIR/simplify-hlfir-intrinsics-dotproduct.fir flang/test/HLFIR/simplify-hlfir-intrinsics-dotproduct.fir new file mode 100644 index 000000000000..f59b1422dbc8 --- /dev/null +++ flang/test/HLFIR/simplify-hlfir-intrinsics-dotproduct.fir @@ -0,0 +1,144 @@ +// Test hlfir.dot_product simplification to a reduction loop: +// RUN: fir-opt --simplify-hlfir-intrinsics %s | FileCheck %s + +func.func @dot_product_integer(%arg0: !hlfir.expr<?xi16>, %arg1: !hlfir.expr<?xi32>) -> i32 { + %res = hlfir.dot_product %arg0 %arg1 : (!hlfir.expr<?xi16>, !hlfir.expr<?xi32>) -> i32 + return %res : i32 +} +// CHECK-LABEL: func.func @dot_product_integer( +// CHECK-SAME: %[[VAL_0:.*]]: !hlfir.expr<?xi16>, +// CHECK-SAME: %[[VAL_1:.*]]: !hlfir.expr<?xi32>) -> i32 { +// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_3:.*]] = arith.constant 0 : i32 +// CHECK: %[[VAL_4:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr<?xi16>) -> !fir.shape<1> +// CHECK: %[[VAL_5:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 0 : index} : (!fir.shape<1>) -> index +// CHECK: %[[VAL_6:.*]] = fir.do_loop %[[VAL_7:.*]] = %[[VAL_2]] to %[[VAL_5]] step %[[VAL_2]] unordered iter_args(%[[VAL_8:.*]] = %[[VAL_3]]) -> (i32) { +// CHECK: %[[VAL_9:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_7]] : (!hlfir.expr<?xi16>, index) -> i16 +// CHECK: %[[VAL_10:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_7]] : (!hlfir.expr<?xi32>, index) -> i32 +// CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_9]] : (i16) -> i32 +// CHECK: %[[VAL_12:.*]] = arith.muli %[[VAL_11]], %[[VAL_10]] : i32 +// CHECK: %[[VAL_13:.*]] = arith.addi %[[VAL_8]], %[[VAL_12]] : i32 +// CHECK: fir.result %[[VAL_13]] : i32 +// CHECK: } +// CHECK: return %[[VAL_6]] : i32 +// CHECK: } + +func.func @dot_product_real(%arg0: !hlfir.expr<?xf32>, %arg1: !hlfir.expr<?xf16>) -> f32 { + %res = hlfir.dot_product %arg0 %arg1 : (!hlfir.expr<?xf32>, !hlfir.expr<?xf16>) -> f32 + return %res : f32 +} +// CHECK-LABEL: func.func @dot_product_real( +// CHECK-SAME: %[[VAL_0:.*]]: !hlfir.expr<?xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: !hlfir.expr<?xf16>) -> f32 { +// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: %[[VAL_4:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr<?xf32>) -> !fir.shape<1> +// CHECK: %[[VAL_5:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 0 : index} : (!fir.shape<1>) -> index +// CHECK: %[[VAL_6:.*]] = fir.do_loop %[[VAL_7:.*]] = %[[VAL_2]] to %[[VAL_5]] step %[[VAL_2]] iter_args(%[[VAL_8:.*]] = %[[VAL_3]]) -> (f32) { +// CHECK: %[[VAL_9:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_7]] : (!hlfir.expr<?xf32>, index) -> f32 +// CHECK: %[[VAL_10:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_7]] : (!hlfir.expr<?xf16>, index) -> f16 +// CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_10]] : (f16) -> f32 +// CHECK: %[[VAL_12:.*]] = arith.mulf %[[VAL_9]], %[[VAL_11]] : f32 +// CHECK: %[[VAL_13:.*]] = arith.addf %[[VAL_8]], %[[VAL_12]] : f32 +// CHECK: fir.result %[[VAL_13]] : f32 +// CHECK: } +// CHECK: return 
%[[VAL_6]] : f32 +// CHECK: } + +func.func @dot_product_complex(%arg0: !hlfir.expr<?xcomplex<f32>>, %arg1: !hlfir.expr<?xcomplex<f16>>) -> complex<f32> { + %res = hlfir.dot_product %arg0 %arg1 : (!hlfir.expr<?xcomplex<f32>>, !hlfir.expr<?xcomplex<f16>>) -> complex<f32> + return %res : complex<f32> +} +// CHECK-LABEL: func.func @dot_product_complex( +// CHECK-SAME: %[[VAL_0:.*]]: !hlfir.expr<?xcomplex<f32>>, +// CHECK-SAME: %[[VAL_1:.*]]: !hlfir.expr<?xcomplex<f16>>) -> complex<f32> { +// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: %[[VAL_4:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr<?xcomplex<f32>>) -> !fir.shape<1> +// CHECK: %[[VAL_5:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 0 : index} : (!fir.shape<1>) -> index +// CHECK: %[[VAL_6:.*]] = fir.undefined complex<f32> +// CHECK: %[[VAL_7:.*]] = fir.insert_value %[[VAL_6]], %[[VAL_3]], [0 : index] : (complex<f32>, f32) -> complex<f32> +// CHECK: %[[VAL_8:.*]] = fir.insert_value %[[VAL_7]], %[[VAL_3]], [1 : index] : (complex<f32>, f32) -> complex<f32> +// CHECK: %[[VAL_9:.*]] = fir.do_loop %[[VAL_10:.*]] = %[[VAL_2]] to %[[VAL_5]] step %[[VAL_2]] iter_args(%[[VAL_11:.*]] = %[[VAL_8]]) -> (complex<f32>) { +// CHECK: %[[VAL_12:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_10]] : (!hlfir.expr<?xcomplex<f32>>, index) -> complex<f32> +// CHECK: %[[VAL_13:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_10]] : (!hlfir.expr<?xcomplex<f16>>, index) -> complex<f16> +// CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_13]] : (complex<f16>) -> complex<f32> +// CHECK: %[[VAL_15:.*]] = fir.extract_value %[[VAL_12]], [1 : index] : (complex<f32>) -> f32 +// CHECK: %[[VAL_16:.*]] = arith.negf %[[VAL_15]] : f32 +// CHECK: %[[VAL_17:.*]] = fir.insert_value %[[VAL_12]], %[[VAL_16]], [1 : index] : (complex<f32>, f32) -> complex<f32> +// CHECK: %[[VAL_18:.*]] = fir.mulc %[[VAL_17]], %[[VAL_14]] : complex<f32> +// CHECK: %[[VAL_19:.*]] = fir.addc %[[VAL_11]], %[[VAL_18]] : complex<f32> +// CHECK: fir.result %[[VAL_19]] : complex<f32> +// CHECK: } +// CHECK: return %[[VAL_9]] : complex<f32> +// CHECK: } + +func.func @dot_product_real_complex(%arg0: !hlfir.expr<?xf32>, %arg1: !hlfir.expr<?xcomplex<f16>>) -> complex<f32> { + %res = hlfir.dot_product %arg0 %arg1 : (!hlfir.expr<?xf32>, !hlfir.expr<?xcomplex<f16>>) -> complex<f32> + return %res : complex<f32> +} +// CHECK-LABEL: func.func @dot_product_real_complex( +// CHECK-SAME: %[[VAL_0:.*]]: !hlfir.expr<?xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: !hlfir.expr<?xcomplex<f16>>) -> complex<f32> { +// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: %[[VAL_4:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr<?xf32>) -> !fir.shape<1> +// CHECK: %[[VAL_5:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 0 : index} : (!fir.shape<1>) -> index +// CHECK: %[[VAL_6:.*]] = fir.undefined complex<f32> +// CHECK: %[[VAL_7:.*]] = fir.insert_value %[[VAL_6]], %[[VAL_3]], [0 : index] : (complex<f32>, f32) -> complex<f32> +// CHECK: %[[VAL_8:.*]] = fir.insert_value %[[VAL_7]], %[[VAL_3]], [1 : index] : (complex<f32>, f32) -> complex<f32> +// CHECK: %[[VAL_9:.*]] = fir.do_loop %[[VAL_10:.*]] = %[[VAL_2]] to %[[VAL_5]] step %[[VAL_2]] iter_args(%[[VAL_11:.*]] = %[[VAL_8]]) -> (complex<f32>) { +// CHECK: %[[VAL_12:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_10]] : (!hlfir.expr<?xf32>, index) -> f32 +// CHECK: %[[VAL_13:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_10]] : (!hlfir.expr<?xcomplex<f16>>, index) -> complex<f16> +// CHECK: 
%[[VAL_14:.*]] = fir.undefined complex<f32> +// CHECK: %[[VAL_15:.*]] = fir.insert_value %[[VAL_14]], %[[VAL_3]], [0 : index] : (complex<f32>, f32) -> complex<f32> +// CHECK: %[[VAL_16:.*]] = fir.insert_value %[[VAL_15]], %[[VAL_3]], [1 : index] : (complex<f32>, f32) -> complex<f32> +// CHECK: %[[VAL_17:.*]] = fir.insert_value %[[VAL_16]], %[[VAL_12]], [0 : index] : (complex<f32>, f32) -> complex<f32> +// CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_13]] : (complex<f16>) -> complex<f32> +// CHECK: %[[VAL_19:.*]] = fir.extract_value %[[VAL_17]], [1 : index] : (complex<f32>) -> f32 +// CHECK: %[[VAL_20:.*]] = arith.negf %[[VAL_19]] : f32 +// CHECK: %[[VAL_21:.*]] = fir.insert_value %[[VAL_17]], %[[VAL_20]], [1 : index] : (complex<f32>, f32) -> complex<f32> +// CHECK: %[[VAL_22:.*]] = fir.mulc %[[VAL_21]], %[[VAL_18]] : complex<f32> +// CHECK: %[[VAL_23:.*]] = fir.addc %[[VAL_11]], %[[VAL_22]] : complex<f32> +// CHECK: fir.result %[[VAL_23]] : complex<f32> +// CHECK: } +// CHECK: return %[[VAL_9]] : complex<f32> +// CHECK: } + +func.func @dot_product_logical(%arg0: !hlfir.expr<?x!fir.logical<1>>, %arg1: !hlfir.expr<?x!fir.logical<4>>) -> !fir.logical<4> { + %res = hlfir.dot_product %arg0 %arg1 : (!hlfir.expr<?x!fir.logical<1>>, !hlfir.expr<?x!fir.logical<4>>) -> !fir.logical<4> + return %res : !fir.logical<4> +} +// CHECK-LABEL: func.func @dot_product_logical( +// CHECK-SAME: %[[VAL_0:.*]]: !hlfir.expr<?x!fir.logical<1>>, +// CHECK-SAME: %[[VAL_1:.*]]: !hlfir.expr<?x!fir.logical<4>>) -> !fir.logical<4> { +// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_3:.*]] = arith.constant false +// CHECK: %[[VAL_4:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr<?x!fir.logical<1>>) -> !fir.shape<1> +// CHECK: %[[VAL_5:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 0 : index} : (!fir.shape<1>) -> index +// CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_3]] : (i1) -> !fir.logical<4> +// CHECK: %[[VAL_7:.*]] = fir.do_loop %[[VAL_8:.*]] = %[[VAL_2]] to %[[VAL_5]] step %[[VAL_2]] unordered iter_args(%[[VAL_9:.*]] = %[[VAL_6]]) -> (!fir.logical<4>) { +// CHECK: %[[VAL_10:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_8]] : (!hlfir.expr<?x!fir.logical<1>>, index) -> !fir.logical<1> +// CHECK: %[[VAL_11:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_8]] : (!hlfir.expr<?x!fir.logical<4>>, index) -> !fir.logical<4> +// CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_9]] : (!fir.logical<4>) -> i1 +// CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_10]] : (!fir.logical<1>) -> i1 +// CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_11]] : (!fir.logical<4>) -> i1 +// CHECK: %[[VAL_15:.*]] = arith.andi %[[VAL_13]], %[[VAL_14]] : i1 +// CHECK: %[[VAL_16:.*]] = arith.ori %[[VAL_12]], %[[VAL_15]] : i1 +// CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_16]] : (i1) -> !fir.logical<4> +// CHECK: fir.result %[[VAL_17]] : !fir.logical<4> +// CHECK: } +// CHECK: return %[[VAL_7]] : !fir.logical<4> +// CHECK: } + +func.func @dot_product_known_dim(%arg0: !hlfir.expr<10xf32>, %arg1: !hlfir.expr<?xi16>) -> f32 { + %res1 = hlfir.dot_product %arg0 %arg1 : (!hlfir.expr<10xf32>, !hlfir.expr<?xi16>) -> f32 + %res2 = hlfir.dot_product %arg1 %arg0 : (!hlfir.expr<?xi16>, !hlfir.expr<10xf32>) -> f32 + %res = arith.addf %res1, %res2 : f32 + return %res : f32 +} +// CHECK-LABEL: func.func @dot_product_known_dim( +// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_4:.*]] = arith.constant 10 : index +// CHECK: fir.do_loop %{{.*}} = %[[VAL_2]] to %[[VAL_4]] step %[[VAL_2]] +// CHECK: fir.do_loop %{{.*}} = %[[VAL_2]] to %[[VAL_4]] step %[[VAL_2]] diff --git 
flang/test/HLFIR/unroll-loops.fir flang/test/HLFIR/unroll-loops.fir index 83b30d4d7269..d8f820263ffd 100644 --- flang/test/HLFIR/unroll-loops.fir +++ flang/test/HLFIR/unroll-loops.fir @@ -3,6 +3,9 @@ // RUN: %flang_fc1 -emit-llvm -O1 -fno-unroll-loops -mllvm -force-vector-width=2 -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL // RUN: %flang_fc1 -emit-llvm -O1 -mllvm -force-vector-width=2 -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL +// FIXME: https://github.com/llvm/llvm-project/issues/123668 +// XFAIL: powerpc64-target-arch + // CHECK-LABEL: @unroll // CHECK-SAME: (ptr nocapture writeonly %[[ARG0:.*]]) func.func @unroll(%arg0: !fir.ref<!fir.array<1000 x index>> {fir.bindc_name = "a"}) { diff --git flang/test/Integration/debug-cyclic-derived-type-3.f90 flang/test/Integration/debug-cyclic-derived-type-3.f90 new file mode 100644 index 000000000000..ef9aed13cc51 --- /dev/null +++ flang/test/Integration/debug-cyclic-derived-type-3.f90 @@ -0,0 +1,32 @@ +! RUN: %flang_fc1 -emit-llvm -debug-info-kind=standalone %s -o - + +! mainly test that this program does not cause an assertion failure +! testcase for issue 122024 + +module m1 + type t1 + type(t2),pointer :: x1 + end type + type t2 + type(t3),pointer :: x2 + end type + type t3 + type(t1),pointer :: x3 + end type +end + +program test + use m1 + type(t1),pointer :: foo + allocate(foo) + allocate(foo%x1) + allocate(foo%x1%x2) + allocate(foo%x1%x2%x3) + call sub1(foo%x1) + print *,'done' +end program + +subroutine sub1(bar) + use m1 + type(t2) :: bar +end subroutine diff --git flang/test/Integration/unroll-loops.f90 flang/test/Integration/unroll-loops.f90 index 939c96e15069..4a356c1ec5e9 100644 --- flang/test/Integration/unroll-loops.f90 +++ flang/test/Integration/unroll-loops.f90 @@ -3,6 +3,9 @@ ! RUN: %flang_fc1 -emit-llvm -O1 -fno-unroll-loops -mllvm -force-vector-width=2 -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL ! RUN: %flang_fc1 -emit-llvm -O1 -mllvm -force-vector-width=2 -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL +! FIXME: https://github.com/llvm/llvm-project/issues/123668 +! XFAIL: powerpc64-target-arch + ! CHECK-LABEL: @unroll ! CHECK-SAME: (ptr nocapture writeonly %[[ARG0:.*]]) subroutine unroll(a) diff --git flang/test/Lower/CUDA/cuda-devptr.cuf flang/test/Lower/CUDA/cuda-devptr.cuf index 561d92ecd3e2..d61d84d9bc75 100644 --- flang/test/Lower/CUDA/cuda-devptr.cuf +++ flang/test/Lower/CUDA/cuda-devptr.cuf @@ -1,4 +1,4 @@ -! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s +! RUN: bbc -emit-fir -hlfir -fcuda %s -o - | FileCheck %s ! Test CUDA Fortran specific type @@ -37,12 +37,34 @@ subroutine sub2() end ! CHECK-LABEL: func.func @_QPsub2() -! CHECK: %[[X:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFsub2Ex"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) +! CHECK: %[[X:.*]] = fir.declare %{{.*}} {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFsub2Ex"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> ! CHECK: %[[CPTR:.*]] = fir.field_index cptr, !fir.type<_QM__fortran_builtinsT__builtin_c_devptr{{[<]?}}{cptr:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}{{[>]?}}> -! 
CHECK: %[[CPTR_COORD:.*]] = fir.coordinate_of %{{.*}}#1, %[[CPTR]] : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_devptr{{[<]?}}{cptr:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}{{[>]?}}>>, !fir.field) -> !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>> +! CHECK: %[[CPTR_COORD:.*]] = fir.coordinate_of %{{.*}}, %[[CPTR]] : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_devptr{{[<]?}}{cptr:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}{{[>]?}}>>, !fir.field) -> !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>> ! CHECK: %[[ADDRESS:.*]] = fir.field_index __address, !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}> ! CHECK: %[[ADDRESS_COORD:.*]] = fir.coordinate_of %[[CPTR_COORD]], %[[ADDRESS]] : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.field) -> !fir.ref<i64> ! CHECK: %[[ADDRESS_LOADED:.*]] = fir.load %[[ADDRESS_COORD]] : !fir.ref<i64> ! CHECK: %[[ADDRESS_IDX:.*]] = fir.convert %[[ADDRESS_LOADED]] : (i64) -> !fir.ptr<!fir.array<?xf32>> ! CHECK: %[[EMBOX:.*]] = fir.embox %[[ADDRESS_IDX]](%{{.*}}) : (!fir.ptr<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.box<!fir.ptr<!fir.array<?xf32>>> -! CHECK: fir.store %[[EMBOX]] to %[[X]]#1 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> +! CHECK: fir.store %[[EMBOX]] to %[[X]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> + +attributes(global) subroutine assign_c_devptr(p, a) + use __fortran_builtins, only: c_devloc => __builtin_c_devloc + use __fortran_builtins, only: c_devptr => __builtin_c_devptr + type (c_devptr), device :: p + complex :: a(10) + p = c_devloc(a(1)) +end subroutine + +! CHECK-LABEL: func.func @_QPassign_c_devptr +! CHECK: %[[P:.*]] = fir.declare %arg0 dummy_scope %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFassign_c_devptrEp"} +! CHECK: %[[C_DEVLOC_RES:.*]] = fir.declare %15 {uniq_name = ".tmp.intrinsic_result"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_devptr{cptr:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>) -> !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_devptr{cptr:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>> +! CHECK: %[[CPTR_FIELD:.*]] = fir.field_index cptr, !fir.type<_QM__fortran_builtinsT__builtin_c_devptr{cptr:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}> +! CHECK: %[[RES_CPTR_COORD:.*]] = fir.coordinate_of %[[C_DEVLOC_RES]], %[[CPTR_FIELD]] : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_devptr{cptr:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>, !fir.field) -> !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>> +! CHECK: %[[CPTR_FIELD:.*]] = fir.field_index cptr, !fir.type<_QM__fortran_builtinsT__builtin_c_devptr{cptr:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}> +! CHECK: %[[P_CPTR_COORD:.*]] = fir.coordinate_of %[[P]], %[[CPTR_FIELD]] : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_devptr{cptr:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>, !fir.field) -> !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>> +! CHECK: %[[ADDRESS_FIELD:.*]] = fir.field_index __address, !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}> +! CHECK: %[[RES_ADDR_COORD:.*]] = fir.coordinate_of %[[RES_CPTR_COORD]], %[[ADDRESS_FIELD]] : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.field) -> !fir.ref<i64> +! 
CHECK: %[[ADDRESS_FIELD:.*]] = fir.field_index __address, !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}> +! CHECK: %[[P_ADDR_COORD:.*]] = fir.coordinate_of %[[P_CPTR_COORD]], %[[ADDRESS_FIELD]] : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.field) -> !fir.ref<i64> +! CHECK: %[[ADDR:.*]] = fir.load %[[RES_ADDR_COORD]] : !fir.ref<i64> +! CHECK: fir.store %[[ADDR]] to %[[P_ADDR_COORD]] : !fir.ref<i64> diff --git flang/test/Lower/OpenMP/Todo/allocate-clause-align.f90 flang/test/Lower/OpenMP/Todo/allocate-clause-align.f90 new file mode 100644 index 000000000000..b272d2e76d70 --- /dev/null +++ flang/test/Lower/OpenMP/Todo/allocate-clause-align.f90 @@ -0,0 +1,13 @@ +! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -fopenmp-version=51 -o - %s 2>&1 | FileCheck %s + +! CHECK: not yet implemented: OmpAllocateClause ALIGN modifier +program p + integer :: x + integer :: a + integer :: i + !$omp parallel private(x) allocate(align(4): x) + do i=1,10 + a = a + i + end do + !$omp end parallel +end program p diff --git flang/test/Lower/OpenMP/Todo/allocate-clause-allocator.f90 flang/test/Lower/OpenMP/Todo/allocate-clause-allocator.f90 new file mode 100644 index 000000000000..90158a61f750 --- /dev/null +++ flang/test/Lower/OpenMP/Todo/allocate-clause-allocator.f90 @@ -0,0 +1,17 @@ +! REQUIRES: openmp_runtime +! RUN: %not_todo_cmd %flang_fc1 -emit-llvm -fopenmp -fopenmp-version=51 -o - %s 2>&1 | FileCheck %s + +! CHECK: not yet implemented: Unhandled clause allocate in omp.parallel +! CHECK: LLVM Translation failed for operation: omp.parallel +program p + !use omp_lib + integer(8),parameter::omp_default_mem_alloc=1_8 + integer :: x + integer :: a + integer :: i + !$omp parallel private(x) allocate(allocator(omp_default_mem_alloc): x) + do i=1,10 + a = a + i + end do + !$omp end parallel +end program p diff --git flang/test/Lower/OpenMP/Todo/task_untied.f90 flang/test/Lower/OpenMP/Todo/task_untied.f90 deleted file mode 100644 index 87d242ba3e9d..000000000000 --- flang/test/Lower/OpenMP/Todo/task_untied.f90 +++ /dev/null @@ -1,13 +0,0 @@ -! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s -! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s - -!=============================================================================== -! `untied` clause -!=============================================================================== - -! CHECK: not yet implemented: UNTIED clause is not implemented yet -subroutine omp_task_untied() - !$omp task untied - call foo() - !$omp end task -end subroutine omp_task_untied diff --git flang/test/Lower/OpenMP/loop-directive.f90 flang/test/Lower/OpenMP/loop-directive.f90 index 4b4d640e449e..9fa0de3bfe17 100644 --- flang/test/Lower/OpenMP/loop-directive.f90 +++ flang/test/Lower/OpenMP/loop-directive.f90 @@ -11,7 +11,7 @@ subroutine test_no_clauses() integer :: i, j, dummy = 1 - ! CHECK: omp.loop private(@[[I_PRIV]] %{{.*}}#0 -> %[[ARG:.*]] : !fir.ref<i32>) { + ! CHECK: omp.simd private(@[[I_PRIV]] %{{.*}}#0 -> %[[ARG:.*]] : !fir.ref<i32>) { ! CHECK-NEXT: omp.loop_nest (%[[IV:.*]]) : i32 = (%{{.*}}) to (%{{.*}}) {{.*}} { ! CHECK: %[[ARG_DECL:.*]]:2 = hlfir.declare %[[ARG]] ! CHECK: fir.store %[[IV]] to %[[ARG_DECL]]#1 : !fir.ref<i32> @@ -27,7 +27,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_collapse subroutine test_collapse() integer :: i, j, dummy = 1 - ! CHECK: omp.loop private(@{{.*}} %{{.*}}#0 -> %{{.*}}, @{{.*}} %{{.*}}#0 -> %{{.*}} : {{.*}}) { + ! 
CHECK: omp.simd private(@{{.*}} %{{.*}}#0 -> %{{.*}}, @{{.*}} %{{.*}}#0 -> %{{.*}} : {{.*}}) { ! CHECK-NEXT: omp.loop_nest (%{{.*}}, %{{.*}}) : i32 {{.*}} { ! CHECK: } ! CHECK: } @@ -43,7 +43,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_private subroutine test_private() integer :: i, dummy = 1 - ! CHECK: omp.loop private(@[[DUMMY_PRIV]] %{{.*}}#0 -> %[[DUMMY_ARG:.*]], @{{.*}} %{{.*}}#0 -> %{{.*}} : {{.*}}) { + ! CHECK: omp.simd private(@[[DUMMY_PRIV]] %{{.*}}#0 -> %[[DUMMY_ARG:.*]], @{{.*}} %{{.*}}#0 -> %{{.*}} : {{.*}}) { ! CHECK-NEXT: omp.loop_nest (%{{.*}}) : i32 = (%{{.*}}) to (%{{.*}}) {{.*}} { ! CHECK: %[[DUMMY_DECL:.*]]:2 = hlfir.declare %[[DUMMY_ARG]] {uniq_name = "_QFtest_privateEdummy"} ! CHECK: %{{.*}} = fir.load %[[DUMMY_DECL]]#0 @@ -100,3 +100,42 @@ subroutine test_bind() end do !$omp end loop end subroutine + +! CHECK-LABEL: func.func @_QPtest_nested_directives +subroutine test_nested_directives + implicit none + integer, parameter :: N = 100000 + integer a(N), b(N), c(N) + integer j,i, num, flag; + num = N + + ! CHECK: omp.teams { + + ! Verify the first `loop` directive was combined with `target teams` into + ! `target teams distribute parallel do`. + ! CHECK: omp.parallel {{.*}} { + ! CHECK: omp.distribute { + ! CHECK: omp.wsloop { + ! CHECK: omp.loop_nest {{.*}} { + + ! Very the second `loop` directive was rewritten to `simd`. + ! CHECK: omp.simd {{.*}} { + ! CHECK: omp.loop_nest {{.*}} { + ! CHECK: } + ! CHECK: } + + ! CHECK: } + ! CHECK: } {omp.composite} + ! CHECK: } {omp.composite} + ! CHECK: } {omp.composite} + ! CHECK: } + !$omp target teams map(to: a,b) map(from: c) + !$omp loop + do j=1,1000 + !$omp loop + do i=1,N + c(i) = a(i) * b(i) + end do + end do + !$omp end target teams +end subroutine diff --git flang/test/Lower/OpenMP/task.f90 flang/test/Lower/OpenMP/task.f90 index f5591bd9d860..13ebf2acd910 100644 --- flang/test/Lower/OpenMP/task.f90 +++ flang/test/Lower/OpenMP/task.f90 @@ -247,6 +247,10 @@ subroutine task_multiple_clauses() !$omp end task end subroutine task_multiple_clauses +!=============================================================================== +! `mergeable` clause +!=============================================================================== + subroutine task_mergeable() !CHECK: omp.task mergeable { !CHECK: omp.terminator @@ -254,3 +258,16 @@ subroutine task_mergeable() !$omp task mergeable !$omp end task end subroutine + +!=============================================================================== +! `untied` clause +!=============================================================================== + +!CHECK-LABEL: func.func @_QPomp_task_untied() { +subroutine omp_task_untied() + !CHECK: omp.task untied { + !$omp task untied + call foo() + !CHECK: omp.terminator + !$omp end task +end subroutine omp_task_untied diff --git flang/test/Lower/module_use.f90 flang/test/Lower/module_use.f90 index b976663239ef..f7c610c9ad22 100644 --- flang/test/Lower/module_use.f90 +++ flang/test/Lower/module_use.f90 @@ -1,6 +1,6 @@ -! RUN: rm -fr %t && mkdir -p %t -! RUN: bbc -emit-fir -module %t %S/module_definition.f90 -! RUN: bbc -emit-fir -J %t %s -o - | FileCheck %s +! RUN: rm -fr %t && mkdir -p %t && cd %t +! RUN: bbc -emit-fir %S/module_definition.f90 +! RUN: bbc -emit-fir %s -o - | FileCheck %s ! Test use of module data not defined in this file. ! 
The modules are defined in module_definition.f90 diff --git flang/test/Lower/zero_init.f90 flang/test/Lower/zero_init.f90 new file mode 100644 index 000000000000..16b11158bfce --- /dev/null +++ flang/test/Lower/zero_init.f90 @@ -0,0 +1,20 @@ +! RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck --check-prefix=CHECK-DEFAULT %s +! RUN: %flang_fc1 -finit-global-zero -emit-hlfir -o - %s | FileCheck --check-prefix=CHECK-DEFAULT %s +! RUN: %flang_fc1 -fno-init-global-zero -emit-hlfir -o - %s | FileCheck --check-prefix=CHECK-NO-ZERO-INIT %s +! RUN: bbc -emit-hlfir -o - %s | FileCheck --check-prefix=CHECK-DEFAULT %s +! RUN: bbc -finit-global-zero -emit-hlfir -o - %s | FileCheck --check-prefix=CHECK-DEFAULT %s +! RUN: bbc -finit-global-zero=false -emit-hlfir -o - %s | FileCheck --check-prefix=CHECK-NO-ZERO-INIT %s + +module zeroInitM1 + real :: x +end module zeroInitM1 + +!CHECK-DEFAULT: fir.global @_QMzeroinitm1Ex : f32 { +!CHECK-DEFAULT: %[[UNDEF:.*]] = fir.zero_bits f32 +!CHECK-DEFAULT: fir.has_value %[[UNDEF]] : f32 +!CHECK-DEFAULT: } + +!CHECK-NO-ZERO-INIT: fir.global @_QMzeroinitm1Ex : f32 { +!CHECK-NO-ZERO-INIT: %[[UNDEF:.*]] = fir.undefined f32 +!CHECK-NO-ZERO-INIT: fir.has_value %[[UNDEF]] : f32 +!CHECK-NO-ZERO-INIT: } diff --git flang/test/Lower/zero_init_default_init.f90 flang/test/Lower/zero_init_default_init.f90 new file mode 100644 index 000000000000..8ca3b33b8ef5 --- /dev/null +++ flang/test/Lower/zero_init_default_init.f90 @@ -0,0 +1,22 @@ +! RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck %s +! RUN: %flang_fc1 -finit-global-zero -emit-hlfir -o - %s | FileCheck %s +! RUN: %flang_fc1 -fno-init-global-zero -emit-hlfir -o - %s | FileCheck %s +! RUN: bbc -emit-hlfir -o - %s | FileCheck %s +! RUN: bbc -finit-global-zero -emit-hlfir -o - %s | FileCheck %s +! RUN: bbc -finit-global-zero=false -emit-hlfir -o - %s | FileCheck %s + +! Test that the flag does not affect globals with default init + +module zeroInitM2 + type val + integer :: my_val = 1 + end type val + type(val) :: v1 +end module zeroInitM2 + +!CHECK: fir.global @_QMzeroinitm2Ev1 : !fir.type<_QMzeroinitm2Tval{my_val:i32}> { +!CHECK: %[[V1:.*]] = fir.undefined !fir.type<_QMzeroinitm2Tval{my_val:i32}> +!CHECK: %[[ONE:.*]] = arith.constant 1 : i32 +!CHECK: %[[V1_INIT:.*]] = fir.insert_value %[[V1]], %[[ONE]], ["my_val", !fir.type<_QMzeroinitm2Tval{my_val:i32}>] : (!fir.type<_QMzeroinitm2Tval{my_val:i32}>, i32) -> !fir.type<_QMzeroinitm2Tval{my_val:i32}> +!CHECK: fir.has_value %[[V1_INIT]] : !fir.type<_QMzeroinitm2Tval{my_val:i32}> +!CHECK: } diff --git flang/test/Semantics/OpenMP/declare-target08.f90 flang/test/Semantics/OpenMP/declare-target08.f90 new file mode 100644 index 000000000000..1438d79d3734 --- /dev/null +++ flang/test/Semantics/OpenMP/declare-target08.f90 @@ -0,0 +1,41 @@ +! 
RUN: %flang_fc1 -fopenmp -fdebug-dump-symbols %s | FileCheck %s + +subroutine bar(i, a) + !$omp declare target + real :: a + integer :: i + a = a - i +end subroutine + +function baz(a) + !$omp declare target + real, intent(in) :: a + baz = a +end function baz + +program main +real a +!CHECK: bar (Subroutine, OmpDeclareTarget): HostAssoc +!CHECK: baz (Function, OmpDeclareTarget): HostAssoc +!$omp declare target(bar) +!$omp declare target(baz) + +a = baz(a) +call bar(2,a) +call foo(a) +return +end + +subroutine foo(a) +real a +integer i +!CHECK: bar (Subroutine, OmpDeclareTarget): HostAssoc +!CHECK: baz (Function, OmpDeclareTarget): HostAssoc +!$omp declare target(bar) +!$omp declare target(baz) +!$omp target + a = baz(a) + call bar(i,a) +!$omp end target +return +end diff --git flang/test/Semantics/OpenMP/doconcurrent01.f90 flang/test/Semantics/OpenMP/doconcurrent01.f90 index 7e3bdce871dd..e46fe0ba3127 100644 --- flang/test/Semantics/OpenMP/doconcurrent01.f90 +++ flang/test/Semantics/OpenMP/doconcurrent01.f90 @@ -1,7 +1,11 @@ ! RUN: %python %S/../test_symbols.py %s %flang_fc1 -fopenmp -! OpenMP 5.1.1 -! DO Concurrent indices are private +! OpenMP 5.2 5.1.1 Variables Referenced in a Construct +! DO CONCURRENT indices have predetermined private DSA. +! +! As DO CONCURRENT indices are defined in the construct itself, and OpenMP +! directives may not appear in it, they are already private. +! Check that index symbols are not modified. !DEF: /private_iv (Subroutine)Subprogram subroutine private_iv @@ -9,7 +13,7 @@ subroutine private_iv integer i !$omp parallel default(private) !$omp single - !DEF: /private_iv/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /private_iv/OtherConstruct1/OtherConstruct1/Forall1/i ObjectEntity INTEGER(4) do concurrent(i=1:2) end do !$omp end single diff --git flang/test/Semantics/OpenMP/forall.f90 flang/test/Semantics/OpenMP/forall.f90 new file mode 100644 index 000000000000..58492664a4e8 --- /dev/null +++ flang/test/Semantics/OpenMP/forall.f90 @@ -0,0 +1,32 @@ +! RUN: %python %S/../test_symbols.py %s %flang_fc1 -fopenmp + +! OpenMP 5.2 5.1.1 Variables Referenced in a Construct +! FORALL indices have predetermined private DSA. +! +! As FORALL indices are defined in the construct itself, and OpenMP +! directives may not appear in it, they are already private. +! Check that index symbols are not modified. + + !DEF: /MainProgram1/a ObjectEntity INTEGER(4) + !DEF: /MainProgram1/b ObjectEntity INTEGER(4) + integer a(5), b(5) + + !REF: /MainProgram1/a + a = 0 + !REF: /MainProgram1/b + b = 0 + + !$omp parallel + !DEF: /MainProgram1/OtherConstruct1/Forall1/i (Implicit) ObjectEntity INTEGER(4) + !DEF: /MainProgram1/OtherConstruct1/a HostAssoc INTEGER(4) + !DEF: /MainProgram1/OtherConstruct1/b HostAssoc INTEGER(4) + forall(i = 1:5) a(i) = b(i) * 2 + !$omp end parallel + + !$omp parallel default(private) + !DEF: /MainProgram1/OtherConstruct2/Forall1/i (Implicit) ObjectEntity INTEGER(4) + !DEF: /MainProgram1/OtherConstruct2/a (OmpPrivate) HostAssoc INTEGER(4) + !DEF: /MainProgram1/OtherConstruct2/b (OmpPrivate) HostAssoc INTEGER(4) + forall(i = 1:5) a(i) = b(i) * 2 + !$omp end parallel +end program diff --git flang/test/Semantics/OpenMP/task-untied01.f90 flang/test/Semantics/OpenMP/task-untied01.f90 new file mode 100644 index 000000000000..3eb14b8ca4be --- /dev/null +++ flang/test/Semantics/OpenMP/task-untied01.f90 @@ -0,0 +1,28 @@ +! RUN: %python %S/../test_errors.py %s %flang -fopenmp +! +! 
OpenMP 5.2: 5.2 threadprivate directive restriction + +subroutine task_untied01() + integer, save :: var_01, var_02(2) + real :: var_03 + common /c/ var_03 + + !$omp threadprivate(var_01, var_02) + !$omp threadprivate(/c/) + + !$omp task untied + !ERROR: A THREADPRIVATE variable `var_01` cannot appear in an UNTIED TASK region + var_01 = 10 + !ERROR: A THREADPRIVATE variable `var_02` cannot appear in an UNTIED TASK region + !ERROR: A THREADPRIVATE variable `var_01` cannot appear in an UNTIED TASK region + var_02(1) = sum([var_01, 20]) + !$omp end task + + !$omp task untied + !ERROR: A THREADPRIVATE variable `var_02` cannot appear in an UNTIED TASK region + !ERROR: A THREADPRIVATE variable `var_02` cannot appear in an UNTIED TASK region + var_02(2) = product(var_02) + !ERROR: A THREADPRIVATE variable `var_03` cannot appear in an UNTIED TASK region + var_03 = 3.14 + !$omp end task +end subroutine task_untied01 diff --git flang/test/Transforms/generic-loop-rewriting-todo.mlir flang/test/Transforms/generic-loop-rewriting-todo.mlir index 9ea6bf001b66..becd6b8dcb5c 100644 --- flang/test/Transforms/generic-loop-rewriting-todo.mlir +++ flang/test/Transforms/generic-loop-rewriting-todo.mlir @@ -1,18 +1,5 @@ // RUN: fir-opt --omp-generic-loop-conversion -verify-diagnostics %s -func.func @_QPtarget_loop() { - %c0 = arith.constant 0 : i32 - %c10 = arith.constant 10 : i32 - %c1 = arith.constant 1 : i32 - // expected-error@below {{not yet implemented: Standalone `omp loop` directive}} - omp.loop { - omp.loop_nest (%arg3) : i32 = (%c0) to (%c10) inclusive step (%c1) { - omp.yield - } - } - return -} - func.func @_QPtarget_parallel_loop() { omp.target { omp.parallel { diff --git flang/tools/bbc/bbc.cpp flang/tools/bbc/bbc.cpp index 7efc460be867..dafbcd856389 100644 --- flang/tools/bbc/bbc.cpp +++ flang/tools/bbc/bbc.cpp @@ -234,6 +234,11 @@ static llvm::cl::opt<bool> integerWrapAround( llvm::cl::desc("Treat signed integer overflow as two's complement"), llvm::cl::init(false)); +static llvm::cl::opt<bool> initGlobalZero( + "finit-global-zero", + llvm::cl::desc("Zero initialize globals without default initialization"), + llvm::cl::init(true)); + static llvm::cl::opt<bool> reallocateLHS("frealloc-lhs", llvm::cl::desc("Follow Fortran 2003 rules for (re)allocating " @@ -381,6 +386,7 @@ static llvm::LogicalResult convertFortranSourceToMLIR( loweringOptions.setNoPPCNativeVecElemOrder(enableNoPPCNativeVecElemOrder); loweringOptions.setLowerToHighLevelFIR(useHLFIR || emitHLFIR); loweringOptions.setIntegerWrapAround(integerWrapAround); + loweringOptions.setInitGlobalZero(initGlobalZero); loweringOptions.setReallocateLHS(reallocateLHS); std::vector<Fortran::lower::EnvironmentDefault> envDefaults = {}; Fortran::frontend::TargetOptions targetOpts; diff --git flang/unittests/Optimizer/CMakeLists.txt flang/unittests/Optimizer/CMakeLists.txt index c58fb226a175..f535677c19fd 100644 --- flang/unittests/Optimizer/CMakeLists.txt +++ flang/unittests/Optimizer/CMakeLists.txt @@ -9,8 +9,6 @@ set(LIBS FIRDialectSupport FIRSupport HLFIRDialect - ${dialect_libs} - ${extension_libs} LLVMTargetParser ) @@ -47,3 +45,8 @@ DEPENDS target_link_libraries(FlangOptimizerTests PRIVATE ${LIBS}) +mlir_target_link_libraries(FlangOptimizerTests + PRIVATE + ${dialect_libs} + ${extension_libs} + ) diff --git libc/src/__support/block.h libc/src/__support/block.h index 50a745326eac..a58c38bbb7ac 100644 --- libc/src/__support/block.h +++ libc/src/__support/block.h @@ -227,15 +227,17 @@ public: *new (&next()->prev_) size_t = outer_size(); } - /// 
Marks this block as the last one in the chain. Makes next() return - /// nullptr. - LIBC_INLINE void mark_last() { next_ |= LAST_MASK; } - - LIBC_INLINE Block(size_t outer_size) : next_(outer_size) { - LIBC_ASSERT(outer_size % alignof(max_align_t) == 0 && - "block sizes must be aligned"); + LIBC_INLINE Block(size_t outer_size, bool is_last) : next_(outer_size) { + // Last blocks are not usable, so they need not have sizes aligned to + // max_align_t. Their lower bits must still be free, so they must be aligned + // to Block. + LIBC_ASSERT( + outer_size % (is_last ? alignof(Block) : alignof(max_align_t)) == 0 && + "block sizes must be aligned"); LIBC_ASSERT(is_usable_space_aligned(alignof(max_align_t)) && "usable space must be aligned to a multiple of max_align_t"); + if (is_last) + next_ |= LAST_MASK; } LIBC_INLINE bool is_usable_space_aligned(size_t alignment) const { @@ -325,7 +327,13 @@ private: LIBC_ASSERT(reinterpret_cast<uintptr_t>(bytes.data()) % alignof(Block) == 0 && "block start must be suitably aligned"); - return ::new (bytes.data()) Block(bytes.size()); + return ::new (bytes.data()) Block(bytes.size(), /*is_last=*/false); + } + + LIBC_INLINE static void make_last_block(cpp::byte *start) { + LIBC_ASSERT(reinterpret_cast<uintptr_t>(start) % alignof(Block) == 0 && + "block start must be suitably aligned"); + ::new (start) Block(sizeof(Block), /*is_last=*/true); } /// Offset from this block to the previous block. 0 if this is the first @@ -353,7 +361,7 @@ public: static constexpr size_t PREV_FIELD_SIZE = sizeof(prev_); }; -static_assert(alignof(max_align_t) >= 4, +static_assert(alignof(Block) >= 4, "at least 2 bits must be available in block sizes for flags"); LIBC_INLINE @@ -380,9 +388,8 @@ optional<Block *> Block::init(ByteSpan region) { auto *last_start_ptr = reinterpret_cast<cpp::byte *>(last_start); Block *block = as_block({reinterpret_cast<cpp::byte *>(block_start), last_start_ptr}); - Block *last = as_block({last_start_ptr, sizeof(Block)}); + make_last_block(last_start_ptr); block->mark_free(); - last->mark_last(); return block; } diff --git libc/test/src/__support/block_test.cpp libc/test/src/__support/block_test.cpp index c2d9833fb943..904ac5c66994 100644 --- libc/test/src/__support/block_test.cpp +++ libc/test/src/__support/block_test.cpp @@ -75,8 +75,11 @@ TEST(LlvmLibcBlockTest, CannotCreateTooSmallBlock) { TEST(LlvmLibcBlockTest, CanSplitBlock) { constexpr size_t kN = 1024; - // Give the split position a large alignment. - constexpr size_t kSplitN = 512 + Block::PREV_FIELD_SIZE; + + // Choose a split position such that the next block's usable space is 512 + // bytes from this one's. This should be sufficient for any machine's + // alignment. + const size_t kSplitN = Block::inner_size(512); array<byte, kN> bytes; auto result = Block::init(bytes); diff --git libc/test/src/__support/freestore_test.cpp libc/test/src/__support/freestore_test.cpp index a32badb39b1e..39292b6a1211 100644 --- libc/test/src/__support/freestore_test.cpp +++ libc/test/src/__support/freestore_test.cpp @@ -26,6 +26,10 @@ TEST(LlvmLibcFreeStore, TooSmall) { Block *too_small = *maybeBlock; maybeBlock = too_small->split(Block::PREV_FIELD_SIZE); ASSERT_TRUE(maybeBlock.has_value()); + // On platforms with high alignment the smallest legal block may be large + // enough for a node. 
+ if (too_small->outer_size() >= sizeof(Block) + sizeof(FreeList::Node)) + return; Block *remainder = *maybeBlock; FreeStore store; diff --git libclc/clc/include/clc/clcmacro.h libclc/clc/include/clc/clcmacro.h index 3c3a69f4f848..44928b2e428b 100644 --- libclc/clc/include/clc/clcmacro.h +++ libclc/clc/include/clc/clcmacro.h @@ -184,6 +184,33 @@ return BUILTIN(x); \ } +#define _CLC_DEFINE_TERNARY_BUILTIN(RET_TYPE, FUNCTION, BUILTIN, ARG1_TYPE, \ + ARG2_TYPE, ARG3_TYPE) \ + _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x, ARG2_TYPE y, \ + ARG3_TYPE z) { \ + return BUILTIN(x, y, z); \ + } \ + _CLC_DEF _CLC_OVERLOAD RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE##2 y, \ + ARG3_TYPE##2 z) { \ + return BUILTIN(x, y, z); \ + } \ + _CLC_DEF _CLC_OVERLOAD RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x, ARG2_TYPE##3 y, \ + ARG3_TYPE##3 z) { \ + return BUILTIN(x, y, z); \ + } \ + _CLC_DEF _CLC_OVERLOAD RET_TYPE##4 FUNCTION(ARG1_TYPE##4 x, ARG2_TYPE##4 y, \ + ARG3_TYPE##4 z) { \ + return BUILTIN(x, y, z); \ + } \ + _CLC_DEF _CLC_OVERLOAD RET_TYPE##8 FUNCTION(ARG1_TYPE##8 x, ARG2_TYPE##8 y, \ + ARG3_TYPE##8 z) { \ + return BUILTIN(x, y, z); \ + } \ + _CLC_DEF _CLC_OVERLOAD RET_TYPE##16 FUNCTION( \ + ARG1_TYPE##16 x, ARG2_TYPE##16 y, ARG3_TYPE##16 z) { \ + return BUILTIN(x, y, z); \ + } + #ifdef cl_khr_fp16 #pragma OPENCL EXTENSION cl_khr_fp16 : enable diff --git libclc/clc/include/clc/common/clc_degrees.h libclc/clc/include/clc/common/clc_degrees.h index e8bb684fcd4d..617030aed443 100644 --- libclc/clc/include/clc/common/clc_degrees.h +++ libclc/clc/include/clc/common/clc_degrees.h @@ -1,5 +1,5 @@ -#ifndef __CLC_MATH_CLC_DEGREES_H__ -#define __CLC_MATH_CLC_DEGREES_H__ +#ifndef __CLC_COMMON_CLC_DEGREES_H__ +#define __CLC_COMMON_CLC_DEGREES_H__ #define __CLC_BODY <clc/math/unary_decl.inc> #define __CLC_FUNCTION __clc_degrees @@ -9,4 +9,4 @@ #undef __CLC_BODY #undef __CLC_FUNCTION -#endif // __CLC_MATH_CLC_DEGREES_H__ +#endif // __CLC_COMMON_CLC_DEGREES_H__ diff --git libclc/clc/include/clc/common/clc_radians.h libclc/clc/include/clc/common/clc_radians.h index 80d481e8de72..018a675cdc89 100644 --- libclc/clc/include/clc/common/clc_radians.h +++ libclc/clc/include/clc/common/clc_radians.h @@ -1,5 +1,5 @@ -#ifndef __CLC_MATH_CLC_RADIANS_H__ -#define __CLC_MATH_CLC_RADIANS_H__ +#ifndef __CLC_COMMON_CLC_RADIANS_H__ +#define __CLC_COMMON_CLC_RADIANS_H__ #define __CLC_BODY <clc/math/unary_decl.inc> #define __CLC_FUNCTION __clc_radians @@ -9,4 +9,4 @@ #undef __CLC_BODY #undef __CLC_FUNCTION -#endif // __CLC_MATH_CLC_RADIANS_H__ +#endif // __CLC_COMMON_CLC_RADIANS_H__ diff --git libclc/clc/include/clc/math/clc_mad.h libclc/clc/include/clc/math/clc_mad.h new file mode 100644 index 000000000000..3eb718e87f37 --- /dev/null +++ libclc/clc/include/clc/math/clc_mad.h @@ -0,0 +1,12 @@ +#ifndef __CLC_MATH_CLC_MAD_H__ +#define __CLC_MATH_CLC_MAD_H__ + +#define __CLC_BODY <clc/math/ternary_decl.inc> +#define __CLC_FUNCTION __clc_mad + +#include <clc/math/gentype.inc> + +#undef __CLC_BODY +#undef __CLC_FUNCTION + +#endif // __CLC_MATH_CLC_MAD_H__ diff --git libclc/clc/include/clc/math/ternary_decl.inc libclc/clc/include/clc/math/ternary_decl.inc new file mode 100644 index 000000000000..6c1507b3fcf7 --- /dev/null +++ libclc/clc/include/clc/math/ternary_decl.inc @@ -0,0 +1,3 @@ +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE a, + __CLC_GENTYPE b, + __CLC_GENTYPE c); diff --git libclc/clc/include/clc/relational/clc_select.h libclc/clc/include/clc/relational/clc_select.h index ddea7c528ec7..a92f2051b577 100644 
--- libclc/clc/include/clc/relational/clc_select.h +++ libclc/clc/include/clc/relational/clc_select.h @@ -1,23 +1,19 @@ #ifndef __CLC_RELATIONAL_CLC_SELECT_H__ #define __CLC_RELATIONAL_CLC_SELECT_H__ -#if defined(CLC_CLSPV) || defined(CLC_SPIRV) -// clspv and spir-v targets provide their own OpenCL-compatible select -#define __clc_select select -#else - /* Duplciate these so we don't have to distribute utils.h */ #define __CLC_CONCAT(x, y) x##y #define __CLC_XCONCAT(x, y) __CLC_CONCAT(x, y) -#define __CLC_BODY <clc/relational/clc_select.inc> +#define __CLC_SELECT_FN __clc_select + +#define __CLC_BODY <clc/relational/clc_select_decl.inc> #include <clc/math/gentype.inc> -#define __CLC_BODY <clc/relational/clc_select.inc> +#define __CLC_BODY <clc/relational/clc_select_decl.inc> #include <clc/integer/gentype.inc> +#undef __CLC_SELECT_FN #undef __CLC_CONCAT #undef __CLC_XCONCAT -#endif - #endif // __CLC_RELATIONAL_CLC_SELECT_H__ diff --git libclc/clc/include/clc/relational/clc_select.inc libclc/clc/include/clc/relational/clc_select.inc deleted file mode 100644 index abf0e0fa4360..000000000000 --- libclc/clc/include/clc/relational/clc_select.inc +++ /dev/null @@ -1,29 +0,0 @@ -#ifdef __CLC_SCALAR -#define __CLC_VECSIZE -#endif - -#if __CLC_FPSIZE == 64 -#define __CLC_S_GENTYPE __CLC_XCONCAT(long, __CLC_VECSIZE) -#define __CLC_U_GENTYPE __CLC_XCONCAT(ulong, __CLC_VECSIZE) -#elif __CLC_FPSIZE == 32 -#define __CLC_S_GENTYPE __CLC_XCONCAT(int, __CLC_VECSIZE) -#define __CLC_U_GENTYPE __CLC_XCONCAT(uint, __CLC_VECSIZE) -#elif __CLC_FPSIZE == 16 -#define __CLC_S_GENTYPE __CLC_XCONCAT(short, __CLC_VECSIZE) -#define __CLC_U_GENTYPE __CLC_XCONCAT(ushort, __CLC_VECSIZE) -#endif - -_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_select(__CLC_GENTYPE x, - __CLC_GENTYPE y, - __CLC_S_GENTYPE z); -_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_select(__CLC_GENTYPE x, - __CLC_GENTYPE y, - __CLC_U_GENTYPE z); - -#ifdef __CLC_FPSIZE -#undef __CLC_S_GENTYPE -#undef __CLC_U_GENTYPE -#endif -#ifdef __CLC_SCALAR -#undef __CLC_VECSIZE -#endif diff --git libclc/generic/include/clc/relational/select.inc libclc/clc/include/clc/relational/clc_select_decl.inc similarity index 58% rename from libclc/generic/include/clc/relational/select.inc rename to libclc/clc/include/clc/relational/clc_select_decl.inc index 11a957a56e4b..3a4f2dcb7517 100644 --- libclc/generic/include/clc/relational/select.inc +++ libclc/clc/include/clc/relational/clc_select_decl.inc @@ -13,8 +13,12 @@ #define __CLC_U_GENTYPE __CLC_XCONCAT(ushort, __CLC_VECSIZE) #endif -_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE select(__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_S_GENTYPE z); -_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE select(__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_U_GENTYPE z); +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_SELECT_FN(__CLC_GENTYPE x, + __CLC_GENTYPE y, + __CLC_S_GENTYPE z); +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_SELECT_FN(__CLC_GENTYPE x, + __CLC_GENTYPE y, + __CLC_U_GENTYPE z); #ifdef __CLC_FPSIZE #undef __CLC_S_GENTYPE diff --git libclc/generic/lib/relational/select.inc libclc/clc/include/clc/relational/clc_select_impl.inc similarity index 55% rename from libclc/generic/lib/relational/select.inc rename to libclc/clc/include/clc/relational/clc_select_impl.inc index 7691af266918..ad53e822179f 100644 --- libclc/generic/lib/relational/select.inc +++ libclc/clc/include/clc/relational/clc_select_impl.inc @@ -13,14 +13,16 @@ #define __CLC_U_GENTYPE __CLC_XCONCAT(ushort, __CLC_VECSIZE) #endif -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE select(__CLC_GENTYPE x, 
__CLC_GENTYPE y, __CLC_S_GENTYPE z) -{ - return z ? y : x; +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_SELECT_FN(__CLC_GENTYPE x, + __CLC_GENTYPE y, + __CLC_S_GENTYPE z) { + __CLC_SELECT_DEF(x, y, z); } -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE select(__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_U_GENTYPE z) -{ - return z ? y : x; +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_SELECT_FN(__CLC_GENTYPE x, + __CLC_GENTYPE y, + __CLC_U_GENTYPE z) { + __CLC_SELECT_DEF(x, y, z); } #ifdef __CLC_FPSIZE diff --git libclc/clc/lib/clspv/SOURCES libclc/clc/lib/clspv/SOURCES index e6573f586080..c3fc03c0b3dd 100644 --- libclc/clc/lib/clspv/SOURCES +++ libclc/clc/lib/clspv/SOURCES @@ -1,6 +1,8 @@ ../generic/math/clc_ceil.cl ../generic/math/clc_fabs.cl ../generic/math/clc_floor.cl +../generic/math/clc_mad.cl ../generic/math/clc_rint.cl ../generic/math/clc_trunc.cl +../generic/relational/clc_select.cl ../generic/shared/clc_clamp.cl diff --git libclc/clc/lib/generic/SOURCES libclc/clc/lib/generic/SOURCES index d74bff20ba87..877a0a390a74 100644 --- libclc/clc/lib/generic/SOURCES +++ libclc/clc/lib/generic/SOURCES @@ -7,6 +7,7 @@ integer/clc_abs_diff.cl math/clc_ceil.cl math/clc_fabs.cl math/clc_floor.cl +math/clc_mad.cl math/clc_rint.cl math/clc_trunc.cl relational/clc_all.cl diff --git libclc/clc/lib/generic/math/clc_mad.cl libclc/clc/lib/generic/math/clc_mad.cl new file mode 100644 index 000000000000..58618cf24771 --- /dev/null +++ libclc/clc/lib/generic/math/clc_mad.cl @@ -0,0 +1,4 @@ +#include <clc/internal/clc.h> + +#define __CLC_BODY <clc_mad.inc> +#include <clc/math/gentype.inc> diff --git libclc/clc/lib/generic/math/clc_mad.inc libclc/clc/lib/generic/math/clc_mad.inc new file mode 100644 index 000000000000..a4dbdf1bc83a --- /dev/null +++ libclc/clc/lib/generic/math/clc_mad.inc @@ -0,0 +1,5 @@ +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_mad(__CLC_GENTYPE a, __CLC_GENTYPE b, + __CLC_GENTYPE c) { +#pragma OPENCL FP_CONTRACT ON + return a * b + c; +} diff --git libclc/clc/lib/generic/relational/clc_bitselect.cl libclc/clc/lib/generic/relational/clc_bitselect.cl index 66b28af71b38..6281eeea1abb 100644 --- libclc/clc/lib/generic/relational/clc_bitselect.cl +++ libclc/clc/lib/generic/relational/clc_bitselect.cl @@ -53,3 +53,15 @@ FLOAT_BITSELECT(double, ulong, 8) FLOAT_BITSELECT(double, ulong, 16) #endif + +#ifdef cl_khr_fp16 +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +FLOAT_BITSELECT(half, ushort, ) +FLOAT_BITSELECT(half, ushort, 2) +FLOAT_BITSELECT(half, ushort, 3) +FLOAT_BITSELECT(half, ushort, 4) +FLOAT_BITSELECT(half, ushort, 8) +FLOAT_BITSELECT(half, ushort, 16) + +#endif diff --git libclc/clc/lib/generic/relational/clc_select.cl libclc/clc/lib/generic/relational/clc_select.cl index bb016ed2993e..210db7867eef 100644 --- libclc/clc/lib/generic/relational/clc_select.cl +++ libclc/clc/lib/generic/relational/clc_select.cl @@ -1,7 +1,10 @@ #include <clc/internal/clc.h> #include <clc/utils.h> -#define __CLC_BODY <clc_select.inc> +#define __CLC_SELECT_FN __clc_select +#define __CLC_SELECT_DEF(x, y, z) return z ? 
y : x + +#define __CLC_BODY <clc/relational/clc_select_impl.inc> #include <clc/math/gentype.inc> -#define __CLC_BODY <clc_select.inc> +#define __CLC_BODY <clc/relational/clc_select_impl.inc> #include <clc/integer/gentype.inc> diff --git libclc/clc/lib/generic/relational/clc_select.inc libclc/clc/lib/generic/relational/clc_select.inc deleted file mode 100644 index 47db80672a02..000000000000 --- libclc/clc/lib/generic/relational/clc_select.inc +++ /dev/null @@ -1,35 +0,0 @@ -#ifdef __CLC_SCALAR -#define __CLC_VECSIZE -#endif - -#if __CLC_FPSIZE == 64 -#define __CLC_S_GENTYPE __CLC_XCONCAT(long, __CLC_VECSIZE) -#define __CLC_U_GENTYPE __CLC_XCONCAT(ulong, __CLC_VECSIZE) -#elif __CLC_FPSIZE == 32 -#define __CLC_S_GENTYPE __CLC_XCONCAT(int, __CLC_VECSIZE) -#define __CLC_U_GENTYPE __CLC_XCONCAT(uint, __CLC_VECSIZE) -#elif __CLC_FPSIZE == 16 -#define __CLC_S_GENTYPE __CLC_XCONCAT(short, __CLC_VECSIZE) -#define __CLC_U_GENTYPE __CLC_XCONCAT(ushort, __CLC_VECSIZE) -#endif - -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_select(__CLC_GENTYPE x, - __CLC_GENTYPE y, - __CLC_S_GENTYPE z) { - return z ? y : x; -} - -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_select(__CLC_GENTYPE x, - __CLC_GENTYPE y, - __CLC_U_GENTYPE z) { - return z ? y : x; -} - -#ifdef __CLC_FPSIZE -#undef __CLC_S_GENTYPE -#undef __CLC_U_GENTYPE -#endif - -#ifdef __CLC_SCALAR -#undef __CLC_VECSIZE -#endif diff --git libclc/clc/lib/spirv/SOURCES libclc/clc/lib/spirv/SOURCES index ac855ea5184e..55d109478faa 100644 --- libclc/clc/lib/spirv/SOURCES +++ libclc/clc/lib/spirv/SOURCES @@ -5,6 +5,8 @@ ../generic/math/clc_ceil.cl ../generic/math/clc_fabs.cl ../generic/math/clc_floor.cl +../generic/math/clc_mad.cl ../generic/math/clc_rint.cl ../generic/math/clc_trunc.cl +../generic/relational/clc_select.cl ../generic/shared/clc_clamp.cl diff --git libclc/clc/lib/spirv64/SOURCES libclc/clc/lib/spirv64/SOURCES index ac855ea5184e..55d109478faa 100644 --- libclc/clc/lib/spirv64/SOURCES +++ libclc/clc/lib/spirv64/SOURCES @@ -5,6 +5,8 @@ ../generic/math/clc_ceil.cl ../generic/math/clc_fabs.cl ../generic/math/clc_floor.cl +../generic/math/clc_mad.cl ../generic/math/clc_rint.cl ../generic/math/clc_trunc.cl +../generic/relational/clc_select.cl ../generic/shared/clc_clamp.cl diff --git libclc/generic/include/clc/math/ternary_decl.inc libclc/generic/include/clc/math/ternary_decl.inc deleted file mode 100644 index 0598684ea406..000000000000 --- libclc/generic/include/clc/math/ternary_decl.inc +++ /dev/null @@ -1 +0,0 @@ -_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE a, __CLC_GENTYPE b, __CLC_GENTYPE c); diff --git libclc/generic/include/clc/relational/select.h libclc/generic/include/clc/relational/select.h index d20deae0d2cf..847884a07b7f 100644 --- libclc/generic/include/clc/relational/select.h +++ libclc/generic/include/clc/relational/select.h @@ -2,10 +2,13 @@ #define __CLC_CONCAT(x, y) x ## y #define __CLC_XCONCAT(x, y) __CLC_CONCAT(x, y) -#define __CLC_BODY <clc/relational/select.inc> +#define __CLC_SELECT_FN select + +#define __CLC_BODY <clc/relational/clc_select_decl.inc> #include <clc/math/gentype.inc> -#define __CLC_BODY <clc/relational/select.inc> +#define __CLC_BODY <clc/relational/clc_select_decl.inc> #include <clc/integer/gentype.inc> +#undef __CLC_SELECT_FN #undef __CLC_CONCAT #undef __CLC_XCONCAT diff --git libclc/generic/lib/common/mix.cl libclc/generic/lib/common/mix.cl index 7f3d5b61497b..756e8619f1b3 100644 --- libclc/generic/lib/common/mix.cl +++ libclc/generic/lib/common/mix.cl @@ -1,4 +1,5 @@ #include <clc/clc.h> 
+#include <clc/math/clc_mad.h> #define __CLC_BODY <mix.inc> #include <clc/math/gentype.inc> diff --git libclc/generic/lib/common/mix.inc libclc/generic/lib/common/mix.inc index 1e8b936149bb..fd45a810b0ed 100644 --- libclc/generic/lib/common/mix.inc +++ libclc/generic/lib/common/mix.inc @@ -1,9 +1,11 @@ -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE mix(__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_GENTYPE a) { - return mad( y - x, a, x ); +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE mix(__CLC_GENTYPE x, __CLC_GENTYPE y, + __CLC_GENTYPE a) { + return __clc_mad(y - x, a, x); } #ifndef __CLC_SCALAR -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE mix(__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_SCALAR_GENTYPE a) { - return mix(x, y, (__CLC_GENTYPE)a); +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE mix(__CLC_GENTYPE x, __CLC_GENTYPE y, + __CLC_SCALAR_GENTYPE a) { + return mix(x, y, (__CLC_GENTYPE)a); } #endif diff --git libclc/generic/lib/math/clc_exp10.cl libclc/generic/lib/math/clc_exp10.cl index 6ea8743e39c5..572aa396942b 100644 --- libclc/generic/lib/math/clc_exp10.cl +++ libclc/generic/lib/math/clc_exp10.cl @@ -22,6 +22,7 @@ #include <clc/clc.h> #include <clc/clcmacro.h> +#include <clc/math/clc_mad.h> #include <clc/relational/clc_isnan.h> #include "config.h" @@ -53,98 +54,109 @@ // // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) ) -_CLC_DEF _CLC_OVERLOAD float __clc_exp10(float x) -{ - const float X_MAX = 0x1.344134p+5f; // 128*log2/log10 : 38.53183944498959 - const float X_MIN = -0x1.66d3e8p+5f; // -149*log2/log10 : -44.8534693539332 - - const float R_64_BY_LOG10_2 = 0x1.a934f0p+7f; // 64*log10/log2 : 212.6033980727912 - const float R_LOG10_2_BY_64_LD = 0x1.340000p-8f; // log2/(64 * log10) lead : 0.004699707 - const float R_LOG10_2_BY_64_TL = 0x1.04d426p-18f; // log2/(64 * log10) tail : 0.00000388665057 - const float R_LN10 = 0x1.26bb1cp+1f; - - int return_nan = __clc_isnan(x); - int return_inf = x > X_MAX; - int return_zero = x < X_MIN; - - int n = convert_int(x * R_64_BY_LOG10_2); - - float fn = (float)n; - int j = n & 0x3f; - int m = n >> 6; - int m2 = m << EXPSHIFTBITS_SP32; - float r; - - r = R_LN10 * mad(fn, -R_LOG10_2_BY_64_TL, mad(fn, -R_LOG10_2_BY_64_LD, x)); - - // Truncated Taylor series for e^r - float z2 = mad(mad(mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r, 0x1.000000p-1f), r*r, r); - - float two_to_jby64 = USE_TABLE(exp_tbl, j); - z2 = mad(two_to_jby64, z2, two_to_jby64); - - float z2s = z2 * as_float(0x1 << (m + 149)); - float z2n = as_float(as_int(z2) + m2); - z2 = m <= -126 ? z2s : z2n; - - - z2 = return_inf ? as_float(PINFBITPATT_SP32) : z2; - z2 = return_zero ? 0.0f : z2; - z2 = return_nan ? 
x : z2; - return z2; +_CLC_DEF _CLC_OVERLOAD float __clc_exp10(float x) { + // 128*log2/log10 : 38.53183944498959 + const float X_MAX = 0x1.344134p+5f; + // -149*log2/log10 : -44.8534693539332 + const float X_MIN = -0x1.66d3e8p+5f; + // 64*log10/log2 : 212.6033980727912 + const float R_64_BY_LOG10_2 = 0x1.a934f0p+7f; + // log2/(64 * log10) lead : 0.004699707 + const float R_LOG10_2_BY_64_LD = 0x1.340000p-8f; + // log2/(64 * log10) tail : 0.00000388665057 + const float R_LOG10_2_BY_64_TL = 0x1.04d426p-18f; + const float R_LN10 = 0x1.26bb1cp+1f; + + int return_nan = __clc_isnan(x); + int return_inf = x > X_MAX; + int return_zero = x < X_MIN; + + int n = convert_int(x * R_64_BY_LOG10_2); + + float fn = (float)n; + int j = n & 0x3f; + int m = n >> 6; + int m2 = m << EXPSHIFTBITS_SP32; + float r; + + r = R_LN10 * + __clc_mad(fn, -R_LOG10_2_BY_64_TL, __clc_mad(fn, -R_LOG10_2_BY_64_LD, x)); + + // Truncated Taylor series for e^r + float z2 = __clc_mad(__clc_mad(__clc_mad(r, 0x1.555556p-5f, 0x1.555556p-3f), + r, 0x1.000000p-1f), + r * r, r); + + float two_to_jby64 = USE_TABLE(exp_tbl, j); + z2 = __clc_mad(two_to_jby64, z2, two_to_jby64); + + float z2s = z2 * as_float(0x1 << (m + 149)); + float z2n = as_float(as_int(z2) + m2); + z2 = m <= -126 ? z2s : z2n; + + z2 = return_inf ? as_float(PINFBITPATT_SP32) : z2; + z2 = return_zero ? 0.0f : z2; + z2 = return_nan ? x : z2; + return z2; } _CLC_UNARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_exp10, float) #ifdef cl_khr_fp64 -_CLC_DEF _CLC_OVERLOAD double __clc_exp10(double x) -{ - const double X_MAX = 0x1.34413509f79ffp+8; // 1024*ln(2)/ln(10) - const double X_MIN = -0x1.434e6420f4374p+8; // -1074*ln(2)/ln(10) - - const double R_64_BY_LOG10_2 = 0x1.a934f0979a371p+7; // 64*ln(10)/ln(2) - const double R_LOG10_2_BY_64_LD = 0x1.3441350000000p-8; // head ln(2)/(64*ln(10)) - const double R_LOG10_2_BY_64_TL = 0x1.3ef3fde623e25p-37; // tail ln(2)/(64*ln(10)) - const double R_LN10 = 0x1.26bb1bbb55516p+1; // ln(10) - - int n = convert_int(x * R_64_BY_LOG10_2); - - double dn = (double)n; - - int j = n & 0x3f; - int m = n >> 6; - - double r = R_LN10 * fma(-R_LOG10_2_BY_64_TL, dn, fma(-R_LOG10_2_BY_64_LD, dn, x)); - - // 6 term tail of Taylor expansion of e^r - double z2 = r * fma(r, - fma(r, - fma(r, - fma(r, - fma(r, 0x1.6c16c16c16c17p-10, 0x1.1111111111111p-7), - 0x1.5555555555555p-5), - 0x1.5555555555555p-3), - 0x1.0000000000000p-1), - 1.0); - - double2 tv = USE_TABLE(two_to_jby64_ep_tbl, j); - z2 = fma(tv.s0 + tv.s1, z2, tv.s1) + tv.s0; - - int small_value = (m < -1022) || ((m == -1022) && (z2 < 1.0)); - - int n1 = m >> 2; - int n2 = m-n1; - double z3= z2 * as_double(((long)n1 + 1023) << 52); - z3 *= as_double(((long)n2 + 1023) << 52); - - z2 = ldexp(z2, m); - z2 = small_value ? z3: z2; - - z2 = __clc_isnan(x) ? x : z2; - - z2 = x > X_MAX ? as_double(PINFBITPATT_DP64) : z2; - z2 = x < X_MIN ? 
0.0 : z2; - - return z2; +_CLC_DEF _CLC_OVERLOAD double __clc_exp10(double x) { + // 1024*ln(2)/ln(10) + const double X_MAX = 0x1.34413509f79ffp+8; + // -1074*ln(2)/ln(10) + const double X_MIN = -0x1.434e6420f4374p+8; + // 64*ln(10)/ln(2) + const double R_64_BY_LOG10_2 = 0x1.a934f0979a371p+7; + // head ln(2)/(64*ln(10)) + const double R_LOG10_2_BY_64_LD = 0x1.3441350000000p-8; + // tail ln(2)/(64*ln(10)) + const double R_LOG10_2_BY_64_TL = 0x1.3ef3fde623e25p-37; + // ln(10) + const double R_LN10 = 0x1.26bb1bbb55516p+1; + + int n = convert_int(x * R_64_BY_LOG10_2); + + double dn = (double)n; + + int j = n & 0x3f; + int m = n >> 6; + + double r = + R_LN10 * fma(-R_LOG10_2_BY_64_TL, dn, fma(-R_LOG10_2_BY_64_LD, dn, x)); + + // 6 term tail of Taylor expansion of e^r + double z2 = + r * + fma(r, + fma(r, + fma(r, + fma(r, fma(r, 0x1.6c16c16c16c17p-10, 0x1.1111111111111p-7), + 0x1.5555555555555p-5), + 0x1.5555555555555p-3), + 0x1.0000000000000p-1), + 1.0); + + double2 tv = USE_TABLE(two_to_jby64_ep_tbl, j); + z2 = fma(tv.s0 + tv.s1, z2, tv.s1) + tv.s0; + + int small_value = (m < -1022) || ((m == -1022) && (z2 < 1.0)); + + int n1 = m >> 2; + int n2 = m - n1; + double z3 = z2 * as_double(((long)n1 + 1023) << 52); + z3 *= as_double(((long)n2 + 1023) << 52); + + z2 = ldexp(z2, m); + z2 = small_value ? z3 : z2; + + z2 = __clc_isnan(x) ? x : z2; + + z2 = x > X_MAX ? as_double(PINFBITPATT_DP64) : z2; + z2 = x < X_MIN ? 0.0 : z2; + + return z2; } _CLC_UNARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_exp10, double) #endif diff --git libclc/generic/lib/math/clc_hypot.cl libclc/generic/lib/math/clc_hypot.cl index a17e661603fa..d889969d6d8c 100644 --- libclc/generic/lib/math/clc_hypot.cl +++ libclc/generic/lib/math/clc_hypot.cl @@ -23,6 +23,7 @@ #include <clc/clc.h> #include <clc/clcmacro.h> #include <clc/integer/clc_abs.h> +#include <clc/math/clc_mad.h> #include <clc/relational/clc_isnan.h> #include <clc/shared/clc_clamp.h> #include <math/clc_hypot.h> @@ -48,7 +49,7 @@ _CLC_DEF _CLC_OVERLOAD float __clc_hypot(float x, float y) { float fi_exp = as_float((-xexp + EXPBIAS_SP32) << EXPSHIFTBITS_SP32); float fx = as_float(ux) * fi_exp; float fy = as_float(uy) * fi_exp; - retval = sqrt(mad(fx, fx, fy * fy)) * fx_exp; + retval = sqrt(__clc_mad(fx, fx, fy * fy)) * fx_exp; retval = ux > PINFBITPATT_SP32 | uy == 0 ? 
as_float(ux) : retval; retval = ux == PINFBITPATT_SP32 | uy == PINFBITPATT_SP32 diff --git libclc/generic/lib/math/clc_pow.cl libclc/generic/lib/math/clc_pow.cl index 2e2dade0d6b8..4abfaf1c10df 100644 --- libclc/generic/lib/math/clc_pow.cl +++ libclc/generic/lib/math/clc_pow.cl @@ -23,6 +23,7 @@ #include <clc/clc.h> #include <clc/clcmacro.h> #include <clc/math/clc_fabs.h> +#include <clc/math/clc_mad.h> #include "config.h" #include "math.h" @@ -66,334 +67,351 @@ ((((expT * poly) + expT) + expH*poly) + expH) */ -_CLC_DEF _CLC_OVERLOAD float __clc_pow(float x, float y) -{ - - int ix = as_int(x); - int ax = ix & EXSIGNBIT_SP32; - int xpos = ix == ax; - - int iy = as_int(y); - int ay = iy & EXSIGNBIT_SP32; - int ypos = iy == ay; - - /* Extra precise log calculation - * First handle case that x is close to 1 - */ - float r = 1.0f - as_float(ax); - int near1 = __clc_fabs(r) < 0x1.0p-4f; - float r2 = r*r; - - /* Coefficients are just 1/3, 1/4, 1/5 and 1/6 */ - float poly = mad(r, - mad(r, - mad(r, - mad(r, 0x1.24924ap-3f, 0x1.555556p-3f), - 0x1.99999ap-3f), - 0x1.000000p-2f), - 0x1.555556p-2f); - - poly *= r2*r; - - float lth_near1 = -r2 * 0.5f; - float ltt_near1 = -poly; - float lt_near1 = lth_near1 + ltt_near1; - float lh_near1 = -r; - float l_near1 = lh_near1 + lt_near1; - - /* Computations for x not near 1 */ - int m = (int)(ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32; - float mf = (float)m; - int ixs = as_int(as_float(ax | 0x3f800000) - 1.0f); - float mfs = (float)((ixs >> EXPSHIFTBITS_SP32) - 253); - int c = m == -127; - int ixn = c ? ixs : ax; - float mfn = c ? mfs : mf; - - int indx = (ixn & 0x007f0000) + ((ixn & 0x00008000) << 1); - - /* F - Y */ - float f = as_float(0x3f000000 | indx) - as_float(0x3f000000 | (ixn & MANTBITS_SP32)); - - indx = indx >> 16; - float2 tv = USE_TABLE(log_inv_tbl_ep, indx); - float rh = f * tv.s0; - float rt = f * tv.s1; - r = rh + rt; - - poly = mad(r, mad(r, 0x1.0p-2f, 0x1.555556p-2f), 0x1.0p-1f) * (r*r); - poly += (rh - r) + rt; - - const float LOG2_HEAD = 0x1.62e000p-1f; /* 0.693115234 */ - const float LOG2_TAIL = 0x1.0bfbe8p-15f; /* 0.0000319461833 */ - tv = USE_TABLE(loge_tbl, indx); - float lth = -r; - float ltt = mad(mfn, LOG2_TAIL, -poly) + tv.s1; - float lt = lth + ltt; - float lh = mad(mfn, LOG2_HEAD, tv.s0); - float l = lh + lt; - - /* Select near 1 or not */ - lth = near1 ? lth_near1 : lth; - ltt = near1 ? ltt_near1 : ltt; - lt = near1 ? lt_near1 : lt; - lh = near1 ? lh_near1 : lh; - l = near1 ? 
l_near1 : l; - - float gh = as_float(as_int(l) & 0xfffff000); - float gt = ((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh); - - float yh = as_float(iy & 0xfffff000); - - float yt = y - yh; - - float ylogx_s = mad(gt, yh, mad(gh, yt, yt*gt)); - float ylogx = mad(yh, gh, ylogx_s); - float ylogx_t = mad(yh, gh, -ylogx) + ylogx_s; - - /* Extra precise exp of ylogx */ - const float R_64_BY_LOG2 = 0x1.715476p+6f; /* 64/log2 : 92.332482616893657 */ - int n = convert_int(ylogx * R_64_BY_LOG2); - float nf = (float) n; - - int j = n & 0x3f; - m = n >> 6; - int m2 = m << EXPSHIFTBITS_SP32; - - const float R_LOG2_BY_64_LD = 0x1.620000p-7f; /* log2/64 lead: 0.0108032227 */ - const float R_LOG2_BY_64_TL = 0x1.c85fdep-16f; /* log2/64 tail: 0.0000272020388 */ - r = mad(nf, -R_LOG2_BY_64_TL, mad(nf, -R_LOG2_BY_64_LD, ylogx)) + ylogx_t; - - /* Truncated Taylor series for e^r */ - poly = mad(mad(mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r, 0x1.000000p-1f), r*r, r); - - tv = USE_TABLE(exp_tbl_ep, j); - - float expylogx = mad(tv.s0, poly, mad(tv.s1, poly, tv.s1)) + tv.s0; - float sexpylogx = expylogx * as_float(0x1 << (m + 149)); - float texpylogx = as_float(as_int(expylogx) + m2); - expylogx = m < -125 ? sexpylogx : texpylogx; - - /* Result is +-Inf if (ylogx + ylogx_t) > 128*log2 */ - expylogx = (ylogx > 0x1.62e430p+6f) | (ylogx == 0x1.62e430p+6f & ylogx_t > -0x1.05c610p-22f) ? as_float(PINFBITPATT_SP32) : expylogx; - - /* Result is 0 if ylogx < -149*log2 */ - expylogx = ylogx < -0x1.9d1da0p+6f ? 0.0f : expylogx; - - /* Classify y: - * inty = 0 means not an integer. - * inty = 1 means odd integer. - * inty = 2 means even integer. - */ - - int yexp = (int)(ay >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32 + 1; - int mask = (1 << (24 - yexp)) - 1; - int yodd = ((iy >> (24 - yexp)) & 0x1) != 0; - int inty = yodd ? 1 : 2; - inty = (iy & mask) != 0 ? 0 : inty; - inty = yexp < 1 ? 0 : inty; - inty = yexp > 24 ? 2 : inty; - - float signval = as_float((as_uint(expylogx) ^ SIGNBIT_SP32)); - expylogx = ((inty == 1) & !xpos) ? signval : expylogx; - int ret = as_int(expylogx); - - /* Corner case handling */ - ret = (!xpos & (inty == 0)) ? QNANBITPATT_SP32 : ret; - ret = ax < 0x3f800000 & iy == NINFBITPATT_SP32 ? PINFBITPATT_SP32 : ret; - ret = ax > 0x3f800000 & iy == NINFBITPATT_SP32 ? 0 : ret; - ret = ax < 0x3f800000 & iy == PINFBITPATT_SP32 ? 0 : ret; - ret = ax > 0x3f800000 & iy == PINFBITPATT_SP32 ? PINFBITPATT_SP32 : ret; - int xinf = xpos ? PINFBITPATT_SP32 : NINFBITPATT_SP32; - ret = ((ax == 0) & !ypos & (inty == 1)) ? xinf : ret; - ret = ((ax == 0) & !ypos & (inty != 1)) ? PINFBITPATT_SP32 : ret; - int xzero = xpos ? 0 : 0x80000000; - ret = ((ax == 0) & ypos & (inty == 1)) ? xzero : ret; - ret = ((ax == 0) & ypos & (inty != 1)) ? 0 : ret; - ret = ((ax == 0) & (iy == NINFBITPATT_SP32)) ? PINFBITPATT_SP32 : ret; - ret = ((ix == 0xbf800000) & (ay == PINFBITPATT_SP32)) ? 0x3f800000 : ret; - ret = ((ix == NINFBITPATT_SP32) & !ypos & (inty == 1)) ? 0x80000000 : ret; - ret = ((ix == NINFBITPATT_SP32) & !ypos & (inty != 1)) ? 0 : ret; - ret = ((ix == NINFBITPATT_SP32) & ypos & (inty == 1)) ? NINFBITPATT_SP32 : ret; - ret = ((ix == NINFBITPATT_SP32) & ypos & (inty != 1)) ? PINFBITPATT_SP32 : ret; - ret = ((ix == PINFBITPATT_SP32) & !ypos) ? 0 : ret; - ret = ((ix == PINFBITPATT_SP32) & ypos) ? PINFBITPATT_SP32 : ret; - ret = (ax > PINFBITPATT_SP32) ? ix : ret; - ret = (ay > PINFBITPATT_SP32) ? iy : ret; - ret = ay == 0 ? 0x3f800000 : ret; - ret = ix == 0x3f800000 ? 
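  /*
   * This final select encodes pow(+1.0, y) == 1.0 for every y, including
   * NaN and +-Inf (per C99 Annex F, which the OpenCL pow table follows).
   * Because these branch-free selects are applied in order, with later
   * lines overriding earlier ones, this line deliberately wins over the
   * NaN-propagation selects just above it, as does the ay == 0 line
   * encoding pow(x, +-0) == 1.
   */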
0x3f800000 : ret; - - return as_float(ret); +_CLC_DEF _CLC_OVERLOAD float __clc_pow(float x, float y) { + + int ix = as_int(x); + int ax = ix & EXSIGNBIT_SP32; + int xpos = ix == ax; + + int iy = as_int(y); + int ay = iy & EXSIGNBIT_SP32; + int ypos = iy == ay; + + /* Extra precise log calculation + * First handle case that x is close to 1 + */ + float r = 1.0f - as_float(ax); + int near1 = __clc_fabs(r) < 0x1.0p-4f; + float r2 = r * r; + + /* Coefficients are just 1/3, 1/4, 1/5 and 1/6 */ + float poly = __clc_mad( + r, + __clc_mad(r, + __clc_mad(r, __clc_mad(r, 0x1.24924ap-3f, 0x1.555556p-3f), + 0x1.99999ap-3f), + 0x1.000000p-2f), + 0x1.555556p-2f); + + poly *= r2 * r; + + float lth_near1 = -r2 * 0.5f; + float ltt_near1 = -poly; + float lt_near1 = lth_near1 + ltt_near1; + float lh_near1 = -r; + float l_near1 = lh_near1 + lt_near1; + + /* Computations for x not near 1 */ + int m = (int)(ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32; + float mf = (float)m; + int ixs = as_int(as_float(ax | 0x3f800000) - 1.0f); + float mfs = (float)((ixs >> EXPSHIFTBITS_SP32) - 253); + int c = m == -127; + int ixn = c ? ixs : ax; + float mfn = c ? mfs : mf; + + int indx = (ixn & 0x007f0000) + ((ixn & 0x00008000) << 1); + + /* F - Y */ + float f = as_float(0x3f000000 | indx) - + as_float(0x3f000000 | (ixn & MANTBITS_SP32)); + + indx = indx >> 16; + float2 tv = USE_TABLE(log_inv_tbl_ep, indx); + float rh = f * tv.s0; + float rt = f * tv.s1; + r = rh + rt; + + poly = __clc_mad(r, __clc_mad(r, 0x1.0p-2f, 0x1.555556p-2f), 0x1.0p-1f) * + (r * r); + poly += (rh - r) + rt; + + const float LOG2_HEAD = 0x1.62e000p-1f; /* 0.693115234 */ + const float LOG2_TAIL = 0x1.0bfbe8p-15f; /* 0.0000319461833 */ + tv = USE_TABLE(loge_tbl, indx); + float lth = -r; + float ltt = __clc_mad(mfn, LOG2_TAIL, -poly) + tv.s1; + float lt = lth + ltt; + float lh = __clc_mad(mfn, LOG2_HEAD, tv.s0); + float l = lh + lt; + + /* Select near 1 or not */ + lth = near1 ? lth_near1 : lth; + ltt = near1 ? ltt_near1 : ltt; + lt = near1 ? lt_near1 : lt; + lh = near1 ? lh_near1 : lh; + l = near1 ? l_near1 : l; + + float gh = as_float(as_int(l) & 0xfffff000); + float gt = ((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh); + + float yh = as_float(iy & 0xfffff000); + + float yt = y - yh; + + float ylogx_s = __clc_mad(gt, yh, __clc_mad(gh, yt, yt * gt)); + float ylogx = __clc_mad(yh, gh, ylogx_s); + float ylogx_t = __clc_mad(yh, gh, -ylogx) + ylogx_s; + + /* Extra precise exp of ylogx */ + /* 64/log2 : 92.332482616893657 */ + const float R_64_BY_LOG2 = 0x1.715476p+6f; + int n = convert_int(ylogx * R_64_BY_LOG2); + float nf = (float)n; + + int j = n & 0x3f; + m = n >> 6; + int m2 = m << EXPSHIFTBITS_SP32; + + /* log2/64 lead: 0.0108032227 */ + const float R_LOG2_BY_64_LD = 0x1.620000p-7f; + /* log2/64 tail: 0.0000272020388 */ + const float R_LOG2_BY_64_TL = 0x1.c85fdep-16f; + r = __clc_mad(nf, -R_LOG2_BY_64_TL, __clc_mad(nf, -R_LOG2_BY_64_LD, ylogx)) + + ylogx_t; + + /* Truncated Taylor series for e^r */ + poly = __clc_mad(__clc_mad(__clc_mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r, + 0x1.000000p-1f), + r * r, r); + + tv = USE_TABLE(exp_tbl_ep, j); + + float expylogx = + __clc_mad(tv.s0, poly, __clc_mad(tv.s1, poly, tv.s1)) + tv.s0; + float sexpylogx = expylogx * as_float(0x1 << (m + 149)); + float texpylogx = as_float(as_int(expylogx) + m2); + expylogx = m < -125 ? sexpylogx : texpylogx; + + /* Result is +-Inf if (ylogx + ylogx_t) > 128*log2 */ + expylogx = (ylogx > 0x1.62e430p+6f) | + (ylogx == 0x1.62e430p+6f & ylogx_t > -0x1.05c610p-22f) + ? 
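  /*
   * Threshold check: the largest finite float is just below 2^128, so the
   * result overflows once y*log(x) exceeds 128*ln(2) ~= 88.7228, which is
   * the constant 0x1.62e430p+6f; comparing the (head, tail) pair lets a
   * value whose head rounds exactly onto the constant be classified by
   * its tail term. Symmetrically, the smallest subnormal is 2^-149,
   * giving the underflow bound -149*ln(2) ~= -103.279 (0x1.9d1da0p+6f)
   * used a few lines below.
   */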
as_float(PINFBITPATT_SP32) + : expylogx; + + /* Result is 0 if ylogx < -149*log2 */ + expylogx = ylogx < -0x1.9d1da0p+6f ? 0.0f : expylogx; + + /* Classify y: + * inty = 0 means not an integer. + * inty = 1 means odd integer. + * inty = 2 means even integer. + */ + + int yexp = (int)(ay >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32 + 1; + int mask = (1 << (24 - yexp)) - 1; + int yodd = ((iy >> (24 - yexp)) & 0x1) != 0; + int inty = yodd ? 1 : 2; + inty = (iy & mask) != 0 ? 0 : inty; + inty = yexp < 1 ? 0 : inty; + inty = yexp > 24 ? 2 : inty; + + float signval = as_float((as_uint(expylogx) ^ SIGNBIT_SP32)); + expylogx = ((inty == 1) & !xpos) ? signval : expylogx; + int ret = as_int(expylogx); + + /* Corner case handling */ + ret = (!xpos & (inty == 0)) ? QNANBITPATT_SP32 : ret; + ret = ax < 0x3f800000 & iy == NINFBITPATT_SP32 ? PINFBITPATT_SP32 : ret; + ret = ax > 0x3f800000 & iy == NINFBITPATT_SP32 ? 0 : ret; + ret = ax < 0x3f800000 & iy == PINFBITPATT_SP32 ? 0 : ret; + ret = ax > 0x3f800000 & iy == PINFBITPATT_SP32 ? PINFBITPATT_SP32 : ret; + int xinf = xpos ? PINFBITPATT_SP32 : NINFBITPATT_SP32; + ret = ((ax == 0) & !ypos & (inty == 1)) ? xinf : ret; + ret = ((ax == 0) & !ypos & (inty != 1)) ? PINFBITPATT_SP32 : ret; + int xzero = xpos ? 0 : 0x80000000; + ret = ((ax == 0) & ypos & (inty == 1)) ? xzero : ret; + ret = ((ax == 0) & ypos & (inty != 1)) ? 0 : ret; + ret = ((ax == 0) & (iy == NINFBITPATT_SP32)) ? PINFBITPATT_SP32 : ret; + ret = ((ix == 0xbf800000) & (ay == PINFBITPATT_SP32)) ? 0x3f800000 : ret; + ret = ((ix == NINFBITPATT_SP32) & !ypos & (inty == 1)) ? 0x80000000 : ret; + ret = ((ix == NINFBITPATT_SP32) & !ypos & (inty != 1)) ? 0 : ret; + ret = + ((ix == NINFBITPATT_SP32) & ypos & (inty == 1)) ? NINFBITPATT_SP32 : ret; + ret = + ((ix == NINFBITPATT_SP32) & ypos & (inty != 1)) ? PINFBITPATT_SP32 : ret; + ret = ((ix == PINFBITPATT_SP32) & !ypos) ? 0 : ret; + ret = ((ix == PINFBITPATT_SP32) & ypos) ? PINFBITPATT_SP32 : ret; + ret = (ax > PINFBITPATT_SP32) ? ix : ret; + ret = (ay > PINFBITPATT_SP32) ? iy : ret; + ret = ay == 0 ? 0x3f800000 : ret; + ret = ix == 0x3f800000 ? 0x3f800000 : ret; + + return as_float(ret); } _CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_pow, float, float) #ifdef cl_khr_fp64 -_CLC_DEF _CLC_OVERLOAD double __clc_pow(double x, double y) -{ - const double real_log2_tail = 5.76999904754328540596e-08; - const double real_log2_lead = 6.93147122859954833984e-01; - - long ux = as_long(x); - long ax = ux & (~SIGNBIT_DP64); - int xpos = ax == ux; - - long uy = as_long(y); - long ay = uy & (~SIGNBIT_DP64); - int ypos = ay == uy; - - // Extended precision log - double v, vt; - { - int exp = (int)(ax >> 52) - 1023; - int mask_exp_1023 = exp == -1023; - double xexp = (double) exp; - long mantissa = ax & 0x000FFFFFFFFFFFFFL; - - long temp_ux = as_long(as_double(0x3ff0000000000000L | mantissa) - 1.0); - exp = ((temp_ux & 0x7FF0000000000000L) >> 52) - 2045; - double xexp1 = (double) exp; - long mantissa1 = temp_ux & 0x000FFFFFFFFFFFFFL; - - xexp = mask_exp_1023 ? xexp1 : xexp; - mantissa = mask_exp_1023 ? 
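  /*
   * Subnormal handling: a subnormal x has raw exponent 0 (exp == -1023
   * after unbiasing), so its mantissa is renormalized by planting it
   * under the exponent field of 1.0 and subtracting 1.0. The rebias
   * constant 2045 is 1023 + 1022: the usual exponent bias plus the
   * implicit 2^-1022 scale of the subnormal range, so
   * (temp_ux >> 52) - 2045 recovers the exponent of the leading
   * mantissa bit.
   */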
mantissa1 : mantissa; - - long rax = (mantissa & 0x000ff00000000000) + ((mantissa & 0x0000080000000000) << 1); - int index = rax >> 44; - - double F = as_double(rax | 0x3FE0000000000000L); - double Y = as_double(mantissa | 0x3FE0000000000000L); - double f = F - Y; - double2 tv = USE_TABLE(log_f_inv_tbl, index); - double log_h = tv.s0; - double log_t = tv.s1; - double f_inv = (log_h + log_t) * f; - double r1 = as_double(as_long(f_inv) & 0xfffffffff8000000L); - double r2 = fma(-F, r1, f) * (log_h + log_t); - double r = r1 + r2; - - double poly = fma(r, - fma(r, - fma(r, - fma(r, 1.0/7.0, 1.0/6.0), - 1.0/5.0), - 1.0/4.0), - 1.0/3.0); - poly = poly * r * r * r; - - double hr1r1 = 0.5*r1*r1; - double poly0h = r1 + hr1r1; - double poly0t = r1 - poly0h + hr1r1; - poly = fma(r1, r2, fma(0.5*r2, r2, poly)) + r2 + poly0t; - - tv = USE_TABLE(powlog_tbl, index); - log_h = tv.s0; - log_t = tv.s1; - - double resT_t = fma(xexp, real_log2_tail, + log_t) - poly; - double resT = resT_t - poly0h; - double resH = fma(xexp, real_log2_lead, log_h); - double resT_h = poly0h; - - double H = resT + resH; - double H_h = as_double(as_long(H) & 0xfffffffff8000000L); - double T = (resH - H + resT) + (resT_t - (resT + resT_h)) + (H - H_h); - H = H_h; - - double y_head = as_double(uy & 0xfffffffff8000000L); - double y_tail = y - y_head; - - double temp = fma(y_tail, H, fma(y_head, T, y_tail*T)); - v = fma(y_head, H, temp); - vt = fma(y_head, H, -v) + temp; - } - - // Now calculate exp of (v,vt) - - double expv; - { - const double max_exp_arg = 709.782712893384; - const double min_exp_arg = -745.1332191019411; - const double sixtyfour_by_lnof2 = 92.33248261689366; - const double lnof2_by_64_head = 0.010830424260348081; - const double lnof2_by_64_tail = -4.359010638708991e-10; - - double temp = v * sixtyfour_by_lnof2; - int n = (int)temp; - double dn = (double)n; - int j = n & 0x0000003f; - int m = n >> 6; - - double2 tv = USE_TABLE(two_to_jby64_ep_tbl, j); - double f1 = tv.s0; - double f2 = tv.s1; - double f = f1 + f2; - - double r1 = fma(dn, -lnof2_by_64_head, v); - double r2 = dn * lnof2_by_64_tail; - double r = (r1 + r2) + vt; - - double q = fma(r, - fma(r, - fma(r, - fma(r, 1.38889490863777199667e-03, 8.33336798434219616221e-03), - 4.16666666662260795726e-02), - 1.66666666665260878863e-01), - 5.00000000000000008883e-01); - q = fma(r*r, q, r); - - expv = fma(f, q, f2) + f1; - expv = ldexp(expv, m); - - expv = v > max_exp_arg ? as_double(0x7FF0000000000000L) : expv; - expv = v < min_exp_arg ? 0.0 : expv; - } - - // See whether y is an integer. - // inty = 0 means not an integer. - // inty = 1 means odd integer. - // inty = 2 means even integer. - - int inty; - { - int yexp = (int)(ay >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64 + 1; - inty = yexp < 1 ? 0 : 2; - inty = yexp > 53 ? 2 : inty; - long mask = (1L << (53 - yexp)) - 1L; - int inty1 = (((ay & ~mask) >> (53 - yexp)) & 1L) == 1L ? 1 : 2; - inty1 = (ay & mask) != 0 ? 0 : inty1; - inty = !(yexp < 1) & !(yexp > 53) ? inty1 : inty; - } - - expv *= (inty == 1) & !xpos ? -1.0 : 1.0; - - long ret = as_long(expv); - - // Now all the edge cases - ret = !xpos & (inty == 0) ? QNANBITPATT_DP64 : ret; - ret = ax < 0x3ff0000000000000L & uy == NINFBITPATT_DP64 ? PINFBITPATT_DP64 : ret; - ret = ax > 0x3ff0000000000000L & uy == NINFBITPATT_DP64 ? 0L : ret; - ret = ax < 0x3ff0000000000000L & uy == PINFBITPATT_DP64 ? 0L : ret; - ret = ax > 0x3ff0000000000000L & uy == PINFBITPATT_DP64 ? PINFBITPATT_DP64 : ret; - long xinf = xpos ? 
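  /*
   * xinf and the selects that follow implement pow(+-0, y): a negative
   * odd-integer y gives +-Inf with the sign of x, other negative y give
   * +Inf (including y == -Inf, handled separately), and positive y give
   * +-0, signed only for odd-integer y, matching the C99/OpenCL
   * special-case table.
   */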
PINFBITPATT_DP64 : NINFBITPATT_DP64; - ret = ((ax == 0L) & !ypos & (inty == 1)) ? xinf : ret; - ret = ((ax == 0L) & !ypos & (inty != 1)) ? PINFBITPATT_DP64 : ret; - long xzero = xpos ? 0L : 0x8000000000000000L; - ret = ((ax == 0L) & ypos & (inty == 1)) ? xzero : ret; - ret = ((ax == 0L) & ypos & (inty != 1)) ? 0L : ret; - ret = ((ax == 0L) & (uy == NINFBITPATT_DP64)) ? PINFBITPATT_DP64 : ret; - ret = ((ux == 0xbff0000000000000L) & (ay == PINFBITPATT_DP64)) ? 0x3ff0000000000000L : ret; - ret = ((ux == NINFBITPATT_DP64) & !ypos & (inty == 1)) ? 0x8000000000000000L : ret; - ret = ((ux == NINFBITPATT_DP64) & !ypos & (inty != 1)) ? 0L : ret; - ret = ((ux == NINFBITPATT_DP64) & ypos & (inty == 1)) ? NINFBITPATT_DP64 : ret; - ret = ((ux == NINFBITPATT_DP64) & ypos & (inty != 1)) ? PINFBITPATT_DP64 : ret; - ret = (ux == PINFBITPATT_DP64) & !ypos ? 0L : ret; - ret = (ux == PINFBITPATT_DP64) & ypos ? PINFBITPATT_DP64 : ret; - ret = ax > PINFBITPATT_DP64 ? ux : ret; - ret = ay > PINFBITPATT_DP64 ? uy : ret; - ret = ay == 0L ? 0x3ff0000000000000L : ret; - ret = ux == 0x3ff0000000000000L ? 0x3ff0000000000000L : ret; - - return as_double(ret); +_CLC_DEF _CLC_OVERLOAD double __clc_pow(double x, double y) { + const double real_log2_tail = 5.76999904754328540596e-08; + const double real_log2_lead = 6.93147122859954833984e-01; + + long ux = as_long(x); + long ax = ux & (~SIGNBIT_DP64); + int xpos = ax == ux; + + long uy = as_long(y); + long ay = uy & (~SIGNBIT_DP64); + int ypos = ay == uy; + + // Extended precision log + double v, vt; + { + int exp = (int)(ax >> 52) - 1023; + int mask_exp_1023 = exp == -1023; + double xexp = (double)exp; + long mantissa = ax & 0x000FFFFFFFFFFFFFL; + + long temp_ux = as_long(as_double(0x3ff0000000000000L | mantissa) - 1.0); + exp = ((temp_ux & 0x7FF0000000000000L) >> 52) - 2045; + double xexp1 = (double)exp; + long mantissa1 = temp_ux & 0x000FFFFFFFFFFFFFL; + + xexp = mask_exp_1023 ? xexp1 : xexp; + mantissa = mask_exp_1023 ? 
mantissa1 : mantissa; + + long rax = (mantissa & 0x000ff00000000000) + + ((mantissa & 0x0000080000000000) << 1); + int index = rax >> 44; + + double F = as_double(rax | 0x3FE0000000000000L); + double Y = as_double(mantissa | 0x3FE0000000000000L); + double f = F - Y; + double2 tv = USE_TABLE(log_f_inv_tbl, index); + double log_h = tv.s0; + double log_t = tv.s1; + double f_inv = (log_h + log_t) * f; + double r1 = as_double(as_long(f_inv) & 0xfffffffff8000000L); + double r2 = fma(-F, r1, f) * (log_h + log_t); + double r = r1 + r2; + + double poly = fma( + r, fma(r, fma(r, fma(r, 1.0 / 7.0, 1.0 / 6.0), 1.0 / 5.0), 1.0 / 4.0), + 1.0 / 3.0); + poly = poly * r * r * r; + + double hr1r1 = 0.5 * r1 * r1; + double poly0h = r1 + hr1r1; + double poly0t = r1 - poly0h + hr1r1; + poly = fma(r1, r2, fma(0.5 * r2, r2, poly)) + r2 + poly0t; + + tv = USE_TABLE(powlog_tbl, index); + log_h = tv.s0; + log_t = tv.s1; + + double resT_t = fma(xexp, real_log2_tail, +log_t) - poly; + double resT = resT_t - poly0h; + double resH = fma(xexp, real_log2_lead, log_h); + double resT_h = poly0h; + + double H = resT + resH; + double H_h = as_double(as_long(H) & 0xfffffffff8000000L); + double T = (resH - H + resT) + (resT_t - (resT + resT_h)) + (H - H_h); + H = H_h; + + double y_head = as_double(uy & 0xfffffffff8000000L); + double y_tail = y - y_head; + + double temp = fma(y_tail, H, fma(y_head, T, y_tail * T)); + v = fma(y_head, H, temp); + vt = fma(y_head, H, -v) + temp; + } + + // Now calculate exp of (v,vt) + + double expv; + { + const double max_exp_arg = 709.782712893384; + const double min_exp_arg = -745.1332191019411; + const double sixtyfour_by_lnof2 = 92.33248261689366; + const double lnof2_by_64_head = 0.010830424260348081; + const double lnof2_by_64_tail = -4.359010638708991e-10; + + double temp = v * sixtyfour_by_lnof2; + int n = (int)temp; + double dn = (double)n; + int j = n & 0x0000003f; + int m = n >> 6; + + double2 tv = USE_TABLE(two_to_jby64_ep_tbl, j); + double f1 = tv.s0; + double f2 = tv.s1; + double f = f1 + f2; + + double r1 = fma(dn, -lnof2_by_64_head, v); + double r2 = dn * lnof2_by_64_tail; + double r = (r1 + r2) + vt; + + double q = fma( + r, + fma(r, + fma(r, + fma(r, 1.38889490863777199667e-03, 8.33336798434219616221e-03), + 4.16666666662260795726e-02), + 1.66666666665260878863e-01), + 5.00000000000000008883e-01); + q = fma(r * r, q, r); + + expv = fma(f, q, f2) + f1; + expv = ldexp(expv, m); + + expv = v > max_exp_arg ? as_double(0x7FF0000000000000L) : expv; + expv = v < min_exp_arg ? 0.0 : expv; + } + + // See whether y is an integer. + // inty = 0 means not an integer. + // inty = 1 means odd integer. + // inty = 2 means even integer. + + int inty; + { + int yexp = (int)(ay >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64 + 1; + inty = yexp < 1 ? 0 : 2; + inty = yexp > 53 ? 2 : inty; + long mask = (1L << (53 - yexp)) - 1L; + int inty1 = (((ay & ~mask) >> (53 - yexp)) & 1L) == 1L ? 1 : 2; + inty1 = (ay & mask) != 0 ? 0 : inty1; + inty = !(yexp < 1) & !(yexp > 53) ? inty1 : inty; + } + + expv *= (inty == 1) & !xpos ? -1.0 : 1.0; + + long ret = as_long(expv); + + // Now all the edge cases + ret = !xpos & (inty == 0) ? QNANBITPATT_DP64 : ret; + ret = ax < 0x3ff0000000000000L & uy == NINFBITPATT_DP64 ? PINFBITPATT_DP64 + : ret; + ret = ax > 0x3ff0000000000000L & uy == NINFBITPATT_DP64 ? 0L : ret; + ret = ax < 0x3ff0000000000000L & uy == PINFBITPATT_DP64 ? 0L : ret; + ret = ax > 0x3ff0000000000000L & uy == PINFBITPATT_DP64 ? PINFBITPATT_DP64 + : ret; + long xinf = xpos ? 
PINFBITPATT_DP64 : NINFBITPATT_DP64; + ret = ((ax == 0L) & !ypos & (inty == 1)) ? xinf : ret; + ret = ((ax == 0L) & !ypos & (inty != 1)) ? PINFBITPATT_DP64 : ret; + long xzero = xpos ? 0L : 0x8000000000000000L; + ret = ((ax == 0L) & ypos & (inty == 1)) ? xzero : ret; + ret = ((ax == 0L) & ypos & (inty != 1)) ? 0L : ret; + ret = ((ax == 0L) & (uy == NINFBITPATT_DP64)) ? PINFBITPATT_DP64 : ret; + ret = ((ux == 0xbff0000000000000L) & (ay == PINFBITPATT_DP64)) + ? 0x3ff0000000000000L + : ret; + ret = ((ux == NINFBITPATT_DP64) & !ypos & (inty == 1)) ? 0x8000000000000000L + : ret; + ret = ((ux == NINFBITPATT_DP64) & !ypos & (inty != 1)) ? 0L : ret; + ret = + ((ux == NINFBITPATT_DP64) & ypos & (inty == 1)) ? NINFBITPATT_DP64 : ret; + ret = + ((ux == NINFBITPATT_DP64) & ypos & (inty != 1)) ? PINFBITPATT_DP64 : ret; + ret = (ux == PINFBITPATT_DP64) & !ypos ? 0L : ret; + ret = (ux == PINFBITPATT_DP64) & ypos ? PINFBITPATT_DP64 : ret; + ret = ax > PINFBITPATT_DP64 ? ux : ret; + ret = ay > PINFBITPATT_DP64 ? uy : ret; + ret = ay == 0L ? 0x3ff0000000000000L : ret; + ret = ux == 0x3ff0000000000000L ? 0x3ff0000000000000L : ret; + + return as_double(ret); } _CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_pow, double, double) #endif diff --git libclc/generic/lib/math/clc_pown.cl libclc/generic/lib/math/clc_pown.cl index 031bf9b25e6a..c02089266460 100644 --- libclc/generic/lib/math/clc_pown.cl +++ libclc/generic/lib/math/clc_pown.cl @@ -23,6 +23,7 @@ #include <clc/clc.h> #include <clc/clcmacro.h> #include <clc/math/clc_fabs.h> +#include <clc/math/clc_mad.h> #include "config.h" #include "math.h" @@ -64,308 +65,321 @@ // At the end of exp, do // ((((expT * poly) + expT) + expH*poly) + expH) -_CLC_DEF _CLC_OVERLOAD float __clc_pown(float x, int ny) -{ - float y = (float)ny; - - int ix = as_int(x); - int ax = ix & EXSIGNBIT_SP32; - int xpos = ix == ax; - - int iy = as_int(y); - int ay = iy & EXSIGNBIT_SP32; - int ypos = iy == ay; - - // Extra precise log calculation - // First handle case that x is close to 1 - float r = 1.0f - as_float(ax); - int near1 = __clc_fabs(r) < 0x1.0p-4f; - float r2 = r*r; - - // Coefficients are just 1/3, 1/4, 1/5 and 1/6 - float poly = mad(r, - mad(r, - mad(r, - mad(r, 0x1.24924ap-3f, 0x1.555556p-3f), - 0x1.99999ap-3f), - 0x1.000000p-2f), - 0x1.555556p-2f); - - poly *= r2*r; - - float lth_near1 = -r2 * 0.5f; - float ltt_near1 = -poly; - float lt_near1 = lth_near1 + ltt_near1; - float lh_near1 = -r; - float l_near1 = lh_near1 + lt_near1; - - // Computations for x not near 1 - int m = (int)(ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32; - float mf = (float)m; - int ixs = as_int(as_float(ax | 0x3f800000) - 1.0f); - float mfs = (float)((ixs >> EXPSHIFTBITS_SP32) - 253); - int c = m == -127; - int ixn = c ? ixs : ax; - float mfn = c ? mfs : mf; - - int indx = (ixn & 0x007f0000) + ((ixn & 0x00008000) << 1); - - // F - Y - float f = as_float(0x3f000000 | indx) - as_float(0x3f000000 | (ixn & MANTBITS_SP32)); - - indx = indx >> 16; - float2 tv = USE_TABLE(log_inv_tbl_ep, indx); - float rh = f * tv.s0; - float rt = f * tv.s1; - r = rh + rt; - - poly = mad(r, mad(r, 0x1.0p-2f, 0x1.555556p-2f), 0x1.0p-1f) * (r*r); - poly += (rh - r) + rt; - - const float LOG2_HEAD = 0x1.62e000p-1f; // 0.693115234 - const float LOG2_TAIL = 0x1.0bfbe8p-15f; // 0.0000319461833 - tv = USE_TABLE(loge_tbl, indx); - float lth = -r; - float ltt = mad(mfn, LOG2_TAIL, -poly) + tv.s1; - float lt = lth + ltt; - float lh = mad(mfn, LOG2_HEAD, tv.s0); - float l = lh + lt; - - // Select near 1 or not - lth = near1 ? 
lth_near1 : lth; - ltt = near1 ? ltt_near1 : ltt; - lt = near1 ? lt_near1 : lt; - lh = near1 ? lh_near1 : lh; - l = near1 ? l_near1 : l; - - float gh = as_float(as_int(l) & 0xfffff000); - float gt = ((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh); - - float yh = as_float(iy & 0xfffff000); - - float yt = (float)(ny - (int)yh); - - float ylogx_s = mad(gt, yh, mad(gh, yt, yt*gt)); - float ylogx = mad(yh, gh, ylogx_s); - float ylogx_t = mad(yh, gh, -ylogx) + ylogx_s; - - // Extra precise exp of ylogx - const float R_64_BY_LOG2 = 0x1.715476p+6f; // 64/log2 : 92.332482616893657 - int n = convert_int(ylogx * R_64_BY_LOG2); - float nf = (float) n; - - int j = n & 0x3f; - m = n >> 6; - int m2 = m << EXPSHIFTBITS_SP32; - - const float R_LOG2_BY_64_LD = 0x1.620000p-7f; // log2/64 lead: 0.0108032227 - const float R_LOG2_BY_64_TL = 0x1.c85fdep-16f; // log2/64 tail: 0.0000272020388 - r = mad(nf, -R_LOG2_BY_64_TL, mad(nf, -R_LOG2_BY_64_LD, ylogx)) + ylogx_t; - - // Truncated Taylor series for e^r - poly = mad(mad(mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r, 0x1.000000p-1f), r*r, r); - - tv = USE_TABLE(exp_tbl_ep, j); - - float expylogx = mad(tv.s0, poly, mad(tv.s1, poly, tv.s1)) + tv.s0; - float sexpylogx = expylogx * as_float(0x1 << (m + 149)); - float texpylogx = as_float(as_int(expylogx) + m2); - expylogx = m < -125 ? sexpylogx : texpylogx; - - // Result is +-Inf if (ylogx + ylogx_t) > 128*log2 - expylogx = ((ylogx > 0x1.62e430p+6f) | (ylogx == 0x1.62e430p+6f & ylogx_t > -0x1.05c610p-22f)) ? as_float(PINFBITPATT_SP32) : expylogx; - - // Result is 0 if ylogx < -149*log2 - expylogx = ylogx < -0x1.9d1da0p+6f ? 0.0f : expylogx; - - // Classify y: - // inty = 0 means not an integer. - // inty = 1 means odd integer. - // inty = 2 means even integer. - - int inty = 2 - (ny & 1); - - float signval = as_float((as_uint(expylogx) ^ SIGNBIT_SP32)); - expylogx = ((inty == 1) & !xpos) ? signval : expylogx; - int ret = as_int(expylogx); - - // Corner case handling - int xinf = xpos ? PINFBITPATT_SP32 : NINFBITPATT_SP32; - ret = ((ax == 0) & !ypos & (inty == 1)) ? xinf : ret; - ret = ((ax == 0) & !ypos & (inty == 2)) ? PINFBITPATT_SP32 : ret; - ret = ((ax == 0) & ypos & (inty == 2)) ? 0 : ret; - int xzero = !xpos ? 0x80000000 : 0L; - ret = ((ax == 0) & ypos & (inty == 1)) ? xzero : ret; - ret = ((ix == NINFBITPATT_SP32) & !ypos & (inty == 1)) ? 0x80000000 : ret; - ret = ((ix == NINFBITPATT_SP32) & !ypos & (inty != 1)) ? 0 : ret; - ret = ((ix == NINFBITPATT_SP32) & ypos & (inty == 1)) ? NINFBITPATT_SP32 : ret; - ret = ((ix == NINFBITPATT_SP32) & ypos & (inty != 1)) ? PINFBITPATT_SP32 : ret; - ret = ((ix == PINFBITPATT_SP32) & !ypos) ? 0 : ret; - ret = ((ix == PINFBITPATT_SP32) & ypos) ? PINFBITPATT_SP32 : ret; - ret = ax > PINFBITPATT_SP32 ? ix : ret; - ret = ny == 0 ? 
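  /*
   * pown takes an integer exponent, so the odd/even classification
   * collapses to inty = 2 - (ny & 1), giving 1 for odd and 2 for even
   * (the "not an integer" case cannot occur), instead of pow's
   * mantissa-based test. This last select encodes pown(x, 0) == 1.0 for
   * every x, including 0, Inf and NaN.
   */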
0x3f800000 : ret; - - return as_float(ret); +_CLC_DEF _CLC_OVERLOAD float __clc_pown(float x, int ny) { + float y = (float)ny; + + int ix = as_int(x); + int ax = ix & EXSIGNBIT_SP32; + int xpos = ix == ax; + + int iy = as_int(y); + int ay = iy & EXSIGNBIT_SP32; + int ypos = iy == ay; + + // Extra precise log calculation + // First handle case that x is close to 1 + float r = 1.0f - as_float(ax); + int near1 = __clc_fabs(r) < 0x1.0p-4f; + float r2 = r * r; + + // Coefficients are just 1/3, 1/4, 1/5 and 1/6 + float poly = __clc_mad( + r, + __clc_mad(r, + __clc_mad(r, __clc_mad(r, 0x1.24924ap-3f, 0x1.555556p-3f), + 0x1.99999ap-3f), + 0x1.000000p-2f), + 0x1.555556p-2f); + + poly *= r2 * r; + + float lth_near1 = -r2 * 0.5f; + float ltt_near1 = -poly; + float lt_near1 = lth_near1 + ltt_near1; + float lh_near1 = -r; + float l_near1 = lh_near1 + lt_near1; + + // Computations for x not near 1 + int m = (int)(ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32; + float mf = (float)m; + int ixs = as_int(as_float(ax | 0x3f800000) - 1.0f); + float mfs = (float)((ixs >> EXPSHIFTBITS_SP32) - 253); + int c = m == -127; + int ixn = c ? ixs : ax; + float mfn = c ? mfs : mf; + + int indx = (ixn & 0x007f0000) + ((ixn & 0x00008000) << 1); + + // F - Y + float f = as_float(0x3f000000 | indx) - + as_float(0x3f000000 | (ixn & MANTBITS_SP32)); + + indx = indx >> 16; + float2 tv = USE_TABLE(log_inv_tbl_ep, indx); + float rh = f * tv.s0; + float rt = f * tv.s1; + r = rh + rt; + + poly = __clc_mad(r, __clc_mad(r, 0x1.0p-2f, 0x1.555556p-2f), 0x1.0p-1f) * + (r * r); + poly += (rh - r) + rt; + + const float LOG2_HEAD = 0x1.62e000p-1f; // 0.693115234 + const float LOG2_TAIL = 0x1.0bfbe8p-15f; // 0.0000319461833 + tv = USE_TABLE(loge_tbl, indx); + float lth = -r; + float ltt = __clc_mad(mfn, LOG2_TAIL, -poly) + tv.s1; + float lt = lth + ltt; + float lh = __clc_mad(mfn, LOG2_HEAD, tv.s0); + float l = lh + lt; + + // Select near 1 or not + lth = near1 ? lth_near1 : lth; + ltt = near1 ? ltt_near1 : ltt; + lt = near1 ? lt_near1 : lt; + lh = near1 ? lh_near1 : lh; + l = near1 ? l_near1 : l; + + float gh = as_float(as_int(l) & 0xfffff000); + float gt = ((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh); + + float yh = as_float(iy & 0xfffff000); + + float yt = (float)(ny - (int)yh); + + float ylogx_s = __clc_mad(gt, yh, __clc_mad(gh, yt, yt * gt)); + float ylogx = __clc_mad(yh, gh, ylogx_s); + float ylogx_t = __clc_mad(yh, gh, -ylogx) + ylogx_s; + + // Extra precise exp of ylogx + // 64/log2 : 92.332482616893657 + const float R_64_BY_LOG2 = 0x1.715476p+6f; + int n = convert_int(ylogx * R_64_BY_LOG2); + float nf = (float)n; + + int j = n & 0x3f; + m = n >> 6; + int m2 = m << EXPSHIFTBITS_SP32; + + // log2/64 lead: 0.0108032227 + const float R_LOG2_BY_64_LD = 0x1.620000p-7f; + // log2/64 tail: 0.0000272020388 + const float R_LOG2_BY_64_TL = 0x1.c85fdep-16f; + r = __clc_mad(nf, -R_LOG2_BY_64_TL, __clc_mad(nf, -R_LOG2_BY_64_LD, ylogx)) + + ylogx_t; + + // Truncated Taylor series for e^r + poly = __clc_mad(__clc_mad(__clc_mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r, + 0x1.000000p-1f), + r * r, r); + + tv = USE_TABLE(exp_tbl_ep, j); + + float expylogx = + __clc_mad(tv.s0, poly, __clc_mad(tv.s1, poly, tv.s1)) + tv.s0; + float sexpylogx = expylogx * as_float(0x1 << (m + 149)); + float texpylogx = as_float(as_int(expylogx) + m2); + expylogx = m < -125 ? sexpylogx : texpylogx; + + // Result is +-Inf if (ylogx + ylogx_t) > 128*log2 + expylogx = ((ylogx > 0x1.62e430p+6f) | + (ylogx == 0x1.62e430p+6f & ylogx_t > -0x1.05c610p-22f)) + ? 
as_float(PINFBITPATT_SP32) + : expylogx; + + // Result is 0 if ylogx < -149*log2 + expylogx = ylogx < -0x1.9d1da0p+6f ? 0.0f : expylogx; + + // Classify y: + // inty = 0 means not an integer. + // inty = 1 means odd integer. + // inty = 2 means even integer. + + int inty = 2 - (ny & 1); + + float signval = as_float((as_uint(expylogx) ^ SIGNBIT_SP32)); + expylogx = ((inty == 1) & !xpos) ? signval : expylogx; + int ret = as_int(expylogx); + + // Corner case handling + int xinf = xpos ? PINFBITPATT_SP32 : NINFBITPATT_SP32; + ret = ((ax == 0) & !ypos & (inty == 1)) ? xinf : ret; + ret = ((ax == 0) & !ypos & (inty == 2)) ? PINFBITPATT_SP32 : ret; + ret = ((ax == 0) & ypos & (inty == 2)) ? 0 : ret; + int xzero = !xpos ? 0x80000000 : 0L; + ret = ((ax == 0) & ypos & (inty == 1)) ? xzero : ret; + ret = ((ix == NINFBITPATT_SP32) & !ypos & (inty == 1)) ? 0x80000000 : ret; + ret = ((ix == NINFBITPATT_SP32) & !ypos & (inty != 1)) ? 0 : ret; + ret = + ((ix == NINFBITPATT_SP32) & ypos & (inty == 1)) ? NINFBITPATT_SP32 : ret; + ret = + ((ix == NINFBITPATT_SP32) & ypos & (inty != 1)) ? PINFBITPATT_SP32 : ret; + ret = ((ix == PINFBITPATT_SP32) & !ypos) ? 0 : ret; + ret = ((ix == PINFBITPATT_SP32) & ypos) ? PINFBITPATT_SP32 : ret; + ret = ax > PINFBITPATT_SP32 ? ix : ret; + ret = ny == 0 ? 0x3f800000 : ret; + + return as_float(ret); } _CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_pown, float, int) #ifdef cl_khr_fp64 -_CLC_DEF _CLC_OVERLOAD double __clc_pown(double x, int ny) -{ - const double real_log2_tail = 5.76999904754328540596e-08; - const double real_log2_lead = 6.93147122859954833984e-01; - - double y = (double) ny; - - long ux = as_long(x); - long ax = ux & (~SIGNBIT_DP64); - int xpos = ax == ux; - - long uy = as_long(y); - long ay = uy & (~SIGNBIT_DP64); - int ypos = ay == uy; - - // Extended precision log - double v, vt; - { - int exp = (int)(ax >> 52) - 1023; - int mask_exp_1023 = exp == -1023; - double xexp = (double) exp; - long mantissa = ax & 0x000FFFFFFFFFFFFFL; - - long temp_ux = as_long(as_double(0x3ff0000000000000L | mantissa) - 1.0); - exp = ((temp_ux & 0x7FF0000000000000L) >> 52) - 2045; - double xexp1 = (double) exp; - long mantissa1 = temp_ux & 0x000FFFFFFFFFFFFFL; - - xexp = mask_exp_1023 ? xexp1 : xexp; - mantissa = mask_exp_1023 ? 
mantissa1 : mantissa; - - long rax = (mantissa & 0x000ff00000000000) + ((mantissa & 0x0000080000000000) << 1); - int index = rax >> 44; - - double F = as_double(rax | 0x3FE0000000000000L); - double Y = as_double(mantissa | 0x3FE0000000000000L); - double f = F - Y; - double2 tv = USE_TABLE(log_f_inv_tbl, index); - double log_h = tv.s0; - double log_t = tv.s1; - double f_inv = (log_h + log_t) * f; - double r1 = as_double(as_long(f_inv) & 0xfffffffff8000000L); - double r2 = fma(-F, r1, f) * (log_h + log_t); - double r = r1 + r2; - - double poly = fma(r, - fma(r, - fma(r, - fma(r, 1.0/7.0, 1.0/6.0), - 1.0/5.0), - 1.0/4.0), - 1.0/3.0); - poly = poly * r * r * r; - - double hr1r1 = 0.5*r1*r1; - double poly0h = r1 + hr1r1; - double poly0t = r1 - poly0h + hr1r1; - poly = fma(r1, r2, fma(0.5*r2, r2, poly)) + r2 + poly0t; - - tv = USE_TABLE(powlog_tbl, index); - log_h = tv.s0; - log_t = tv.s1; - - double resT_t = fma(xexp, real_log2_tail, + log_t) - poly; - double resT = resT_t - poly0h; - double resH = fma(xexp, real_log2_lead, log_h); - double resT_h = poly0h; - - double H = resT + resH; - double H_h = as_double(as_long(H) & 0xfffffffff8000000L); - double T = (resH - H + resT) + (resT_t - (resT + resT_h)) + (H - H_h); - H = H_h; - - double y_head = as_double(uy & 0xfffffffff8000000L); - double y_tail = y - y_head; - - int mask_2_24 = ay > 0x4170000000000000; // 2^24 - int nyh = convert_int(y_head); - int nyt = ny - nyh; - double y_tail1 = (double)nyt; - y_tail = mask_2_24 ? y_tail1 : y_tail; - - double temp = fma(y_tail, H, fma(y_head, T, y_tail*T)); - v = fma(y_head, H, temp); - vt = fma(y_head, H, -v) + temp; - } - - // Now calculate exp of (v,vt) - - double expv; - { - const double max_exp_arg = 709.782712893384; - const double min_exp_arg = -745.1332191019411; - const double sixtyfour_by_lnof2 = 92.33248261689366; - const double lnof2_by_64_head = 0.010830424260348081; - const double lnof2_by_64_tail = -4.359010638708991e-10; - - double temp = v * sixtyfour_by_lnof2; - int n = (int)temp; - double dn = (double)n; - int j = n & 0x0000003f; - int m = n >> 6; - - double2 tv = USE_TABLE(two_to_jby64_ep_tbl, j); - double f1 = tv.s0; - double f2 = tv.s1; - double f = f1 + f2; - - double r1 = fma(dn, -lnof2_by_64_head, v); - double r2 = dn * lnof2_by_64_tail; - double r = (r1 + r2) + vt; - - double q = fma(r, - fma(r, - fma(r, - fma(r, 1.38889490863777199667e-03, 8.33336798434219616221e-03), - 4.16666666662260795726e-02), - 1.66666666665260878863e-01), - 5.00000000000000008883e-01); - q = fma(r*r, q, r); - - expv = fma(f, q, f2) + f1; - expv = ldexp(expv, m); - - expv = v > max_exp_arg ? as_double(0x7FF0000000000000L) : expv; - expv = v < min_exp_arg ? 0.0 : expv; - } - - // See whether y is an integer. - // inty = 0 means not an integer. - // inty = 1 means odd integer. - // inty = 2 means even integer. - - int inty = 2 - (ny & 1); - - expv *= ((inty == 1) & !xpos) ? -1.0 : 1.0; - - long ret = as_long(expv); - - // Now all the edge cases - long xinf = xpos ? PINFBITPATT_DP64 : NINFBITPATT_DP64; - ret = ((ax == 0L) & !ypos & (inty == 1)) ? xinf : ret; - ret = ((ax == 0L) & !ypos & (inty == 2)) ? PINFBITPATT_DP64 : ret; - ret = ((ax == 0L) & ypos & (inty == 2)) ? 0L : ret; - long xzero = !xpos ? 0x8000000000000000L : 0L; - ret = ((ax == 0L) & ypos & (inty == 1)) ? xzero : ret; - ret = ((ux == NINFBITPATT_DP64) & !ypos & (inty == 1)) ? 0x8000000000000000L : ret; - ret = ((ux == NINFBITPATT_DP64) & !ypos & (inty != 1)) ? 0L : ret; - ret = ((ux == NINFBITPATT_DP64) & ypos & (inty == 1)) ? 
NINFBITPATT_DP64 : ret; - ret = ((ux == NINFBITPATT_DP64) & ypos & (inty != 1)) ? PINFBITPATT_DP64 : ret; - ret = ((ux == PINFBITPATT_DP64) & !ypos) ? 0L : ret; - ret = ((ux == PINFBITPATT_DP64) & ypos) ? PINFBITPATT_DP64 : ret; - ret = ax > PINFBITPATT_DP64 ? ux : ret; - ret = ny == 0 ? 0x3ff0000000000000L : ret; - - return as_double(ret); +_CLC_DEF _CLC_OVERLOAD double __clc_pown(double x, int ny) { + const double real_log2_tail = 5.76999904754328540596e-08; + const double real_log2_lead = 6.93147122859954833984e-01; + + double y = (double)ny; + + long ux = as_long(x); + long ax = ux & (~SIGNBIT_DP64); + int xpos = ax == ux; + + long uy = as_long(y); + long ay = uy & (~SIGNBIT_DP64); + int ypos = ay == uy; + + // Extended precision log + double v, vt; + { + int exp = (int)(ax >> 52) - 1023; + int mask_exp_1023 = exp == -1023; + double xexp = (double)exp; + long mantissa = ax & 0x000FFFFFFFFFFFFFL; + + long temp_ux = as_long(as_double(0x3ff0000000000000L | mantissa) - 1.0); + exp = ((temp_ux & 0x7FF0000000000000L) >> 52) - 2045; + double xexp1 = (double)exp; + long mantissa1 = temp_ux & 0x000FFFFFFFFFFFFFL; + + xexp = mask_exp_1023 ? xexp1 : xexp; + mantissa = mask_exp_1023 ? mantissa1 : mantissa; + + long rax = (mantissa & 0x000ff00000000000) + + ((mantissa & 0x0000080000000000) << 1); + int index = rax >> 44; + + double F = as_double(rax | 0x3FE0000000000000L); + double Y = as_double(mantissa | 0x3FE0000000000000L); + double f = F - Y; + double2 tv = USE_TABLE(log_f_inv_tbl, index); + double log_h = tv.s0; + double log_t = tv.s1; + double f_inv = (log_h + log_t) * f; + double r1 = as_double(as_long(f_inv) & 0xfffffffff8000000L); + double r2 = fma(-F, r1, f) * (log_h + log_t); + double r = r1 + r2; + + double poly = fma( + r, fma(r, fma(r, fma(r, 1.0 / 7.0, 1.0 / 6.0), 1.0 / 5.0), 1.0 / 4.0), + 1.0 / 3.0); + poly = poly * r * r * r; + + double hr1r1 = 0.5 * r1 * r1; + double poly0h = r1 + hr1r1; + double poly0t = r1 - poly0h + hr1r1; + poly = fma(r1, r2, fma(0.5 * r2, r2, poly)) + r2 + poly0t; + + tv = USE_TABLE(powlog_tbl, index); + log_h = tv.s0; + log_t = tv.s1; + + double resT_t = fma(xexp, real_log2_tail, +log_t) - poly; + double resT = resT_t - poly0h; + double resH = fma(xexp, real_log2_lead, log_h); + double resT_h = poly0h; + + double H = resT + resH; + double H_h = as_double(as_long(H) & 0xfffffffff8000000L); + double T = (resH - H + resT) + (resT_t - (resT + resT_h)) + (H - H_h); + H = H_h; + + double y_head = as_double(uy & 0xfffffffff8000000L); + double y_tail = y - y_head; + + int mask_2_24 = ay > 0x4170000000000000; // 2^24 + int nyh = convert_int(y_head); + int nyt = ny - nyh; + double y_tail1 = (double)nyt; + y_tail = mask_2_24 ? 
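  /*
   * For |y| >= 2^24 (ay above 0x4170000000000000L, i.e. 2^24) the tail of
   * the head/tail split of y is recomputed in integer arithmetic,
   * nyt = ny - nyh, presumably to guarantee that y_head + y_tail
   * reproduces the integer exponent ny with an exact tail, independent of
   * the floating-point subtraction.
   */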
y_tail1 : y_tail; + + double temp = fma(y_tail, H, fma(y_head, T, y_tail * T)); + v = fma(y_head, H, temp); + vt = fma(y_head, H, -v) + temp; + } + + // Now calculate exp of (v,vt) + + double expv; + { + const double max_exp_arg = 709.782712893384; + const double min_exp_arg = -745.1332191019411; + const double sixtyfour_by_lnof2 = 92.33248261689366; + const double lnof2_by_64_head = 0.010830424260348081; + const double lnof2_by_64_tail = -4.359010638708991e-10; + + double temp = v * sixtyfour_by_lnof2; + int n = (int)temp; + double dn = (double)n; + int j = n & 0x0000003f; + int m = n >> 6; + + double2 tv = USE_TABLE(two_to_jby64_ep_tbl, j); + double f1 = tv.s0; + double f2 = tv.s1; + double f = f1 + f2; + + double r1 = fma(dn, -lnof2_by_64_head, v); + double r2 = dn * lnof2_by_64_tail; + double r = (r1 + r2) + vt; + + double q = fma( + r, + fma(r, + fma(r, + fma(r, 1.38889490863777199667e-03, 8.33336798434219616221e-03), + 4.16666666662260795726e-02), + 1.66666666665260878863e-01), + 5.00000000000000008883e-01); + q = fma(r * r, q, r); + + expv = fma(f, q, f2) + f1; + expv = ldexp(expv, m); + + expv = v > max_exp_arg ? as_double(0x7FF0000000000000L) : expv; + expv = v < min_exp_arg ? 0.0 : expv; + } + + // See whether y is an integer. + // inty = 0 means not an integer. + // inty = 1 means odd integer. + // inty = 2 means even integer. + + int inty = 2 - (ny & 1); + + expv *= ((inty == 1) & !xpos) ? -1.0 : 1.0; + + long ret = as_long(expv); + + // Now all the edge cases + long xinf = xpos ? PINFBITPATT_DP64 : NINFBITPATT_DP64; + ret = ((ax == 0L) & !ypos & (inty == 1)) ? xinf : ret; + ret = ((ax == 0L) & !ypos & (inty == 2)) ? PINFBITPATT_DP64 : ret; + ret = ((ax == 0L) & ypos & (inty == 2)) ? 0L : ret; + long xzero = !xpos ? 0x8000000000000000L : 0L; + ret = ((ax == 0L) & ypos & (inty == 1)) ? xzero : ret; + ret = ((ux == NINFBITPATT_DP64) & !ypos & (inty == 1)) ? 0x8000000000000000L + : ret; + ret = ((ux == NINFBITPATT_DP64) & !ypos & (inty != 1)) ? 0L : ret; + ret = + ((ux == NINFBITPATT_DP64) & ypos & (inty == 1)) ? NINFBITPATT_DP64 : ret; + ret = + ((ux == NINFBITPATT_DP64) & ypos & (inty != 1)) ? PINFBITPATT_DP64 : ret; + ret = ((ux == PINFBITPATT_DP64) & !ypos) ? 0L : ret; + ret = ((ux == PINFBITPATT_DP64) & ypos) ? PINFBITPATT_DP64 : ret; + ret = ax > PINFBITPATT_DP64 ? ux : ret; + ret = ny == 0 ? 
0x3ff0000000000000L : ret; + + return as_double(ret); } _CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_pown, double, int) #endif diff --git libclc/generic/lib/math/clc_powr.cl libclc/generic/lib/math/clc_powr.cl index c431f529f3b9..9516be34456b 100644 --- libclc/generic/lib/math/clc_powr.cl +++ libclc/generic/lib/math/clc_powr.cl @@ -23,6 +23,7 @@ #include <clc/clc.h> #include <clc/clcmacro.h> #include <clc/math/clc_fabs.h> +#include <clc/math/clc_mad.h> #include "config.h" #include "math.h" @@ -64,319 +65,332 @@ // At the end of exp, do // ((((expT * poly) + expT) + expH*poly) + expH) -_CLC_DEF _CLC_OVERLOAD float __clc_powr(float x, float y) -{ - int ix = as_int(x); - int ax = ix & EXSIGNBIT_SP32; - int xpos = ix == ax; - - int iy = as_int(y); - int ay = iy & EXSIGNBIT_SP32; - int ypos = iy == ay; - - // Extra precise log calculation - // First handle case that x is close to 1 - float r = 1.0f - as_float(ax); - int near1 = __clc_fabs(r) < 0x1.0p-4f; - float r2 = r*r; - - // Coefficients are just 1/3, 1/4, 1/5 and 1/6 - float poly = mad(r, - mad(r, - mad(r, - mad(r, 0x1.24924ap-3f, 0x1.555556p-3f), - 0x1.99999ap-3f), - 0x1.000000p-2f), - 0x1.555556p-2f); - - poly *= r2*r; - - float lth_near1 = -r2 * 0.5f; - float ltt_near1 = -poly; - float lt_near1 = lth_near1 + ltt_near1; - float lh_near1 = -r; - float l_near1 = lh_near1 + lt_near1; - - // Computations for x not near 1 - int m = (int)(ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32; - float mf = (float)m; - int ixs = as_int(as_float(ax | 0x3f800000) - 1.0f); - float mfs = (float)((ixs >> EXPSHIFTBITS_SP32) - 253); - int c = m == -127; - int ixn = c ? ixs : ax; - float mfn = c ? mfs : mf; - - int indx = (ixn & 0x007f0000) + ((ixn & 0x00008000) << 1); - - // F - Y - float f = as_float(0x3f000000 | indx) - as_float(0x3f000000 | (ixn & MANTBITS_SP32)); - - indx = indx >> 16; - float2 tv = USE_TABLE(log_inv_tbl_ep, indx); - float rh = f * tv.s0; - float rt = f * tv.s1; - r = rh + rt; - - poly = mad(r, mad(r, 0x1.0p-2f, 0x1.555556p-2f), 0x1.0p-1f) * (r*r); - poly += (rh - r) + rt; - - const float LOG2_HEAD = 0x1.62e000p-1f; // 0.693115234 - const float LOG2_TAIL = 0x1.0bfbe8p-15f; // 0.0000319461833 - tv = USE_TABLE(loge_tbl, indx); - float lth = -r; - float ltt = mad(mfn, LOG2_TAIL, -poly) + tv.s1; - float lt = lth + ltt; - float lh = mad(mfn, LOG2_HEAD, tv.s0); - float l = lh + lt; - - // Select near 1 or not - lth = near1 ? lth_near1 : lth; - ltt = near1 ? ltt_near1 : ltt; - lt = near1 ? lt_near1 : lt; - lh = near1 ? lh_near1 : lh; - l = near1 ? 
l_near1 : l; - - float gh = as_float(as_int(l) & 0xfffff000); - float gt = ((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh); - - float yh = as_float(iy & 0xfffff000); - - float yt = y - yh; - - float ylogx_s = mad(gt, yh, mad(gh, yt, yt*gt)); - float ylogx = mad(yh, gh, ylogx_s); - float ylogx_t = mad(yh, gh, -ylogx) + ylogx_s; - - // Extra precise exp of ylogx - const float R_64_BY_LOG2 = 0x1.715476p+6f; // 64/log2 : 92.332482616893657 - int n = convert_int(ylogx * R_64_BY_LOG2); - float nf = (float) n; - - int j = n & 0x3f; - m = n >> 6; - int m2 = m << EXPSHIFTBITS_SP32; - - const float R_LOG2_BY_64_LD = 0x1.620000p-7f; // log2/64 lead: 0.0108032227 - const float R_LOG2_BY_64_TL = 0x1.c85fdep-16f; // log2/64 tail: 0.0000272020388 - r = mad(nf, -R_LOG2_BY_64_TL, mad(nf, -R_LOG2_BY_64_LD, ylogx)) + ylogx_t; - - // Truncated Taylor series for e^r - poly = mad(mad(mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r, 0x1.000000p-1f), r*r, r); - - tv = USE_TABLE(exp_tbl_ep, j); - - float expylogx = mad(tv.s0, poly, mad(tv.s1, poly, tv.s1)) + tv.s0; - float sexpylogx = expylogx * as_float(0x1 << (m + 149)); - float texpylogx = as_float(as_int(expylogx) + m2); - expylogx = m < -125 ? sexpylogx : texpylogx; - - // Result is +-Inf if (ylogx + ylogx_t) > 128*log2 - expylogx = ((ylogx > 0x1.62e430p+6f) | (ylogx == 0x1.62e430p+6f & ylogx_t > -0x1.05c610p-22f)) ? as_float(PINFBITPATT_SP32) : expylogx; - - // Result is 0 if ylogx < -149*log2 - expylogx = ylogx < -0x1.9d1da0p+6f ? 0.0f : expylogx; - - // Classify y: - // inty = 0 means not an integer. - // inty = 1 means odd integer. - // inty = 2 means even integer. - - int yexp = (int)(ay >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32 + 1; - int mask = (1 << (24 - yexp)) - 1; - int yodd = ((iy >> (24 - yexp)) & 0x1) != 0; - int inty = yodd ? 1 : 2; - inty = (iy & mask) != 0 ? 0 : inty; - inty = yexp < 1 ? 0 : inty; - inty = yexp > 24 ? 2 : inty; - - float signval = as_float((as_uint(expylogx) ^ SIGNBIT_SP32)); - expylogx = ((inty == 1) & !xpos) ? signval : expylogx; - int ret = as_int(expylogx); - - // Corner case handling - ret = ax < 0x3f800000 & iy == NINFBITPATT_SP32 ? PINFBITPATT_SP32 : ret; - ret = ax < 0x3f800000 & iy == PINFBITPATT_SP32 ? 0 : ret; - ret = ax == 0x3f800000 & ay < PINFBITPATT_SP32 ? 0x3f800000 : ret; - ret = ax == 0x3f800000 & ay == PINFBITPATT_SP32 ? QNANBITPATT_SP32 : ret; - ret = ax > 0x3f800000 & iy == NINFBITPATT_SP32 ? 0 : ret; - ret = ax > 0x3f800000 & iy == PINFBITPATT_SP32 ? PINFBITPATT_SP32 : ret; - ret = ((ix < PINFBITPATT_SP32) & (ay == 0)) ? 0x3f800000 : ret; - ret = ((ax == PINFBITPATT_SP32) & !ypos) ? 0 : ret; - ret = ((ax == PINFBITPATT_SP32) & ypos) ? PINFBITPATT_SP32 : ret; - ret = ((ax == PINFBITPATT_SP32) & (iy == PINFBITPATT_SP32)) ? PINFBITPATT_SP32 : ret; - ret = ((ax == PINFBITPATT_SP32) & (ay == 0)) ? QNANBITPATT_SP32 : ret; - ret = ((ax == 0) & !ypos) ? PINFBITPATT_SP32 : ret; - ret = ((ax == 0) & ypos) ? 0 : ret; - ret = ((ax == 0) & (ay == 0)) ? QNANBITPATT_SP32 : ret; - ret = ((ax != 0) & !xpos) ? QNANBITPATT_SP32 : ret; - ret = ax > PINFBITPATT_SP32 ? ix : ret; - ret = ay > PINFBITPATT_SP32 ? 
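  /*
   * Branch-free NaN handling: after masking off the sign, any bit pattern
   * strictly greater than PINFBITPATT_SP32 (0x7f800000, +Inf) denotes a
   * NaN, so these two selects simply pass the original NaN operand
   * through. In other words,
   *
   *   int is_nan = (as_int(x) & EXSIGNBIT_SP32) > PINFBITPATT_SP32;
   *
   * is an integer-only isnan() on this representation.
   */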
iy : ret; - - return as_float(ret); +_CLC_DEF _CLC_OVERLOAD float __clc_powr(float x, float y) { + int ix = as_int(x); + int ax = ix & EXSIGNBIT_SP32; + int xpos = ix == ax; + + int iy = as_int(y); + int ay = iy & EXSIGNBIT_SP32; + int ypos = iy == ay; + + // Extra precise log calculation + // First handle case that x is close to 1 + float r = 1.0f - as_float(ax); + int near1 = __clc_fabs(r) < 0x1.0p-4f; + float r2 = r * r; + + // Coefficients are just 1/3, 1/4, 1/5 and 1/6 + float poly = __clc_mad( + r, + __clc_mad(r, + __clc_mad(r, __clc_mad(r, 0x1.24924ap-3f, 0x1.555556p-3f), + 0x1.99999ap-3f), + 0x1.000000p-2f), + 0x1.555556p-2f); + + poly *= r2 * r; + + float lth_near1 = -r2 * 0.5f; + float ltt_near1 = -poly; + float lt_near1 = lth_near1 + ltt_near1; + float lh_near1 = -r; + float l_near1 = lh_near1 + lt_near1; + + // Computations for x not near 1 + int m = (int)(ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32; + float mf = (float)m; + int ixs = as_int(as_float(ax | 0x3f800000) - 1.0f); + float mfs = (float)((ixs >> EXPSHIFTBITS_SP32) - 253); + int c = m == -127; + int ixn = c ? ixs : ax; + float mfn = c ? mfs : mf; + + int indx = (ixn & 0x007f0000) + ((ixn & 0x00008000) << 1); + + // F - Y + float f = as_float(0x3f000000 | indx) - + as_float(0x3f000000 | (ixn & MANTBITS_SP32)); + + indx = indx >> 16; + float2 tv = USE_TABLE(log_inv_tbl_ep, indx); + float rh = f * tv.s0; + float rt = f * tv.s1; + r = rh + rt; + + poly = __clc_mad(r, __clc_mad(r, 0x1.0p-2f, 0x1.555556p-2f), 0x1.0p-1f) * + (r * r); + poly += (rh - r) + rt; + + const float LOG2_HEAD = 0x1.62e000p-1f; // 0.693115234 + const float LOG2_TAIL = 0x1.0bfbe8p-15f; // 0.0000319461833 + tv = USE_TABLE(loge_tbl, indx); + float lth = -r; + float ltt = __clc_mad(mfn, LOG2_TAIL, -poly) + tv.s1; + float lt = lth + ltt; + float lh = __clc_mad(mfn, LOG2_HEAD, tv.s0); + float l = lh + lt; + + // Select near 1 or not + lth = near1 ? lth_near1 : lth; + ltt = near1 ? ltt_near1 : ltt; + lt = near1 ? lt_near1 : lt; + lh = near1 ? lh_near1 : lh; + l = near1 ? l_near1 : l; + + float gh = as_float(as_int(l) & 0xfffff000); + float gt = ((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh); + + float yh = as_float(iy & 0xfffff000); + + float yt = y - yh; + + float ylogx_s = __clc_mad(gt, yh, __clc_mad(gh, yt, yt * gt)); + float ylogx = __clc_mad(yh, gh, ylogx_s); + float ylogx_t = __clc_mad(yh, gh, -ylogx) + ylogx_s; + + // Extra precise exp of ylogx + // 64/log2 : 92.332482616893657 + const float R_64_BY_LOG2 = 0x1.715476p+6f; + int n = convert_int(ylogx * R_64_BY_LOG2); + float nf = (float)n; + + int j = n & 0x3f; + m = n >> 6; + int m2 = m << EXPSHIFTBITS_SP32; + // log2/64 lead: 0.0108032227 + const float R_LOG2_BY_64_LD = 0x1.620000p-7f; + // log2/64 tail: 0.0000272020388 + const float R_LOG2_BY_64_TL = 0x1.c85fdep-16f; + r = __clc_mad(nf, -R_LOG2_BY_64_TL, __clc_mad(nf, -R_LOG2_BY_64_LD, ylogx)) + + ylogx_t; + + // Truncated Taylor series for e^r + poly = __clc_mad(__clc_mad(__clc_mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r, + 0x1.000000p-1f), + r * r, r); + + tv = USE_TABLE(exp_tbl_ep, j); + + float expylogx = + __clc_mad(tv.s0, poly, __clc_mad(tv.s1, poly, tv.s1)) + tv.s0; + float sexpylogx = expylogx * as_float(0x1 << (m + 149)); + float texpylogx = as_float(as_int(expylogx) + m2); + expylogx = m < -125 ? sexpylogx : texpylogx; + + // Result is +-Inf if (ylogx + ylogx_t) > 128*log2 + expylogx = ((ylogx > 0x1.62e430p+6f) | + (ylogx == 0x1.62e430p+6f & ylogx_t > -0x1.05c610p-22f)) + ? 
as_float(PINFBITPATT_SP32) + : expylogx; + + // Result is 0 if ylogx < -149*log2 + expylogx = ylogx < -0x1.9d1da0p+6f ? 0.0f : expylogx; + + // Classify y: + // inty = 0 means not an integer. + // inty = 1 means odd integer. + // inty = 2 means even integer. + + int yexp = (int)(ay >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32 + 1; + int mask = (1 << (24 - yexp)) - 1; + int yodd = ((iy >> (24 - yexp)) & 0x1) != 0; + int inty = yodd ? 1 : 2; + inty = (iy & mask) != 0 ? 0 : inty; + inty = yexp < 1 ? 0 : inty; + inty = yexp > 24 ? 2 : inty; + + float signval = as_float((as_uint(expylogx) ^ SIGNBIT_SP32)); + expylogx = ((inty == 1) & !xpos) ? signval : expylogx; + int ret = as_int(expylogx); + + // Corner case handling + ret = ax < 0x3f800000 & iy == NINFBITPATT_SP32 ? PINFBITPATT_SP32 : ret; + ret = ax < 0x3f800000 & iy == PINFBITPATT_SP32 ? 0 : ret; + ret = ax == 0x3f800000 & ay < PINFBITPATT_SP32 ? 0x3f800000 : ret; + ret = ax == 0x3f800000 & ay == PINFBITPATT_SP32 ? QNANBITPATT_SP32 : ret; + ret = ax > 0x3f800000 & iy == NINFBITPATT_SP32 ? 0 : ret; + ret = ax > 0x3f800000 & iy == PINFBITPATT_SP32 ? PINFBITPATT_SP32 : ret; + ret = ((ix < PINFBITPATT_SP32) & (ay == 0)) ? 0x3f800000 : ret; + ret = ((ax == PINFBITPATT_SP32) & !ypos) ? 0 : ret; + ret = ((ax == PINFBITPATT_SP32) & ypos) ? PINFBITPATT_SP32 : ret; + ret = ((ax == PINFBITPATT_SP32) & (iy == PINFBITPATT_SP32)) ? PINFBITPATT_SP32 + : ret; + ret = ((ax == PINFBITPATT_SP32) & (ay == 0)) ? QNANBITPATT_SP32 : ret; + ret = ((ax == 0) & !ypos) ? PINFBITPATT_SP32 : ret; + ret = ((ax == 0) & ypos) ? 0 : ret; + ret = ((ax == 0) & (ay == 0)) ? QNANBITPATT_SP32 : ret; + ret = ((ax != 0) & !xpos) ? QNANBITPATT_SP32 : ret; + ret = ax > PINFBITPATT_SP32 ? ix : ret; + ret = ay > PINFBITPATT_SP32 ? iy : ret; + + return as_float(ret); } _CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_powr, float, float) #ifdef cl_khr_fp64 -_CLC_DEF _CLC_OVERLOAD double __clc_powr(double x, double y) -{ - const double real_log2_tail = 5.76999904754328540596e-08; - const double real_log2_lead = 6.93147122859954833984e-01; - - long ux = as_long(x); - long ax = ux & (~SIGNBIT_DP64); - int xpos = ax == ux; - - long uy = as_long(y); - long ay = uy & (~SIGNBIT_DP64); - int ypos = ay == uy; - - // Extended precision log - double v, vt; - { - int exp = (int)(ax >> 52) - 1023; - int mask_exp_1023 = exp == -1023; - double xexp = (double) exp; - long mantissa = ax & 0x000FFFFFFFFFFFFFL; - - long temp_ux = as_long(as_double(0x3ff0000000000000L | mantissa) - 1.0); - exp = ((temp_ux & 0x7FF0000000000000L) >> 52) - 2045; - double xexp1 = (double) exp; - long mantissa1 = temp_ux & 0x000FFFFFFFFFFFFFL; - - xexp = mask_exp_1023 ? xexp1 : xexp; - mantissa = mask_exp_1023 ? 
mantissa1 : mantissa; - - long rax = (mantissa & 0x000ff00000000000) + ((mantissa & 0x0000080000000000) << 1); - int index = rax >> 44; - - double F = as_double(rax | 0x3FE0000000000000L); - double Y = as_double(mantissa | 0x3FE0000000000000L); - double f = F - Y; - double2 tv = USE_TABLE(log_f_inv_tbl, index); - double log_h = tv.s0; - double log_t = tv.s1; - double f_inv = (log_h + log_t) * f; - double r1 = as_double(as_long(f_inv) & 0xfffffffff8000000L); - double r2 = fma(-F, r1, f) * (log_h + log_t); - double r = r1 + r2; - - double poly = fma(r, - fma(r, - fma(r, - fma(r, 1.0/7.0, 1.0/6.0), - 1.0/5.0), - 1.0/4.0), - 1.0/3.0); - poly = poly * r * r * r; - - double hr1r1 = 0.5*r1*r1; - double poly0h = r1 + hr1r1; - double poly0t = r1 - poly0h + hr1r1; - poly = fma(r1, r2, fma(0.5*r2, r2, poly)) + r2 + poly0t; - - tv = USE_TABLE(powlog_tbl, index); - log_h = tv.s0; - log_t = tv.s1; - - double resT_t = fma(xexp, real_log2_tail, + log_t) - poly; - double resT = resT_t - poly0h; - double resH = fma(xexp, real_log2_lead, log_h); - double resT_h = poly0h; - - double H = resT + resH; - double H_h = as_double(as_long(H) & 0xfffffffff8000000L); - double T = (resH - H + resT) + (resT_t - (resT + resT_h)) + (H - H_h); - H = H_h; - - double y_head = as_double(uy & 0xfffffffff8000000L); - double y_tail = y - y_head; - - double temp = fma(y_tail, H, fma(y_head, T, y_tail*T)); - v = fma(y_head, H, temp); - vt = fma(y_head, H, -v) + temp; - } - - // Now calculate exp of (v,vt) - - double expv; - { - const double max_exp_arg = 709.782712893384; - const double min_exp_arg = -745.1332191019411; - const double sixtyfour_by_lnof2 = 92.33248261689366; - const double lnof2_by_64_head = 0.010830424260348081; - const double lnof2_by_64_tail = -4.359010638708991e-10; - - double temp = v * sixtyfour_by_lnof2; - int n = (int)temp; - double dn = (double)n; - int j = n & 0x0000003f; - int m = n >> 6; - - double2 tv = USE_TABLE(two_to_jby64_ep_tbl, j); - double f1 = tv.s0; - double f2 = tv.s1; - double f = f1 + f2; - - double r1 = fma(dn, -lnof2_by_64_head, v); - double r2 = dn * lnof2_by_64_tail; - double r = (r1 + r2) + vt; - - double q = fma(r, - fma(r, - fma(r, - fma(r, 1.38889490863777199667e-03, 8.33336798434219616221e-03), - 4.16666666662260795726e-02), - 1.66666666665260878863e-01), - 5.00000000000000008883e-01); - q = fma(r*r, q, r); - - expv = fma(f, q, f2) + f1; - expv = ldexp(expv, m); - - expv = v > max_exp_arg ? as_double(0x7FF0000000000000L) : expv; - expv = v < min_exp_arg ? 0.0 : expv; - } - - // See whether y is an integer. - // inty = 0 means not an integer. - // inty = 1 means odd integer. - // inty = 2 means even integer. - - int inty; - { - int yexp = (int)(ay >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64 + 1; - inty = yexp < 1 ? 0 : 2; - inty = yexp > 53 ? 2 : inty; - long mask = (1L << (53 - yexp)) - 1L; - int inty1 = (((ay & ~mask) >> (53 - yexp)) & 1L) == 1L ? 1 : 2; - inty1 = (ay & mask) != 0 ? 0 : inty1; - inty = !(yexp < 1) & !(yexp > 53) ? inty1 : inty; - } - - expv *= ((inty == 1) & !xpos) ? -1.0 : 1.0; - - long ret = as_long(expv); - - // Now all the edge cases - ret = ax < 0x3ff0000000000000L & uy == NINFBITPATT_DP64 ? PINFBITPATT_DP64 : ret; - ret = ax < 0x3ff0000000000000L & uy == PINFBITPATT_DP64 ? 0L : ret; - ret = ax == 0x3ff0000000000000L & ay < PINFBITPATT_DP64 ? 0x3ff0000000000000L : ret; - ret = ax == 0x3ff0000000000000L & ay == PINFBITPATT_DP64 ? QNANBITPATT_DP64 : ret; - ret = ax > 0x3ff0000000000000L & uy == NINFBITPATT_DP64 ? 
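  /*
   * These selects encode powr's semantics, which differ from pow's: powr
   * is defined only for x >= 0 (negative finite x yields NaN via the
   * (ax != 0) & !xpos select further down), and powr(1, +-Inf),
   * powr(Inf, 0) and powr(0, 0) are NaN rather than 1 as they would be
   * for pow.
   */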
0L : ret; - ret = ax > 0x3ff0000000000000L & uy == PINFBITPATT_DP64 ? PINFBITPATT_DP64 : ret; - ret = ux < PINFBITPATT_DP64 & ay == 0L ? 0x3ff0000000000000L : ret; - ret = ((ax == PINFBITPATT_DP64) & !ypos) ? 0L : ret; - ret = ((ax == PINFBITPATT_DP64) & ypos) ? PINFBITPATT_DP64 : ret; - ret = ((ax == PINFBITPATT_DP64) & (uy == PINFBITPATT_DP64)) ? PINFBITPATT_DP64 : ret; - ret = ((ax == PINFBITPATT_DP64) & (ay == 0L)) ? QNANBITPATT_DP64 : ret; - ret = ((ax == 0L) & !ypos) ? PINFBITPATT_DP64 : ret; - ret = ((ax == 0L) & ypos) ? 0L : ret; - ret = ((ax == 0L) & (ay == 0L)) ? QNANBITPATT_DP64 : ret; - ret = ((ax != 0L) & !xpos) ? QNANBITPATT_DP64 : ret; - ret = ax > PINFBITPATT_DP64 ? ux : ret; - ret = ay > PINFBITPATT_DP64 ? uy : ret; - - return as_double(ret); +_CLC_DEF _CLC_OVERLOAD double __clc_powr(double x, double y) { + const double real_log2_tail = 5.76999904754328540596e-08; + const double real_log2_lead = 6.93147122859954833984e-01; + + long ux = as_long(x); + long ax = ux & (~SIGNBIT_DP64); + int xpos = ax == ux; + + long uy = as_long(y); + long ay = uy & (~SIGNBIT_DP64); + int ypos = ay == uy; + + // Extended precision log + double v, vt; + { + int exp = (int)(ax >> 52) - 1023; + int mask_exp_1023 = exp == -1023; + double xexp = (double)exp; + long mantissa = ax & 0x000FFFFFFFFFFFFFL; + + long temp_ux = as_long(as_double(0x3ff0000000000000L | mantissa) - 1.0); + exp = ((temp_ux & 0x7FF0000000000000L) >> 52) - 2045; + double xexp1 = (double)exp; + long mantissa1 = temp_ux & 0x000FFFFFFFFFFFFFL; + + xexp = mask_exp_1023 ? xexp1 : xexp; + mantissa = mask_exp_1023 ? mantissa1 : mantissa; + + long rax = (mantissa & 0x000ff00000000000) + + ((mantissa & 0x0000080000000000) << 1); + int index = rax >> 44; + + double F = as_double(rax | 0x3FE0000000000000L); + double Y = as_double(mantissa | 0x3FE0000000000000L); + double f = F - Y; + double2 tv = USE_TABLE(log_f_inv_tbl, index); + double log_h = tv.s0; + double log_t = tv.s1; + double f_inv = (log_h + log_t) * f; + double r1 = as_double(as_long(f_inv) & 0xfffffffff8000000L); + double r2 = fma(-F, r1, f) * (log_h + log_t); + double r = r1 + r2; + + double poly = fma( + r, fma(r, fma(r, fma(r, 1.0 / 7.0, 1.0 / 6.0), 1.0 / 5.0), 1.0 / 4.0), + 1.0 / 3.0); + poly = poly * r * r * r; + + double hr1r1 = 0.5 * r1 * r1; + double poly0h = r1 + hr1r1; + double poly0t = r1 - poly0h + hr1r1; + poly = fma(r1, r2, fma(0.5 * r2, r2, poly)) + r2 + poly0t; + + tv = USE_TABLE(powlog_tbl, index); + log_h = tv.s0; + log_t = tv.s1; + + double resT_t = fma(xexp, real_log2_tail, +log_t) - poly; + double resT = resT_t - poly0h; + double resH = fma(xexp, real_log2_lead, log_h); + double resT_h = poly0h; + + double H = resT + resH; + double H_h = as_double(as_long(H) & 0xfffffffff8000000L); + double T = (resH - H + resT) + (resT_t - (resT + resT_h)) + (H - H_h); + H = H_h; + + double y_head = as_double(uy & 0xfffffffff8000000L); + double y_tail = y - y_head; + + double temp = fma(y_tail, H, fma(y_head, T, y_tail * T)); + v = fma(y_head, H, temp); + vt = fma(y_head, H, -v) + temp; + } + + // Now calculate exp of (v,vt) + + double expv; + { + const double max_exp_arg = 709.782712893384; + const double min_exp_arg = -745.1332191019411; + const double sixtyfour_by_lnof2 = 92.33248261689366; + const double lnof2_by_64_head = 0.010830424260348081; + const double lnof2_by_64_tail = -4.359010638708991e-10; + + double temp = v * sixtyfour_by_lnof2; + int n = (int)temp; + double dn = (double)n; + int j = n & 0x0000003f; + int m = n >> 6; + + double2 tv = 
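  /*
   * Table-driven exp: with v ~= (64*m + j) * ln(2)/64 + r, e^v factors as
   * 2^m * 2^(j/64) * e^r. The two_to_jby64_ep_tbl entry stores 2^(j/64)
   * as a head/tail pair (f1, f2), so the reconstruction
   * fma(f, q, f2) + f1 below keeps the low-order bits; the 2^m factor is
   * applied last via ldexp.
   */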
USE_TABLE(two_to_jby64_ep_tbl, j); + double f1 = tv.s0; + double f2 = tv.s1; + double f = f1 + f2; + + double r1 = fma(dn, -lnof2_by_64_head, v); + double r2 = dn * lnof2_by_64_tail; + double r = (r1 + r2) + vt; + + double q = fma( + r, + fma(r, + fma(r, + fma(r, 1.38889490863777199667e-03, 8.33336798434219616221e-03), + 4.16666666662260795726e-02), + 1.66666666665260878863e-01), + 5.00000000000000008883e-01); + q = fma(r * r, q, r); + + expv = fma(f, q, f2) + f1; + expv = ldexp(expv, m); + + expv = v > max_exp_arg ? as_double(0x7FF0000000000000L) : expv; + expv = v < min_exp_arg ? 0.0 : expv; + } + + // See whether y is an integer. + // inty = 0 means not an integer. + // inty = 1 means odd integer. + // inty = 2 means even integer. + + int inty; + { + int yexp = (int)(ay >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64 + 1; + inty = yexp < 1 ? 0 : 2; + inty = yexp > 53 ? 2 : inty; + long mask = (1L << (53 - yexp)) - 1L; + int inty1 = (((ay & ~mask) >> (53 - yexp)) & 1L) == 1L ? 1 : 2; + inty1 = (ay & mask) != 0 ? 0 : inty1; + inty = !(yexp < 1) & !(yexp > 53) ? inty1 : inty; + } + + expv *= ((inty == 1) & !xpos) ? -1.0 : 1.0; + + long ret = as_long(expv); + + // Now all the edge cases + ret = ax < 0x3ff0000000000000L & uy == NINFBITPATT_DP64 ? PINFBITPATT_DP64 + : ret; + ret = ax < 0x3ff0000000000000L & uy == PINFBITPATT_DP64 ? 0L : ret; + ret = ax == 0x3ff0000000000000L & ay < PINFBITPATT_DP64 ? 0x3ff0000000000000L + : ret; + ret = ax == 0x3ff0000000000000L & ay == PINFBITPATT_DP64 ? QNANBITPATT_DP64 + : ret; + ret = ax > 0x3ff0000000000000L & uy == NINFBITPATT_DP64 ? 0L : ret; + ret = ax > 0x3ff0000000000000L & uy == PINFBITPATT_DP64 ? PINFBITPATT_DP64 + : ret; + ret = ux < PINFBITPATT_DP64 & ay == 0L ? 0x3ff0000000000000L : ret; + ret = ((ax == PINFBITPATT_DP64) & !ypos) ? 0L : ret; + ret = ((ax == PINFBITPATT_DP64) & ypos) ? PINFBITPATT_DP64 : ret; + ret = ((ax == PINFBITPATT_DP64) & (uy == PINFBITPATT_DP64)) ? PINFBITPATT_DP64 + : ret; + ret = ((ax == PINFBITPATT_DP64) & (ay == 0L)) ? QNANBITPATT_DP64 : ret; + ret = ((ax == 0L) & !ypos) ? PINFBITPATT_DP64 : ret; + ret = ((ax == 0L) & ypos) ? 0L : ret; + ret = ((ax == 0L) & (ay == 0L)) ? QNANBITPATT_DP64 : ret; + ret = ((ax != 0L) & !xpos) ? QNANBITPATT_DP64 : ret; + ret = ax > PINFBITPATT_DP64 ? ux : ret; + ret = ay > PINFBITPATT_DP64 ? 
uy : ret; + + return as_double(ret); } _CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_powr, double, double) #endif diff --git libclc/generic/lib/math/clc_rootn.cl libclc/generic/lib/math/clc_rootn.cl index eee9c9fcaa2d..70ae02ac2370 100644 --- libclc/generic/lib/math/clc_rootn.cl +++ libclc/generic/lib/math/clc_rootn.cl @@ -23,6 +23,7 @@ #include <clc/clc.h> #include <clc/clcmacro.h> #include <clc/math/clc_fabs.h> +#include <clc/math/clc_mad.h> #include "config.h" #include "math.h" @@ -64,308 +65,320 @@ // At the end of exp, do // ((((expT * poly) + expT) + expH*poly) + expH) -_CLC_DEF _CLC_OVERLOAD float __clc_rootn(float x, int ny) -{ - float y = MATH_RECIP((float)ny); - - int ix = as_int(x); - int ax = ix & EXSIGNBIT_SP32; - int xpos = ix == ax; - - int iy = as_int(y); - int ay = iy & EXSIGNBIT_SP32; - int ypos = iy == ay; - - // Extra precise log calculation - // First handle case that x is close to 1 - float r = 1.0f - as_float(ax); - int near1 = __clc_fabs(r) < 0x1.0p-4f; - float r2 = r*r; - - // Coefficients are just 1/3, 1/4, 1/5 and 1/6 - float poly = mad(r, - mad(r, - mad(r, - mad(r, 0x1.24924ap-3f, 0x1.555556p-3f), - 0x1.99999ap-3f), - 0x1.000000p-2f), - 0x1.555556p-2f); - - poly *= r2*r; - - float lth_near1 = -r2 * 0.5f; - float ltt_near1 = -poly; - float lt_near1 = lth_near1 + ltt_near1; - float lh_near1 = -r; - float l_near1 = lh_near1 + lt_near1; - - // Computations for x not near 1 - int m = (int)(ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32; - float mf = (float)m; - int ixs = as_int(as_float(ax | 0x3f800000) - 1.0f); - float mfs = (float)((ixs >> EXPSHIFTBITS_SP32) - 253); - int c = m == -127; - int ixn = c ? ixs : ax; - float mfn = c ? mfs : mf; - - int indx = (ixn & 0x007f0000) + ((ixn & 0x00008000) << 1); - - // F - Y - float f = as_float(0x3f000000 | indx) - as_float(0x3f000000 | (ixn & MANTBITS_SP32)); - - indx = indx >> 16; - float2 tv = USE_TABLE(log_inv_tbl_ep, indx); - float rh = f * tv.s0; - float rt = f * tv.s1; - r = rh + rt; - - poly = mad(r, mad(r, 0x1.0p-2f, 0x1.555556p-2f), 0x1.0p-1f) * (r*r); - poly += (rh - r) + rt; - - const float LOG2_HEAD = 0x1.62e000p-1f; // 0.693115234 - const float LOG2_TAIL = 0x1.0bfbe8p-15f; // 0.0000319461833 - tv = USE_TABLE(loge_tbl, indx); - float lth = -r; - float ltt = mad(mfn, LOG2_TAIL, -poly) + tv.s1; - float lt = lth + ltt; - float lh = mad(mfn, LOG2_HEAD, tv.s0); - float l = lh + lt; - - // Select near 1 or not - lth = near1 ? lth_near1 : lth; - ltt = near1 ? ltt_near1 : ltt; - lt = near1 ? lt_near1 : lt; - lh = near1 ? lh_near1 : lh; - l = near1 ? 
l_near1 : l; - - float gh = as_float(as_int(l) & 0xfffff000); - float gt = ((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh); - - float yh = as_float(iy & 0xfffff000); - - float fny = (float)ny; - float fnyh = as_float(as_int(fny) & 0xfffff000); - float fnyt = (float)(ny - (int)fnyh); - float yt = MATH_DIVIDE(mad(-fnyt, yh, mad(-fnyh, yh, 1.0f)), fny); - - float ylogx_s = mad(gt, yh, mad(gh, yt, yt*gt)); - float ylogx = mad(yh, gh, ylogx_s); - float ylogx_t = mad(yh, gh, -ylogx) + ylogx_s; - - // Extra precise exp of ylogx - const float R_64_BY_LOG2 = 0x1.715476p+6f; // 64/log2 : 92.332482616893657 - int n = convert_int(ylogx * R_64_BY_LOG2); - float nf = (float) n; - - int j = n & 0x3f; - m = n >> 6; - int m2 = m << EXPSHIFTBITS_SP32; - - const float R_LOG2_BY_64_LD = 0x1.620000p-7f; // log2/64 lead: 0.0108032227 - const float R_LOG2_BY_64_TL = 0x1.c85fdep-16f; // log2/64 tail: 0.0000272020388 - r = mad(nf, -R_LOG2_BY_64_TL, mad(nf, -R_LOG2_BY_64_LD, ylogx)) + ylogx_t; - - // Truncated Taylor series for e^r - poly = mad(mad(mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r, 0x1.000000p-1f), r*r, r); - - tv = USE_TABLE(exp_tbl_ep, j); - - float expylogx = mad(tv.s0, poly, mad(tv.s1, poly, tv.s1)) + tv.s0; - float sexpylogx = __clc_fp32_subnormals_supported() ? expylogx * as_float(0x1 << (m + 149)) : 0.0f; - - float texpylogx = as_float(as_int(expylogx) + m2); - expylogx = m < -125 ? sexpylogx : texpylogx; - - // Result is +-Inf if (ylogx + ylogx_t) > 128*log2 - expylogx = ((ylogx > 0x1.62e430p+6f) | (ylogx == 0x1.62e430p+6f & ylogx_t > -0x1.05c610p-22f)) ? as_float(PINFBITPATT_SP32) : expylogx; - - // Result is 0 if ylogx < -149*log2 - expylogx = ylogx < -0x1.9d1da0p+6f ? 0.0f : expylogx; - - // Classify y: - // inty = 0 means not an integer. - // inty = 1 means odd integer. - // inty = 2 means even integer. - - int inty = 2 - (ny & 1); - - float signval = as_float((as_uint(expylogx) ^ SIGNBIT_SP32)); - expylogx = ((inty == 1) & !xpos) ? signval : expylogx; - int ret = as_int(expylogx); - - // Corner case handling - ret = (!xpos & (inty == 2)) ? QNANBITPATT_SP32 : ret; - int xinf = xpos ? PINFBITPATT_SP32 : NINFBITPATT_SP32; - ret = ((ax == 0) & !ypos & (inty == 1)) ? xinf : ret; - ret = ((ax == 0) & !ypos & (inty == 2)) ? PINFBITPATT_SP32 : ret; - ret = ((ax == 0) & ypos & (inty == 2)) ? 0 : ret; - int xzero = xpos ? 0 : 0x80000000; - ret = ((ax == 0) & ypos & (inty == 1)) ? xzero : ret; - ret = ((ix == NINFBITPATT_SP32) & ypos & (inty == 1)) ? NINFBITPATT_SP32 : ret; - ret = ((ix == NINFBITPATT_SP32) & !ypos & (inty == 1)) ? 0x80000000 : ret; - ret = ((ix == PINFBITPATT_SP32) & !ypos) ? 0 : ret; - ret = ((ix == PINFBITPATT_SP32) & ypos) ? PINFBITPATT_SP32 : ret; - ret = ax > PINFBITPATT_SP32 ? ix : ret; - ret = ny == 0 ? 
QNANBITPATT_SP32 : ret; - - return as_float(ret); +_CLC_DEF _CLC_OVERLOAD float __clc_rootn(float x, int ny) { + float y = MATH_RECIP((float)ny); + + int ix = as_int(x); + int ax = ix & EXSIGNBIT_SP32; + int xpos = ix == ax; + + int iy = as_int(y); + int ay = iy & EXSIGNBIT_SP32; + int ypos = iy == ay; + + // Extra precise log calculation + // First handle case that x is close to 1 + float r = 1.0f - as_float(ax); + int near1 = __clc_fabs(r) < 0x1.0p-4f; + float r2 = r * r; + + // Coefficients are just 1/3, 1/4, 1/5 and 1/6 + float poly = __clc_mad( + r, + __clc_mad(r, + __clc_mad(r, __clc_mad(r, 0x1.24924ap-3f, 0x1.555556p-3f), + 0x1.99999ap-3f), + 0x1.000000p-2f), + 0x1.555556p-2f); + + poly *= r2 * r; + + float lth_near1 = -r2 * 0.5f; + float ltt_near1 = -poly; + float lt_near1 = lth_near1 + ltt_near1; + float lh_near1 = -r; + float l_near1 = lh_near1 + lt_near1; + + // Computations for x not near 1 + int m = (int)(ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32; + float mf = (float)m; + int ixs = as_int(as_float(ax | 0x3f800000) - 1.0f); + float mfs = (float)((ixs >> EXPSHIFTBITS_SP32) - 253); + int c = m == -127; + int ixn = c ? ixs : ax; + float mfn = c ? mfs : mf; + + int indx = (ixn & 0x007f0000) + ((ixn & 0x00008000) << 1); + + // F - Y + float f = as_float(0x3f000000 | indx) - + as_float(0x3f000000 | (ixn & MANTBITS_SP32)); + + indx = indx >> 16; + float2 tv = USE_TABLE(log_inv_tbl_ep, indx); + float rh = f * tv.s0; + float rt = f * tv.s1; + r = rh + rt; + + poly = __clc_mad(r, __clc_mad(r, 0x1.0p-2f, 0x1.555556p-2f), 0x1.0p-1f) * + (r * r); + poly += (rh - r) + rt; + + const float LOG2_HEAD = 0x1.62e000p-1f; // 0.693115234 + const float LOG2_TAIL = 0x1.0bfbe8p-15f; // 0.0000319461833 + tv = USE_TABLE(loge_tbl, indx); + float lth = -r; + float ltt = __clc_mad(mfn, LOG2_TAIL, -poly) + tv.s1; + float lt = lth + ltt; + float lh = __clc_mad(mfn, LOG2_HEAD, tv.s0); + float l = lh + lt; + + // Select near 1 or not + lth = near1 ? lth_near1 : lth; + ltt = near1 ? ltt_near1 : ltt; + lt = near1 ? lt_near1 : lt; + lh = near1 ? lh_near1 : lh; + l = near1 ? l_near1 : l; + + float gh = as_float(as_int(l) & 0xfffff000); + float gt = ((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh); + + float yh = as_float(iy & 0xfffff000); + + float fny = (float)ny; + float fnyh = as_float(as_int(fny) & 0xfffff000); + float fnyt = (float)(ny - (int)fnyh); + float yt = MATH_DIVIDE(__clc_mad(-fnyt, yh, __clc_mad(-fnyh, yh, 1.0f)), fny); + + float ylogx_s = __clc_mad(gt, yh, __clc_mad(gh, yt, yt * gt)); + float ylogx = __clc_mad(yh, gh, ylogx_s); + float ylogx_t = __clc_mad(yh, gh, -ylogx) + ylogx_s; + + // Extra precise exp of ylogx + const float R_64_BY_LOG2 = 0x1.715476p+6f; // 64/log2 : 92.332482616893657 + int n = convert_int(ylogx * R_64_BY_LOG2); + float nf = (float)n; + + int j = n & 0x3f; + m = n >> 6; + int m2 = m << EXPSHIFTBITS_SP32; + + // log2/64 lead: 0.0108032227 + const float R_LOG2_BY_64_LD = 0x1.620000p-7f; + // log2/64 tail: 0.0000272020388 + const float R_LOG2_BY_64_TL = 0x1.c85fdep-16f; + r = __clc_mad(nf, -R_LOG2_BY_64_TL, __clc_mad(nf, -R_LOG2_BY_64_LD, ylogx)) + + ylogx_t; + + // Truncated Taylor series for e^r + poly = __clc_mad(__clc_mad(__clc_mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r, + 0x1.000000p-1f), + r * r, r); + + tv = USE_TABLE(exp_tbl_ep, j); + + float expylogx = + __clc_mad(tv.s0, poly, __clc_mad(tv.s1, poly, tv.s1)) + tv.s0; + float sexpylogx = __clc_fp32_subnormals_supported() + ? 
expylogx * as_float(0x1 << (m + 149)) + : 0.0f; + + float texpylogx = as_float(as_int(expylogx) + m2); + expylogx = m < -125 ? sexpylogx : texpylogx; + + // Result is +-Inf if (ylogx + ylogx_t) > 128*log2 + expylogx = ((ylogx > 0x1.62e430p+6f) | + (ylogx == 0x1.62e430p+6f & ylogx_t > -0x1.05c610p-22f)) + ? as_float(PINFBITPATT_SP32) + : expylogx; + + // Result is 0 if ylogx < -149*log2 + expylogx = ylogx < -0x1.9d1da0p+6f ? 0.0f : expylogx; + + // Classify y: + // inty = 0 means not an integer. + // inty = 1 means odd integer. + // inty = 2 means even integer. + + int inty = 2 - (ny & 1); + + float signval = as_float((as_uint(expylogx) ^ SIGNBIT_SP32)); + expylogx = ((inty == 1) & !xpos) ? signval : expylogx; + int ret = as_int(expylogx); + + // Corner case handling + ret = (!xpos & (inty == 2)) ? QNANBITPATT_SP32 : ret; + int xinf = xpos ? PINFBITPATT_SP32 : NINFBITPATT_SP32; + ret = ((ax == 0) & !ypos & (inty == 1)) ? xinf : ret; + ret = ((ax == 0) & !ypos & (inty == 2)) ? PINFBITPATT_SP32 : ret; + ret = ((ax == 0) & ypos & (inty == 2)) ? 0 : ret; + int xzero = xpos ? 0 : 0x80000000; + ret = ((ax == 0) & ypos & (inty == 1)) ? xzero : ret; + ret = + ((ix == NINFBITPATT_SP32) & ypos & (inty == 1)) ? NINFBITPATT_SP32 : ret; + ret = ((ix == NINFBITPATT_SP32) & !ypos & (inty == 1)) ? 0x80000000 : ret; + ret = ((ix == PINFBITPATT_SP32) & !ypos) ? 0 : ret; + ret = ((ix == PINFBITPATT_SP32) & ypos) ? PINFBITPATT_SP32 : ret; + ret = ax > PINFBITPATT_SP32 ? ix : ret; + ret = ny == 0 ? QNANBITPATT_SP32 : ret; + + return as_float(ret); } _CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_rootn, float, int) #ifdef cl_khr_fp64 -_CLC_DEF _CLC_OVERLOAD double __clc_rootn(double x, int ny) -{ - const double real_log2_tail = 5.76999904754328540596e-08; - const double real_log2_lead = 6.93147122859954833984e-01; - - double dny = (double)ny; - double y = 1.0 / dny; - - long ux = as_long(x); - long ax = ux & (~SIGNBIT_DP64); - int xpos = ax == ux; - - long uy = as_long(y); - long ay = uy & (~SIGNBIT_DP64); - int ypos = ay == uy; - - // Extended precision log - double v, vt; - { - int exp = (int)(ax >> 52) - 1023; - int mask_exp_1023 = exp == -1023; - double xexp = (double) exp; - long mantissa = ax & 0x000FFFFFFFFFFFFFL; - - long temp_ux = as_long(as_double(0x3ff0000000000000L | mantissa) - 1.0); - exp = ((temp_ux & 0x7FF0000000000000L) >> 52) - 2045; - double xexp1 = (double) exp; - long mantissa1 = temp_ux & 0x000FFFFFFFFFFFFFL; - - xexp = mask_exp_1023 ? xexp1 : xexp; - mantissa = mask_exp_1023 ? 
mantissa1 : mantissa; - - long rax = (mantissa & 0x000ff00000000000) + ((mantissa & 0x0000080000000000) << 1); - int index = rax >> 44; - - double F = as_double(rax | 0x3FE0000000000000L); - double Y = as_double(mantissa | 0x3FE0000000000000L); - double f = F - Y; - double2 tv = USE_TABLE(log_f_inv_tbl, index); - double log_h = tv.s0; - double log_t = tv.s1; - double f_inv = (log_h + log_t) * f; - double r1 = as_double(as_long(f_inv) & 0xfffffffff8000000L); - double r2 = fma(-F, r1, f) * (log_h + log_t); - double r = r1 + r2; - - double poly = fma(r, - fma(r, - fma(r, - fma(r, 1.0/7.0, 1.0/6.0), - 1.0/5.0), - 1.0/4.0), - 1.0/3.0); - poly = poly * r * r * r; - - double hr1r1 = 0.5*r1*r1; - double poly0h = r1 + hr1r1; - double poly0t = r1 - poly0h + hr1r1; - poly = fma(r1, r2, fma(0.5*r2, r2, poly)) + r2 + poly0t; - - tv = USE_TABLE(powlog_tbl, index); - log_h = tv.s0; - log_t = tv.s1; - - double resT_t = fma(xexp, real_log2_tail, + log_t) - poly; - double resT = resT_t - poly0h; - double resH = fma(xexp, real_log2_lead, log_h); - double resT_h = poly0h; - - double H = resT + resH; - double H_h = as_double(as_long(H) & 0xfffffffff8000000L); - double T = (resH - H + resT) + (resT_t - (resT + resT_h)) + (H - H_h); - H = H_h; - - double y_head = as_double(uy & 0xfffffffff8000000L); - double y_tail = y - y_head; - - double fnyh = as_double(as_long(dny) & 0xfffffffffff00000); - double fnyt = (double)(ny - (int)fnyh); - y_tail = fma(-fnyt, y_head, fma(-fnyh, y_head, 1.0))/ dny; - - double temp = fma(y_tail, H, fma(y_head, T, y_tail*T)); - v = fma(y_head, H, temp); - vt = fma(y_head, H, -v) + temp; - } - - // Now calculate exp of (v,vt) - - double expv; - { - const double max_exp_arg = 709.782712893384; - const double min_exp_arg = -745.1332191019411; - const double sixtyfour_by_lnof2 = 92.33248261689366; - const double lnof2_by_64_head = 0.010830424260348081; - const double lnof2_by_64_tail = -4.359010638708991e-10; - - double temp = v * sixtyfour_by_lnof2; - int n = (int)temp; - double dn = (double)n; - int j = n & 0x0000003f; - int m = n >> 6; - - double2 tv = USE_TABLE(two_to_jby64_ep_tbl, j); - double f1 = tv.s0; - double f2 = tv.s1; - double f = f1 + f2; - - double r1 = fma(dn, -lnof2_by_64_head, v); - double r2 = dn * lnof2_by_64_tail; - double r = (r1 + r2) + vt; - - double q = fma(r, - fma(r, - fma(r, - fma(r, 1.38889490863777199667e-03, 8.33336798434219616221e-03), - 4.16666666662260795726e-02), - 1.66666666665260878863e-01), - 5.00000000000000008883e-01); - q = fma(r*r, q, r); - - expv = fma(f, q, f2) + f1; - expv = ldexp(expv, m); - - expv = v > max_exp_arg ? as_double(0x7FF0000000000000L) : expv; - expv = v < min_exp_arg ? 0.0 : expv; - } - - // See whether y is an integer. - // inty = 0 means not an integer. - // inty = 1 means odd integer. - // inty = 2 means even integer. - - int inty = 2 - (ny & 1); - - expv *= ((inty == 1) & !xpos) ? -1.0 : 1.0; - - long ret = as_long(expv); - - // Now all the edge cases - ret = (!xpos & (inty == 2)) ? QNANBITPATT_DP64 : ret; - long xinf = xpos ? PINFBITPATT_DP64 : NINFBITPATT_DP64; - ret = ((ax == 0L) & !ypos & (inty == 1)) ? xinf : ret; - ret = ((ax == 0L) & !ypos & (inty == 2)) ? PINFBITPATT_DP64 : ret; - ret = ((ax == 0L) & ypos & (inty == 2)) ? 0L : ret; - long xzero = xpos ? 0L : 0x8000000000000000L; - ret = ((ax == 0L) & ypos & (inty == 1)) ? xzero : ret; - ret = ((ux == NINFBITPATT_DP64) & ypos & (inty == 1)) ? NINFBITPATT_DP64 : ret; - ret = ((ux == NINFBITPATT_DP64) & !ypos & (inty == 1)) ? 
0x8000000000000000L : ret; - ret = ((ux == PINFBITPATT_DP64) & !ypos) ? 0L : ret; - ret = ((ux == PINFBITPATT_DP64) & ypos) ? PINFBITPATT_DP64 : ret; - ret = ax > PINFBITPATT_DP64 ? ux : ret; - ret = ny == 0 ? QNANBITPATT_DP64 : ret; - return as_double(ret); +_CLC_DEF _CLC_OVERLOAD double __clc_rootn(double x, int ny) { + const double real_log2_tail = 5.76999904754328540596e-08; + const double real_log2_lead = 6.93147122859954833984e-01; + + double dny = (double)ny; + double y = 1.0 / dny; + + long ux = as_long(x); + long ax = ux & (~SIGNBIT_DP64); + int xpos = ax == ux; + + long uy = as_long(y); + long ay = uy & (~SIGNBIT_DP64); + int ypos = ay == uy; + + // Extended precision log + double v, vt; + { + int exp = (int)(ax >> 52) - 1023; + int mask_exp_1023 = exp == -1023; + double xexp = (double)exp; + long mantissa = ax & 0x000FFFFFFFFFFFFFL; + + long temp_ux = as_long(as_double(0x3ff0000000000000L | mantissa) - 1.0); + exp = ((temp_ux & 0x7FF0000000000000L) >> 52) - 2045; + double xexp1 = (double)exp; + long mantissa1 = temp_ux & 0x000FFFFFFFFFFFFFL; + + xexp = mask_exp_1023 ? xexp1 : xexp; + mantissa = mask_exp_1023 ? mantissa1 : mantissa; + + long rax = (mantissa & 0x000ff00000000000) + + ((mantissa & 0x0000080000000000) << 1); + int index = rax >> 44; + + double F = as_double(rax | 0x3FE0000000000000L); + double Y = as_double(mantissa | 0x3FE0000000000000L); + double f = F - Y; + double2 tv = USE_TABLE(log_f_inv_tbl, index); + double log_h = tv.s0; + double log_t = tv.s1; + double f_inv = (log_h + log_t) * f; + double r1 = as_double(as_long(f_inv) & 0xfffffffff8000000L); + double r2 = fma(-F, r1, f) * (log_h + log_t); + double r = r1 + r2; + + double poly = fma( + r, fma(r, fma(r, fma(r, 1.0 / 7.0, 1.0 / 6.0), 1.0 / 5.0), 1.0 / 4.0), + 1.0 / 3.0); + poly = poly * r * r * r; + + double hr1r1 = 0.5 * r1 * r1; + double poly0h = r1 + hr1r1; + double poly0t = r1 - poly0h + hr1r1; + poly = fma(r1, r2, fma(0.5 * r2, r2, poly)) + r2 + poly0t; + + tv = USE_TABLE(powlog_tbl, index); + log_h = tv.s0; + log_t = tv.s1; + + double resT_t = fma(xexp, real_log2_tail, +log_t) - poly; + double resT = resT_t - poly0h; + double resH = fma(xexp, real_log2_lead, log_h); + double resT_h = poly0h; + + double H = resT + resH; + double H_h = as_double(as_long(H) & 0xfffffffff8000000L); + double T = (resH - H + resT) + (resT_t - (resT + resT_h)) + (H - H_h); + H = H_h; + + double y_head = as_double(uy & 0xfffffffff8000000L); + double y_tail = y - y_head; + + double fnyh = as_double(as_long(dny) & 0xfffffffffff00000); + double fnyt = (double)(ny - (int)fnyh); + y_tail = fma(-fnyt, y_head, fma(-fnyh, y_head, 1.0)) / dny; + + double temp = fma(y_tail, H, fma(y_head, T, y_tail * T)); + v = fma(y_head, H, temp); + vt = fma(y_head, H, -v) + temp; + } + + // Now calculate exp of (v,vt) + + double expv; + { + const double max_exp_arg = 709.782712893384; + const double min_exp_arg = -745.1332191019411; + const double sixtyfour_by_lnof2 = 92.33248261689366; + const double lnof2_by_64_head = 0.010830424260348081; + const double lnof2_by_64_tail = -4.359010638708991e-10; + + double temp = v * sixtyfour_by_lnof2; + int n = (int)temp; + double dn = (double)n; + int j = n & 0x0000003f; + int m = n >> 6; + + double2 tv = USE_TABLE(two_to_jby64_ep_tbl, j); + double f1 = tv.s0; + double f2 = tv.s1; + double f = f1 + f2; + + double r1 = fma(dn, -lnof2_by_64_head, v); + double r2 = dn * lnof2_by_64_tail; + double r = (r1 + r2) + vt; + + double q = fma( + r, + fma(r, + fma(r, + fma(r, 1.38889490863777199667e-03, 
8.33336798434219616221e-03), + 4.16666666662260795726e-02), + 1.66666666665260878863e-01), + 5.00000000000000008883e-01); + q = fma(r * r, q, r); + + expv = fma(f, q, f2) + f1; + expv = ldexp(expv, m); + + expv = v > max_exp_arg ? as_double(0x7FF0000000000000L) : expv; + expv = v < min_exp_arg ? 0.0 : expv; + } + + // See whether y is an integer. + // inty = 0 means not an integer. + // inty = 1 means odd integer. + // inty = 2 means even integer. + + int inty = 2 - (ny & 1); + + expv *= ((inty == 1) & !xpos) ? -1.0 : 1.0; + + long ret = as_long(expv); + + // Now all the edge cases + ret = (!xpos & (inty == 2)) ? QNANBITPATT_DP64 : ret; + long xinf = xpos ? PINFBITPATT_DP64 : NINFBITPATT_DP64; + ret = ((ax == 0L) & !ypos & (inty == 1)) ? xinf : ret; + ret = ((ax == 0L) & !ypos & (inty == 2)) ? PINFBITPATT_DP64 : ret; + ret = ((ax == 0L) & ypos & (inty == 2)) ? 0L : ret; + long xzero = xpos ? 0L : 0x8000000000000000L; + ret = ((ax == 0L) & ypos & (inty == 1)) ? xzero : ret; + ret = + ((ux == NINFBITPATT_DP64) & ypos & (inty == 1)) ? NINFBITPATT_DP64 : ret; + ret = ((ux == NINFBITPATT_DP64) & !ypos & (inty == 1)) ? 0x8000000000000000L + : ret; + ret = ((ux == PINFBITPATT_DP64) & !ypos) ? 0L : ret; + ret = ((ux == PINFBITPATT_DP64) & ypos) ? PINFBITPATT_DP64 : ret; + ret = ax > PINFBITPATT_DP64 ? ux : ret; + ret = ny == 0 ? QNANBITPATT_DP64 : ret; + return as_double(ret); } _CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_rootn, double, int) #endif diff --git libclc/generic/lib/math/mad.cl libclc/generic/lib/math/mad.cl index 86bc70d94bea..94012ab3df25 100644 --- libclc/generic/lib/math/mad.cl +++ libclc/generic/lib/math/mad.cl @@ -1,4 +1,19 @@ #include <clc/clc.h> +#include <clc/clcmacro.h> +#include <clc/math/clc_mad.h> -#define __CLC_BODY <mad.inc> -#include <clc/math/gentype.inc> +_CLC_DEFINE_TERNARY_BUILTIN(float, mad, __clc_mad, float, float, float) + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +_CLC_DEFINE_TERNARY_BUILTIN(double, mad, __clc_mad, double, double, double) + +#endif + +#ifdef cl_khr_fp16 +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +_CLC_DEFINE_TERNARY_BUILTIN(half, mad, __clc_mad, half, half, half) + +#endif diff --git libclc/generic/lib/math/mad.inc libclc/generic/lib/math/mad.inc deleted file mode 100644 index d32c7839d1b9..000000000000 --- libclc/generic/lib/math/mad.inc +++ /dev/null @@ -1,3 +0,0 @@ -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE mad(__CLC_GENTYPE a, __CLC_GENTYPE b, __CLC_GENTYPE c) { - return a * b + c; -} diff --git libclc/generic/lib/math/sincos_helpers.cl libclc/generic/lib/math/sincos_helpers.cl index 0adecf6978bc..e291e81ed980 100644 --- libclc/generic/lib/math/sincos_helpers.cl +++ libclc/generic/lib/math/sincos_helpers.cl @@ -21,307 +21,319 @@ */ #include <clc/clc.h> +#include <clc/math/clc_mad.h> +#include <clc/math/clc_trunc.h> #include <clc/shared/clc_max.h> #include "math.h" -#include "tables.h" #include "sincos_helpers.h" +#include "tables.h" -#define bitalign(hi, lo, shift) \ - ((hi) << (32 - (shift))) | ((lo) >> (shift)); +#define bitalign(hi, lo, shift) ((hi) << (32 - (shift))) | ((lo) >> (shift)); -#define bytealign(src0, src1, src2) \ - ((uint) (((((long)(src0)) << 32) | (long)(src1)) >> (((src2) & 3)*8))) +#define bytealign(src0, src1, src2) \ + ((uint)(((((long)(src0)) << 32) | (long)(src1)) >> (((src2) & 3) * 8))) _CLC_DEF float __clc_sinf_piby4(float x, float y) { - // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ... - // = x * (1 - x^2/3! + x^4/5! - x^6/7! ... 
- // = x * f(w) - // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ... - // We use a minimax approximation of (f(w) - 1) / w - // because this produces an expansion in even powers of x. - - const float c1 = -0.1666666666e0f; - const float c2 = 0.8333331876e-2f; - const float c3 = -0.198400874e-3f; - const float c4 = 0.272500015e-5f; - const float c5 = -2.5050759689e-08f; // 0xb2d72f34 - const float c6 = 1.5896910177e-10f; // 0x2f2ec9d3 - - float z = x * x; - float v = z * x; - float r = mad(z, mad(z, mad(z, mad(z, c6, c5), c4), c3), c2); - float ret = x - mad(v, -c1, mad(z, mad(y, 0.5f, -v*r), -y)); - - return ret; + // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ... + // = x * (1 - x^2/3! + x^4/5! - x^6/7! ... + // = x * f(w) + // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ... + // We use a minimax approximation of (f(w) - 1) / w + // because this produces an expansion in even powers of x. + + const float c1 = -0.1666666666e0f; + const float c2 = 0.8333331876e-2f; + const float c3 = -0.198400874e-3f; + const float c4 = 0.272500015e-5f; + const float c5 = -2.5050759689e-08f; // 0xb2d72f34 + const float c6 = 1.5896910177e-10f; // 0x2f2ec9d3 + + float z = x * x; + float v = z * x; + float r = __clc_mad( + z, __clc_mad(z, __clc_mad(z, __clc_mad(z, c6, c5), c4), c3), c2); + float ret = + x - __clc_mad(v, -c1, __clc_mad(z, __clc_mad(y, 0.5f, -v * r), -y)); + + return ret; } _CLC_DEF float __clc_cosf_piby4(float x, float y) { - // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ... - // = f(w) - // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ... - // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w) - // because this produces an expansion in even powers of x. - - const float c1 = 0.416666666e-1f; - const float c2 = -0.138888876e-2f; - const float c3 = 0.248006008e-4f; - const float c4 = -0.2730101334e-6f; - const float c5 = 2.0875723372e-09f; // 0x310f74f6 - const float c6 = -1.1359647598e-11f; // 0xad47d74e - - float z = x * x; - float r = z * mad(z, mad(z, mad(z, mad(z, mad(z, c6, c5), c4), c3), c2), c1); - - // if |x| < 0.3 - float qx = 0.0f; - - int ix = as_int(x) & EXSIGNBIT_SP32; - - // 0.78125 > |x| >= 0.3 - float xby4 = as_float(ix - 0x01000000); - qx = (ix >= 0x3e99999a) & (ix <= 0x3f480000) ? xby4 : qx; - - // x > 0.78125 - qx = ix > 0x3f480000 ? 0.28125f : qx; - - float hz = mad(z, 0.5f, -qx); - float a = 1.0f - qx; - float ret = a - (hz - mad(z, r, -x*y)); - return ret; + // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ... + // = f(w) + // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ... + // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w) + // because this produces an expansion in even powers of x. + + const float c1 = 0.416666666e-1f; + const float c2 = -0.138888876e-2f; + const float c3 = 0.248006008e-4f; + const float c4 = -0.2730101334e-6f; + const float c5 = 2.0875723372e-09f; // 0x310f74f6 + const float c6 = -1.1359647598e-11f; // 0xad47d74e + + float z = x * x; + float r = + z * + __clc_mad( + z, + __clc_mad(z, __clc_mad(z, __clc_mad(z, __clc_mad(z, c6, c5), c4), c3), + c2), + c1); + + // if |x| < 0.3 + float qx = 0.0f; + + int ix = as_int(x) & EXSIGNBIT_SP32; + + // 0.78125 > |x| >= 0.3 + float xby4 = as_float(ix - 0x01000000); + qx = (ix >= 0x3e99999a) & (ix <= 0x3f480000) ? xby4 : qx; + + // x > 0.78125 + qx = ix > 0x3f480000 ? 
0.28125f : qx; + + float hz = __clc_mad(z, 0.5f, -qx); + float a = 1.0f - qx; + float ret = a - (hz - __clc_mad(z, r, -x * y)); + return ret; } -_CLC_DEF float __clc_tanf_piby4(float x, int regn) -{ - // Core Remez [1,2] approximation to tan(x) on the interval [0,pi/4]. - float r = x * x; +_CLC_DEF float __clc_tanf_piby4(float x, int regn) { + // Core Remez [1,2] approximation to tan(x) on the interval [0,pi/4]. + float r = x * x; - float a = mad(r, -0.0172032480471481694693109f, 0.385296071263995406715129f); + float a = + __clc_mad(r, -0.0172032480471481694693109f, 0.385296071263995406715129f); - float b = mad(r, - mad(r, 0.01844239256901656082986661f, -0.51396505478854532132342f), - 1.15588821434688393452299f); + float b = __clc_mad( + r, + __clc_mad(r, 0.01844239256901656082986661f, -0.51396505478854532132342f), + 1.15588821434688393452299f); - float t = mad(x*r, native_divide(a, b), x); - float tr = -MATH_RECIP(t); + float t = __clc_mad(x * r, native_divide(a, b), x); + float tr = -MATH_RECIP(t); - return regn & 1 ? tr : t; + return regn & 1 ? tr : t; } -_CLC_DEF void __clc_fullMulS(float *hi, float *lo, float a, float b, float bh, float bt) -{ - if (HAVE_HW_FMA32()) { - float ph = a * b; - *hi = ph; - *lo = fma(a, b, -ph); - } else { - float ah = as_float(as_uint(a) & 0xfffff000U); - float at = a - ah; - float ph = a * b; - float pt = mad(at, bt, mad(at, bh, mad(ah, bt, mad(ah, bh, -ph)))); - *hi = ph; - *lo = pt; - } +_CLC_DEF void __clc_fullMulS(float *hi, float *lo, float a, float b, float bh, + float bt) { + if (HAVE_HW_FMA32()) { + float ph = a * b; + *hi = ph; + *lo = fma(a, b, -ph); + } else { + float ah = as_float(as_uint(a) & 0xfffff000U); + float at = a - ah; + float ph = a * b; + float pt = __clc_mad( + at, bt, __clc_mad(at, bh, __clc_mad(ah, bt, __clc_mad(ah, bh, -ph)))); + *hi = ph; + *lo = pt; + } } -_CLC_DEF float __clc_removePi2S(float *hi, float *lo, float x) -{ - // 72 bits of pi/2 - const float fpiby2_1 = (float) 0xC90FDA / 0x1.0p+23f; - const float fpiby2_1_h = (float) 0xC90 / 0x1.0p+11f; - const float fpiby2_1_t = (float) 0xFDA / 0x1.0p+23f; +_CLC_DEF float __clc_removePi2S(float *hi, float *lo, float x) { + // 72 bits of pi/2 + const float fpiby2_1 = (float)0xC90FDA / 0x1.0p+23f; + const float fpiby2_1_h = (float)0xC90 / 0x1.0p+11f; + const float fpiby2_1_t = (float)0xFDA / 0x1.0p+23f; - const float fpiby2_2 = (float) 0xA22168 / 0x1.0p+47f; - const float fpiby2_2_h = (float) 0xA22 / 0x1.0p+35f; - const float fpiby2_2_t = (float) 0x168 / 0x1.0p+47f; + const float fpiby2_2 = (float)0xA22168 / 0x1.0p+47f; + const float fpiby2_2_h = (float)0xA22 / 0x1.0p+35f; + const float fpiby2_2_t = (float)0x168 / 0x1.0p+47f; - const float fpiby2_3 = (float) 0xC234C4 / 0x1.0p+71f; - const float fpiby2_3_h = (float) 0xC23 / 0x1.0p+59f; - const float fpiby2_3_t = (float) 0x4C4 / 0x1.0p+71f; + const float fpiby2_3 = (float)0xC234C4 / 0x1.0p+71f; + const float fpiby2_3_h = (float)0xC23 / 0x1.0p+59f; + const float fpiby2_3_t = (float)0x4C4 / 0x1.0p+71f; - const float twobypi = 0x1.45f306p-1f; + const float twobypi = 0x1.45f306p-1f; - float fnpi2 = trunc(mad(x, twobypi, 0.5f)); + float fnpi2 = __clc_trunc(__clc_mad(x, twobypi, 0.5f)); - // subtract n * pi/2 from x - float rhead, rtail; - __clc_fullMulS(&rhead, &rtail, fnpi2, fpiby2_1, fpiby2_1_h, fpiby2_1_t); - float v = x - rhead; - float rem = v + (((x - v) - rhead) - rtail); + // subtract n * pi/2 from x + float rhead, rtail; + __clc_fullMulS(&rhead, &rtail, fnpi2, fpiby2_1, fpiby2_1_h, fpiby2_1_t); + float v = x - rhead; + float rem 
= v + (((x - v) - rhead) - rtail); - float rhead2, rtail2; - __clc_fullMulS(&rhead2, &rtail2, fnpi2, fpiby2_2, fpiby2_2_h, fpiby2_2_t); - v = rem - rhead2; - rem = v + (((rem - v) - rhead2) - rtail2); + float rhead2, rtail2; + __clc_fullMulS(&rhead2, &rtail2, fnpi2, fpiby2_2, fpiby2_2_h, fpiby2_2_t); + v = rem - rhead2; + rem = v + (((rem - v) - rhead2) - rtail2); - float rhead3, rtail3; - __clc_fullMulS(&rhead3, &rtail3, fnpi2, fpiby2_3, fpiby2_3_h, fpiby2_3_t); - v = rem - rhead3; + float rhead3, rtail3; + __clc_fullMulS(&rhead3, &rtail3, fnpi2, fpiby2_3, fpiby2_3_h, fpiby2_3_t); + v = rem - rhead3; - *hi = v + ((rem - v) - rhead3); - *lo = -rtail3; - return fnpi2; + *hi = v + ((rem - v) - rhead3); + *lo = -rtail3; + return fnpi2; } -_CLC_DEF int __clc_argReductionSmallS(float *r, float *rr, float x) -{ - float fnpi2 = __clc_removePi2S(r, rr, x); - return (int)fnpi2 & 0x3; +_CLC_DEF int __clc_argReductionSmallS(float *r, float *rr, float x) { + float fnpi2 = __clc_removePi2S(r, rr, x); + return (int)fnpi2 & 0x3; } -#define FULL_MUL(A, B, HI, LO) \ - LO = A * B; \ - HI = mul_hi(A, B) - -#define FULL_MAD(A, B, C, HI, LO) \ - LO = ((A) * (B) + (C)); \ - HI = mul_hi(A, B); \ - HI += LO < C - -_CLC_DEF int __clc_argReductionLargeS(float *r, float *rr, float x) -{ - int xe = (int)(as_uint(x) >> 23) - 127; - uint xm = 0x00800000U | (as_uint(x) & 0x7fffffU); - - // 224 bits of 2/PI: . A2F9836E 4E441529 FC2757D1 F534DDC0 DB629599 3C439041 FE5163AB - const uint b6 = 0xA2F9836EU; - const uint b5 = 0x4E441529U; - const uint b4 = 0xFC2757D1U; - const uint b3 = 0xF534DDC0U; - const uint b2 = 0xDB629599U; - const uint b1 = 0x3C439041U; - const uint b0 = 0xFE5163ABU; - - uint p0, p1, p2, p3, p4, p5, p6, p7, c0, c1; - - FULL_MUL(xm, b0, c0, p0); - FULL_MAD(xm, b1, c0, c1, p1); - FULL_MAD(xm, b2, c1, c0, p2); - FULL_MAD(xm, b3, c0, c1, p3); - FULL_MAD(xm, b4, c1, c0, p4); - FULL_MAD(xm, b5, c0, c1, p5); - FULL_MAD(xm, b6, c1, p7, p6); - - uint fbits = 224 + 23 - xe; - - // shift amount to get 2 lsb of integer part at top 2 bits - // min: 25 (xe=18) max: 134 (xe=127) - uint shift = 256U - 2 - fbits; - - // Shift by up to 134/32 = 4 words - int c = shift > 31; - p7 = c ? p6 : p7; - p6 = c ? p5 : p6; - p5 = c ? p4 : p5; - p4 = c ? p3 : p4; - p3 = c ? p2 : p3; - p2 = c ? p1 : p2; - p1 = c ? p0 : p1; - shift -= (-c) & 32; - - c = shift > 31; - p7 = c ? p6 : p7; - p6 = c ? p5 : p6; - p5 = c ? p4 : p5; - p4 = c ? p3 : p4; - p3 = c ? p2 : p3; - p2 = c ? p1 : p2; - shift -= (-c) & 32; - - c = shift > 31; - p7 = c ? p6 : p7; - p6 = c ? p5 : p6; - p5 = c ? p4 : p5; - p4 = c ? p3 : p4; - p3 = c ? p2 : p3; - shift -= (-c) & 32; - - c = shift > 31; - p7 = c ? p6 : p7; - p6 = c ? p5 : p6; - p5 = c ? p4 : p5; - p4 = c ? p3 : p4; - shift -= (-c) & 32; - - // bitalign cannot handle a shift of 32 - c = shift > 0; - shift = 32 - shift; - uint t7 = bitalign(p7, p6, shift); - uint t6 = bitalign(p6, p5, shift); - uint t5 = bitalign(p5, p4, shift); - p7 = c ? t7 : p7; - p6 = c ? t6 : p6; - p5 = c ? t5 : p5; - - // Get 2 lsb of int part and msb of fraction - int i = p7 >> 29; - - // Scoot up 2 more bits so only fraction remains - p7 = bitalign(p7, p6, 30); - p6 = bitalign(p6, p5, 30); - p5 = bitalign(p5, p4, 30); - - // Subtract 1 if msb of fraction is 1, i.e. fraction >= 0.5 - uint flip = i & 1 ? 0xffffffffU : 0U; - uint sign = i & 1 ? 
0x80000000U : 0U; - p7 = p7 ^ flip; - p6 = p6 ^ flip; - p5 = p5 ^ flip; - - // Find exponent and shift away leading zeroes and hidden bit - xe = clz(p7) + 1; - shift = 32 - xe; - p7 = bitalign(p7, p6, shift); - p6 = bitalign(p6, p5, shift); - - // Most significant part of fraction - float q1 = as_float(sign | ((127 - xe) << 23) | (p7 >> 9)); - - // Shift out bits we captured on q1 - p7 = bitalign(p7, p6, 32-23); - - // Get 24 more bits of fraction in another float, there are not long strings of zeroes here - int xxe = clz(p7) + 1; - p7 = bitalign(p7, p6, 32-xxe); - float q0 = as_float(sign | ((127 - (xe + 23 + xxe)) << 23) | (p7 >> 9)); - - // At this point, the fraction q1 + q0 is correct to at least 48 bits - // Now we need to multiply the fraction by pi/2 - // This loses us about 4 bits - // pi/2 = C90 FDA A22 168 C23 4C4 - - const float pio2h = (float)0xc90fda / 0x1.0p+23f; - const float pio2hh = (float)0xc90 / 0x1.0p+11f; - const float pio2ht = (float)0xfda / 0x1.0p+23f; - const float pio2t = (float)0xa22168 / 0x1.0p+47f; - - float rh, rt; - - if (HAVE_HW_FMA32()) { - rh = q1 * pio2h; - rt = fma(q0, pio2h, fma(q1, pio2t, fma(q1, pio2h, -rh))); - } else { - float q1h = as_float(as_uint(q1) & 0xfffff000); - float q1t = q1 - q1h; - rh = q1 * pio2h; - rt = mad(q1t, pio2ht, mad(q1t, pio2hh, mad(q1h, pio2ht, mad(q1h, pio2hh, -rh)))); - rt = mad(q0, pio2h, mad(q1, pio2t, rt)); - } - - float t = rh + rt; - rt = rt - (t - rh); - - *r = t; - *rr = rt; - return ((i >> 1) + (i & 1)) & 0x3; +#define FULL_MUL(A, B, HI, LO) \ + LO = A * B; \ + HI = mul_hi(A, B) + +#define FULL_MAD(A, B, C, HI, LO) \ + LO = ((A) * (B) + (C)); \ + HI = mul_hi(A, B); \ + HI += LO < C + +_CLC_DEF int __clc_argReductionLargeS(float *r, float *rr, float x) { + int xe = (int)(as_uint(x) >> 23) - 127; + uint xm = 0x00800000U | (as_uint(x) & 0x7fffffU); + + // 224 bits of 2/PI: . A2F9836E 4E441529 FC2757D1 F534DDC0 DB629599 3C439041 + // FE5163AB + const uint b6 = 0xA2F9836EU; + const uint b5 = 0x4E441529U; + const uint b4 = 0xFC2757D1U; + const uint b3 = 0xF534DDC0U; + const uint b2 = 0xDB629599U; + const uint b1 = 0x3C439041U; + const uint b0 = 0xFE5163ABU; + + uint p0, p1, p2, p3, p4, p5, p6, p7, c0, c1; + + FULL_MUL(xm, b0, c0, p0); + FULL_MAD(xm, b1, c0, c1, p1); + FULL_MAD(xm, b2, c1, c0, p2); + FULL_MAD(xm, b3, c0, c1, p3); + FULL_MAD(xm, b4, c1, c0, p4); + FULL_MAD(xm, b5, c0, c1, p5); + FULL_MAD(xm, b6, c1, p7, p6); + + uint fbits = 224 + 23 - xe; + + // shift amount to get 2 lsb of integer part at top 2 bits + // min: 25 (xe=18) max: 134 (xe=127) + uint shift = 256U - 2 - fbits; + + // Shift by up to 134/32 = 4 words + int c = shift > 31; + p7 = c ? p6 : p7; + p6 = c ? p5 : p6; + p5 = c ? p4 : p5; + p4 = c ? p3 : p4; + p3 = c ? p2 : p3; + p2 = c ? p1 : p2; + p1 = c ? p0 : p1; + shift -= (-c) & 32; + + c = shift > 31; + p7 = c ? p6 : p7; + p6 = c ? p5 : p6; + p5 = c ? p4 : p5; + p4 = c ? p3 : p4; + p3 = c ? p2 : p3; + p2 = c ? p1 : p2; + shift -= (-c) & 32; + + c = shift > 31; + p7 = c ? p6 : p7; + p6 = c ? p5 : p6; + p5 = c ? p4 : p5; + p4 = c ? p3 : p4; + p3 = c ? p2 : p3; + shift -= (-c) & 32; + + c = shift > 31; + p7 = c ? p6 : p7; + p6 = c ? p5 : p6; + p5 = c ? p4 : p5; + p4 = c ? p3 : p4; + shift -= (-c) & 32; + + // bitalign cannot handle a shift of 32 + c = shift > 0; + shift = 32 - shift; + uint t7 = bitalign(p7, p6, shift); + uint t6 = bitalign(p6, p5, shift); + uint t5 = bitalign(p5, p4, shift); + p7 = c ? t7 : p7; + p6 = c ? t6 : p6; + p5 = c ? 
t5 : p5; + + // Get 2 lsb of int part and msb of fraction + int i = p7 >> 29; + + // Scoot up 2 more bits so only fraction remains + p7 = bitalign(p7, p6, 30); + p6 = bitalign(p6, p5, 30); + p5 = bitalign(p5, p4, 30); + + // Subtract 1 if msb of fraction is 1, i.e. fraction >= 0.5 + uint flip = i & 1 ? 0xffffffffU : 0U; + uint sign = i & 1 ? 0x80000000U : 0U; + p7 = p7 ^ flip; + p6 = p6 ^ flip; + p5 = p5 ^ flip; + + // Find exponent and shift away leading zeroes and hidden bit + xe = clz(p7) + 1; + shift = 32 - xe; + p7 = bitalign(p7, p6, shift); + p6 = bitalign(p6, p5, shift); + + // Most significant part of fraction + float q1 = as_float(sign | ((127 - xe) << 23) | (p7 >> 9)); + + // Shift out bits we captured on q1 + p7 = bitalign(p7, p6, 32 - 23); + + // Get 24 more bits of fraction in another float, there are not long strings + // of zeroes here + int xxe = clz(p7) + 1; + p7 = bitalign(p7, p6, 32 - xxe); + float q0 = as_float(sign | ((127 - (xe + 23 + xxe)) << 23) | (p7 >> 9)); + + // At this point, the fraction q1 + q0 is correct to at least 48 bits + // Now we need to multiply the fraction by pi/2 + // This loses us about 4 bits + // pi/2 = C90 FDA A22 168 C23 4C4 + + const float pio2h = (float)0xc90fda / 0x1.0p+23f; + const float pio2hh = (float)0xc90 / 0x1.0p+11f; + const float pio2ht = (float)0xfda / 0x1.0p+23f; + const float pio2t = (float)0xa22168 / 0x1.0p+47f; + + float rh, rt; + + if (HAVE_HW_FMA32()) { + rh = q1 * pio2h; + rt = fma(q0, pio2h, fma(q1, pio2t, fma(q1, pio2h, -rh))); + } else { + float q1h = as_float(as_uint(q1) & 0xfffff000); + float q1t = q1 - q1h; + rh = q1 * pio2h; + rt = __clc_mad( + q1t, pio2ht, + __clc_mad(q1t, pio2hh, + __clc_mad(q1h, pio2ht, __clc_mad(q1h, pio2hh, -rh)))); + rt = __clc_mad(q0, pio2h, __clc_mad(q1, pio2t, rt)); + } + + float t = rh + rt; + rt = rt - (t - rh); + + *r = t; + *rr = rt; + return ((i >> 1) + (i & 1)) & 0x3; } -_CLC_DEF int __clc_argReductionS(float *r, float *rr, float x) -{ - if (x < 0x1.0p+23f) - return __clc_argReductionSmallS(r, rr, x); - else - return __clc_argReductionLargeS(r, rr, x); +_CLC_DEF int __clc_argReductionS(float *r, float *rr, float x) { + if (x < 0x1.0p+23f) + return __clc_argReductionSmallS(r, rr, x); + else + return __clc_argReductionLargeS(r, rr, x); } #ifdef cl_khr_fp64 @@ -329,39 +341,44 @@ _CLC_DEF int __clc_argReductionS(float *r, float *rr, float x) #pragma OPENCL EXTENSION cl_khr_fp64 : enable // Reduction for medium sized arguments -_CLC_DEF void __clc_remainder_piby2_medium(double x, double *r, double *rr, int *regn) { - // How many pi/2 is x a multiple of? 
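// Illustrative sketch, not part of this patch: the reflowed code below
// counts multiples of pi/2 in x and subtracts them in several exactly
// representable pieces. A minimal host-C rendering of the same idea,
// assuming only <math.h> fma/trunc and a two-double split of pi/2 (the
// library code uses a three-piece, 159-bit split for more accuracy):
#include <math.h>
static double piby2_reduce_sketch(double x, int *regn) {
  const double two_by_pi = 0x1.45f306dc9c883p-1; // 2/pi
  double dn = trunc(fma(x, two_by_pi, 0.5));     // how many pi/2 fit in x
  const double piby2_h = 0x1.921fb54442d18p+0;   // pi/2, head
  const double piby2_t = 0x1.1a62633145c07p-54;  // pi/2, tail
  double r = fma(-dn, piby2_h, x);               // subtract the head
  r = fma(-dn, piby2_t, r);                      // fold in the tail
  *regn = (int)(long)dn & 0x3;                   // quadrant mod 4
  return r; // approximately in [-pi/4, pi/4] for moderate x
}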
- const double two_by_pi = 0x1.45f306dc9c883p-1; - double dnpi2 = trunc(fma(x, two_by_pi, 0.5)); - - const double piby2_h = -7074237752028440.0 / 0x1.0p+52; - const double piby2_m = -2483878800010755.0 / 0x1.0p+105; - const double piby2_t = -3956492004828932.0 / 0x1.0p+158; - - // Compute product of npi2 with 159 bits of 2/pi - double p_hh = piby2_h * dnpi2; - double p_ht = fma(piby2_h, dnpi2, -p_hh); - double p_mh = piby2_m * dnpi2; - double p_mt = fma(piby2_m, dnpi2, -p_mh); - double p_th = piby2_t * dnpi2; - double p_tt = fma(piby2_t, dnpi2, -p_th); - - // Reduce to 159 bits - double ph = p_hh; - double pm = p_ht + p_mh; - double t = p_mh - (pm - p_ht); - double pt = p_th + t + p_mt + p_tt; - t = ph + pm; pm = pm - (t - ph); ph = t; - t = pm + pt; pt = pt - (t - pm); pm = t; - - // Subtract from x - t = x + ph; - double qh = t + pm; - double qt = pm - (qh - t) + pt; - - *r = qh; - *rr = qt; - *regn = (int)(long)dnpi2 & 0x3; +_CLC_DEF void __clc_remainder_piby2_medium(double x, double *r, double *rr, + int *regn) { + // How many pi/2 is x a multiple of? + const double two_by_pi = 0x1.45f306dc9c883p-1; + double dnpi2 = __clc_trunc(fma(x, two_by_pi, 0.5)); + + const double piby2_h = -7074237752028440.0 / 0x1.0p+52; + const double piby2_m = -2483878800010755.0 / 0x1.0p+105; + const double piby2_t = -3956492004828932.0 / 0x1.0p+158; + + // Compute product of npi2 with 159 bits of 2/pi + double p_hh = piby2_h * dnpi2; + double p_ht = fma(piby2_h, dnpi2, -p_hh); + double p_mh = piby2_m * dnpi2; + double p_mt = fma(piby2_m, dnpi2, -p_mh); + double p_th = piby2_t * dnpi2; + double p_tt = fma(piby2_t, dnpi2, -p_th); + + // Reduce to 159 bits + double ph = p_hh; + double pm = p_ht + p_mh; + double t = p_mh - (pm - p_ht); + double pt = p_th + t + p_mt + p_tt; + t = ph + pm; + pm = pm - (t - ph); + ph = t; + t = pm + pt; + pt = pt - (t - pm); + pm = t; + + // Subtract from x + t = x + ph; + double qh = t + pm; + double qt = pm - (qh - t) + pt; + + *r = qh; + *rr = qt; + *regn = (int)(long)dnpi2 & 0x3; } // Given positive argument x, reduce it to the range [-pi/4,pi/4] using @@ -369,195 +386,208 @@ _CLC_DEF void __clc_remainder_piby2_medium(double x, double *r, double *rr, int // Return value "regn" tells how many lots of pi/2 were subtracted // from x to put it in the range [-pi/4,pi/4], mod 4. -_CLC_DEF void __clc_remainder_piby2_large(double x, double *r, double *rr, int *regn) { - - long ux = as_long(x); - int e = (int)(ux >> 52) - 1023; - int i = __clc_max(23, (e >> 3) + 17); - int j = 150 - i; - int j16 = j & ~0xf; - double fract_temp; - - // The following extracts 192 consecutive bits of 2/pi aligned on an arbitrary byte boundary - uint4 q0 = USE_TABLE(pibits_tbl, j16); - uint4 q1 = USE_TABLE(pibits_tbl, (j16 + 16)); - uint4 q2 = USE_TABLE(pibits_tbl, (j16 + 32)); - - int k = (j >> 2) & 0x3; - int4 c = (int4)k == (int4)(0, 1, 2, 3); - - uint u0, u1, u2, u3, u4, u5, u6; - - u0 = c.s1 ? q0.s1 : q0.s0; - u0 = c.s2 ? q0.s2 : u0; - u0 = c.s3 ? q0.s3 : u0; - - u1 = c.s1 ? q0.s2 : q0.s1; - u1 = c.s2 ? q0.s3 : u1; - u1 = c.s3 ? q1.s0 : u1; - - u2 = c.s1 ? q0.s3 : q0.s2; - u2 = c.s2 ? q1.s0 : u2; - u2 = c.s3 ? q1.s1 : u2; - - u3 = c.s1 ? q1.s0 : q0.s3; - u3 = c.s2 ? q1.s1 : u3; - u3 = c.s3 ? q1.s2 : u3; - - u4 = c.s1 ? q1.s1 : q1.s0; - u4 = c.s2 ? q1.s2 : u4; - u4 = c.s3 ? q1.s3 : u4; - - u5 = c.s1 ? q1.s2 : q1.s1; - u5 = c.s2 ? q1.s3 : u5; - u5 = c.s3 ? q2.s0 : u5; - - u6 = c.s1 ? q1.s3 : q1.s2; - u6 = c.s2 ? q2.s0 : u6; - u6 = c.s3 ? 
q2.s1 : u6; - - uint v0 = bytealign(u1, u0, j); - uint v1 = bytealign(u2, u1, j); - uint v2 = bytealign(u3, u2, j); - uint v3 = bytealign(u4, u3, j); - uint v4 = bytealign(u5, u4, j); - uint v5 = bytealign(u6, u5, j); - - // Place those 192 bits in 4 48-bit doubles along with correct exponent - // If i > 1018 we would get subnormals so we scale p up and x down to get the same product - i = 2 + 8*i; - x *= i > 1018 ? 0x1.0p-136 : 1.0; - i -= i > 1018 ? 136 : 0; - - uint ua = (uint)(1023 + 52 - i) << 20; - double a = as_double((uint2)(0, ua)); - double p0 = as_double((uint2)(v0, ua | (v1 & 0xffffU))) - a; - ua += 0x03000000U; - a = as_double((uint2)(0, ua)); - double p1 = as_double((uint2)((v2 << 16) | (v1 >> 16), ua | (v2 >> 16))) - a; - ua += 0x03000000U; - a = as_double((uint2)(0, ua)); - double p2 = as_double((uint2)(v3, ua | (v4 & 0xffffU))) - a; - ua += 0x03000000U; - a = as_double((uint2)(0, ua)); - double p3 = as_double((uint2)((v5 << 16) | (v4 >> 16), ua | (v5 >> 16))) - a; - - // Exact multiply - double f0h = p0 * x; - double f0l = fma(p0, x, -f0h); - double f1h = p1 * x; - double f1l = fma(p1, x, -f1h); - double f2h = p2 * x; - double f2l = fma(p2, x, -f2h); - double f3h = p3 * x; - double f3l = fma(p3, x, -f3h); - - // Accumulate product into 4 doubles - double s, t; - - double f3 = f3h + f2h; - t = f2h - (f3 - f3h); - s = f3l + t; - t = t - (s - f3l); - - double f2 = s + f1h; - t = f1h - (f2 - s) + t; - s = f2l + t; - t = t - (s - f2l); - - double f1 = s + f0h; - t = f0h - (f1 - s) + t; - s = f1l + t; - - double f0 = s + f0l; - - // Strip off unwanted large integer bits - f3 = 0x1.0p+10 * fract(f3 * 0x1.0p-10, &fract_temp); - f3 += f3 + f2 < 0.0 ? 0x1.0p+10 : 0.0; - - // Compute least significant integer bits - t = f3 + f2; - double di = t - fract(t, &fract_temp); - i = (float)di; - - // Shift out remaining integer part - f3 -= di; - s = f3 + f2; t = f2 - (s - f3); f3 = s; f2 = t; - s = f2 + f1; t = f1 - (s - f2); f2 = s; f1 = t; - f1 += f0; - - // Subtract 1 if fraction is >= 0.5, and update regn - int g = f3 >= 0.5; - i += g; - f3 -= (float)g; - - // Shift up bits - s = f3 + f2; t = f2 -(s - f3); f3 = s; f2 = t + f1; - - // Multiply precise fraction by pi/2 to get radians - const double p2h = 7074237752028440.0 / 0x1.0p+52; - const double p2t = 4967757600021510.0 / 0x1.0p+106; - - double rhi = f3 * p2h; - double rlo = fma(f2, p2h, fma(f3, p2t, fma(f3, p2h, -rhi))); - - *r = rhi + rlo; - *rr = rlo - (*r - rhi); - *regn = i & 0x3; +_CLC_DEF void __clc_remainder_piby2_large(double x, double *r, double *rr, + int *regn) { + + long ux = as_long(x); + int e = (int)(ux >> 52) - 1023; + int i = __clc_max(23, (e >> 3) + 17); + int j = 150 - i; + int j16 = j & ~0xf; + double fract_temp; + + // The following extracts 192 consecutive bits of 2/pi aligned on an arbitrary + // byte boundary + uint4 q0 = USE_TABLE(pibits_tbl, j16); + uint4 q1 = USE_TABLE(pibits_tbl, (j16 + 16)); + uint4 q2 = USE_TABLE(pibits_tbl, (j16 + 32)); + + int k = (j >> 2) & 0x3; + int4 c = (int4)k == (int4)(0, 1, 2, 3); + + uint u0, u1, u2, u3, u4, u5, u6; + + u0 = c.s1 ? q0.s1 : q0.s0; + u0 = c.s2 ? q0.s2 : u0; + u0 = c.s3 ? q0.s3 : u0; + + u1 = c.s1 ? q0.s2 : q0.s1; + u1 = c.s2 ? q0.s3 : u1; + u1 = c.s3 ? q1.s0 : u1; + + u2 = c.s1 ? q0.s3 : q0.s2; + u2 = c.s2 ? q1.s0 : u2; + u2 = c.s3 ? q1.s1 : u2; + + u3 = c.s1 ? q1.s0 : q0.s3; + u3 = c.s2 ? q1.s1 : u3; + u3 = c.s3 ? q1.s2 : u3; + + u4 = c.s1 ? q1.s1 : q1.s0; + u4 = c.s2 ? q1.s2 : u4; + u4 = c.s3 ? q1.s3 : u4; + + u5 = c.s1 ? q1.s2 : q1.s1; + u5 = c.s2 ? 
q1.s3 : u5; + u5 = c.s3 ? q2.s0 : u5; + + u6 = c.s1 ? q1.s3 : q1.s2; + u6 = c.s2 ? q2.s0 : u6; + u6 = c.s3 ? q2.s1 : u6; + + uint v0 = bytealign(u1, u0, j); + uint v1 = bytealign(u2, u1, j); + uint v2 = bytealign(u3, u2, j); + uint v3 = bytealign(u4, u3, j); + uint v4 = bytealign(u5, u4, j); + uint v5 = bytealign(u6, u5, j); + + // Place those 192 bits in 4 48-bit doubles along with correct exponent + // If i > 1018 we would get subnormals so we scale p up and x down to get the + // same product + i = 2 + 8 * i; + x *= i > 1018 ? 0x1.0p-136 : 1.0; + i -= i > 1018 ? 136 : 0; + + uint ua = (uint)(1023 + 52 - i) << 20; + double a = as_double((uint2)(0, ua)); + double p0 = as_double((uint2)(v0, ua | (v1 & 0xffffU))) - a; + ua += 0x03000000U; + a = as_double((uint2)(0, ua)); + double p1 = as_double((uint2)((v2 << 16) | (v1 >> 16), ua | (v2 >> 16))) - a; + ua += 0x03000000U; + a = as_double((uint2)(0, ua)); + double p2 = as_double((uint2)(v3, ua | (v4 & 0xffffU))) - a; + ua += 0x03000000U; + a = as_double((uint2)(0, ua)); + double p3 = as_double((uint2)((v5 << 16) | (v4 >> 16), ua | (v5 >> 16))) - a; + + // Exact multiply + double f0h = p0 * x; + double f0l = fma(p0, x, -f0h); + double f1h = p1 * x; + double f1l = fma(p1, x, -f1h); + double f2h = p2 * x; + double f2l = fma(p2, x, -f2h); + double f3h = p3 * x; + double f3l = fma(p3, x, -f3h); + + // Accumulate product into 4 doubles + double s, t; + + double f3 = f3h + f2h; + t = f2h - (f3 - f3h); + s = f3l + t; + t = t - (s - f3l); + + double f2 = s + f1h; + t = f1h - (f2 - s) + t; + s = f2l + t; + t = t - (s - f2l); + + double f1 = s + f0h; + t = f0h - (f1 - s) + t; + s = f1l + t; + + double f0 = s + f0l; + + // Strip off unwanted large integer bits + f3 = 0x1.0p+10 * fract(f3 * 0x1.0p-10, &fract_temp); + f3 += f3 + f2 < 0.0 ? 0x1.0p+10 : 0.0; + + // Compute least significant integer bits + t = f3 + f2; + double di = t - fract(t, &fract_temp); + i = (float)di; + + // Shift out remaining integer part + f3 -= di; + s = f3 + f2; + t = f2 - (s - f3); + f3 = s; + f2 = t; + s = f2 + f1; + t = f1 - (s - f2); + f2 = s; + f1 = t; + f1 += f0; + + // Subtract 1 if fraction is >= 0.5, and update regn + int g = f3 >= 0.5; + i += g; + f3 -= (float)g; + + // Shift up bits + s = f3 + f2; + t = f2 - (s - f3); + f3 = s; + f2 = t + f1; + + // Multiply precise fraction by pi/2 to get radians + const double p2h = 7074237752028440.0 / 0x1.0p+52; + const double p2t = 4967757600021510.0 / 0x1.0p+106; + + double rhi = f3 * p2h; + double rlo = fma(f2, p2h, fma(f3, p2t, fma(f3, p2h, -rhi))); + + *r = rhi + rlo; + *rr = rlo - (*r - rhi); + *regn = i & 0x3; } - _CLC_DEF double2 __clc_sincos_piby4(double x, double xx) { - // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ... - // = x * (1 - x^2/3! + x^4/5! - x^6/7! ... - // = x * f(w) - // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ... - // We use a minimax approximation of (f(w) - 1) / w - // because this produces an expansion in even powers of x. - // If xx (the tail of x) is non-zero, we add a correction - // term g(x,xx) = (1-x*x/2)*xx to the result, where g(x,xx) - // is an approximation to cos(x)*sin(xx) valid because - // xx is tiny relative to x. - - // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ... - // = f(w) - // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ... - // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w) - // because this produces an expansion in even powers of x. 
- // If xx (the tail of x) is non-zero, we subtract a correction - // term g(x,xx) = x*xx to the result, where g(x,xx) - // is an approximation to sin(x)*sin(xx) valid because - // xx is tiny relative to x. - - const double sc1 = -0.166666666666666646259241729; - const double sc2 = 0.833333333333095043065222816e-2; - const double sc3 = -0.19841269836761125688538679e-3; - const double sc4 = 0.275573161037288022676895908448e-5; - const double sc5 = -0.25051132068021699772257377197e-7; - const double sc6 = 0.159181443044859136852668200e-9; - - const double cc1 = 0.41666666666666665390037e-1; - const double cc2 = -0.13888888888887398280412e-2; - const double cc3 = 0.248015872987670414957399e-4; - const double cc4 = -0.275573172723441909470836e-6; - const double cc5 = 0.208761463822329611076335e-8; - const double cc6 = -0.113826398067944859590880e-10; - - double x2 = x * x; - double x3 = x2 * x; - double r = 0.5 * x2; - double t = 1.0 - r; - - double sp = fma(fma(fma(fma(sc6, x2, sc5), x2, sc4), x2, sc3), x2, sc2); - - double cp = t + fma(fma(fma(fma(fma(fma(cc6, x2, cc5), x2, cc4), x2, cc3), x2, cc2), x2, cc1), - x2*x2, fma(x, xx, (1.0 - t) - r)); - - double2 ret; - ret.lo = x - fma(-x3, sc1, fma(fma(-x3, sp, 0.5*xx), x2, -xx)); - ret.hi = cp; - - return ret; + // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ... + // = x * (1 - x^2/3! + x^4/5! - x^6/7! ... + // = x * f(w) + // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ... + // We use a minimax approximation of (f(w) - 1) / w + // because this produces an expansion in even powers of x. + // If xx (the tail of x) is non-zero, we add a correction + // term g(x,xx) = (1-x*x/2)*xx to the result, where g(x,xx) + // is an approximation to cos(x)*sin(xx) valid because + // xx is tiny relative to x. + + // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ... + // = f(w) + // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ... + // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w) + // because this produces an expansion in even powers of x. + // If xx (the tail of x) is non-zero, we subtract a correction + // term g(x,xx) = x*xx to the result, where g(x,xx) + // is an approximation to sin(x)*sin(xx) valid because + // xx is tiny relative to x. 
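// Worked check of the tail corrections described above (annotation, not
// part of this patch), to first order in the tiny tail xx:
//   sin(x + xx) = sin(x)cos(xx) + cos(x)sin(xx)
//              ~= sin(x) + (1 - x*x/2)*xx    (cos(xx) ~= 1, sin(xx) ~= xx,
//                                             and cos(x) ~= 1 - x*x/2)
//   cos(x + xx) = cos(x)cos(xx) - sin(x)sin(xx)
//              ~= cos(x) - x*xx              (sin(x) ~= x to first order)
// These are exactly the g(x,xx) terms folded into the fma chains below.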
+ + const double sc1 = -0.166666666666666646259241729; + const double sc2 = 0.833333333333095043065222816e-2; + const double sc3 = -0.19841269836761125688538679e-3; + const double sc4 = 0.275573161037288022676895908448e-5; + const double sc5 = -0.25051132068021699772257377197e-7; + const double sc6 = 0.159181443044859136852668200e-9; + + const double cc1 = 0.41666666666666665390037e-1; + const double cc2 = -0.13888888888887398280412e-2; + const double cc3 = 0.248015872987670414957399e-4; + const double cc4 = -0.275573172723441909470836e-6; + const double cc5 = 0.208761463822329611076335e-8; + const double cc6 = -0.113826398067944859590880e-10; + + double x2 = x * x; + double x3 = x2 * x; + double r = 0.5 * x2; + double t = 1.0 - r; + + double sp = fma(fma(fma(fma(sc6, x2, sc5), x2, sc4), x2, sc3), x2, sc2); + + double cp = + t + fma(fma(fma(fma(fma(fma(cc6, x2, cc5), x2, cc4), x2, cc3), x2, cc2), + x2, cc1), + x2 * x2, fma(x, xx, (1.0 - t) - r)); + + double2 ret; + ret.lo = x - fma(-x3, sc1, fma(fma(-x3, sp, 0.5 * xx), x2, -xx)); + ret.hi = cp; + + return ret; } #endif diff --git libclc/generic/lib/math/sincospiF_piby4.h libclc/generic/lib/math/sincospiF_piby4.h index 90ecb1d7a636..d7c01c1bb13d 100644 --- libclc/generic/lib/math/sincospiF_piby4.h +++ libclc/generic/lib/math/sincospiF_piby4.h @@ -19,38 +19,41 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ +#include <clc/math/clc_mad.h> // Evaluate single precision sin and cos of value in interval [-pi/4, pi/4] -_CLC_INLINE float2 -__libclc__sincosf_piby4(float x) -{ - // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ... - // = x * (1 - x^2/3! + x^4/5! - x^6/7! ... - // = x * f(w) - // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ... - // We use a minimax approximation of (f(w) - 1) / w - // because this produces an expansion in even powers of x. +_CLC_INLINE float2 __libclc__sincosf_piby4(float x) { + // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ... + // = x * (1 - x^2/3! + x^4/5! - x^6/7! ... + // = x * f(w) + // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ... + // We use a minimax approximation of (f(w) - 1) / w + // because this produces an expansion in even powers of x. - // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ... - // = f(w) - // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ... - // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w) - // because this produces an expansion in even powers of x. + // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ... + // = f(w) + // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ... + // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w) + // because this produces an expansion in even powers of x. 
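// Illustrative sketch, not part of this patch: evaluating the sin
// polynomial below in plain C with a stand-in for mad(). mad_sketch is a
// hypothetical helper, not this library's API; the coefficients are the
// sc1..sc4 values shown in the hunk.
static inline float mad_sketch(float a, float b, float c) { return a * b + c; }
static float sinf_piby4_sketch(float x) {
  const float sc1 = -0.166666666638608441788607926e0f;
  const float sc2 = 0.833333187633086262120839299e-2f;
  const float sc3 = -0.198400874359527693921333720e-3f;
  const float sc4 = 0.272500015145584081596826911e-5f;
  float w = x * x; // the expansion is in w = x*x, i.e. even powers of x
  // x + x*w*f(w), evaluated in Horner form
  return mad_sketch(
      x * w, mad_sketch(w, mad_sketch(w, mad_sketch(w, sc4, sc3), sc2), sc1),
      x);
}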
- const float sc1 = -0.166666666638608441788607926e0F; - const float sc2 = 0.833333187633086262120839299e-2F; - const float sc3 = -0.198400874359527693921333720e-3F; - const float sc4 = 0.272500015145584081596826911e-5F; + const float sc1 = -0.166666666638608441788607926e0F; + const float sc2 = 0.833333187633086262120839299e-2F; + const float sc3 = -0.198400874359527693921333720e-3F; + const float sc4 = 0.272500015145584081596826911e-5F; - const float cc1 = 0.41666666664325175238031e-1F; - const float cc2 = -0.13888887673175665567647e-2F; - const float cc3 = 0.24800600878112441958053e-4F; - const float cc4 = -0.27301013343179832472841e-6F; + const float cc1 = 0.41666666664325175238031e-1F; + const float cc2 = -0.13888887673175665567647e-2F; + const float cc3 = 0.24800600878112441958053e-4F; + const float cc4 = -0.27301013343179832472841e-6F; - float x2 = x * x; + float x2 = x * x; - float2 ret; - ret.x = mad(x*x2, mad(x2, mad(x2, mad(x2, sc4, sc3), sc2), sc1), x); - ret.y = mad(x2*x2, mad(x2, mad(x2, mad(x2, cc4, cc3), cc2), cc1), mad(x2, -0.5f, 1.0f)); - return ret; + float2 ret; + ret.x = __clc_mad( + x * x2, __clc_mad(x2, __clc_mad(x2, __clc_mad(x2, sc4, sc3), sc2), sc1), + x); + ret.y = __clc_mad( + x2 * x2, __clc_mad(x2, __clc_mad(x2, __clc_mad(x2, cc4, cc3), cc2), cc1), + __clc_mad(x2, -0.5f, 1.0f)); + return ret; } diff --git libclc/generic/lib/relational/bitselect.cl libclc/generic/lib/relational/bitselect.cl index a470447f1fb9..04aae105b7f2 100644 --- libclc/generic/lib/relational/bitselect.cl +++ libclc/generic/lib/relational/bitselect.cl @@ -21,34 +21,10 @@ */ #include <clc/clc.h> -#include <clc/clcmacro.h> #include <clc/relational/clc_bitselect.h> #define __CLC_BODY <bitselect.inc> #include <clc/integer/gentype.inc> -#undef __CLC_BODY -#define FLOAT_BITSELECT(f_type, i_type, width) \ - _CLC_OVERLOAD _CLC_DEF f_type##width bitselect( \ - f_type##width x, f_type##width y, f_type##width z) { \ - return __clc_bitselect(x, y, z); \ - } - -FLOAT_BITSELECT(float, uint, ) -FLOAT_BITSELECT(float, uint, 2) -FLOAT_BITSELECT(float, uint, 3) -FLOAT_BITSELECT(float, uint, 4) -FLOAT_BITSELECT(float, uint, 8) -FLOAT_BITSELECT(float, uint, 16) - -#ifdef cl_khr_fp64 -#pragma OPENCL EXTENSION cl_khr_fp64 : enable - -FLOAT_BITSELECT(double, ulong, ) -FLOAT_BITSELECT(double, ulong, 2) -FLOAT_BITSELECT(double, ulong, 3) -FLOAT_BITSELECT(double, ulong, 4) -FLOAT_BITSELECT(double, ulong, 8) -FLOAT_BITSELECT(double, ulong, 16) - -#endif +#define __CLC_BODY <bitselect.inc> +#include <clc/math/gentype.inc> diff --git libclc/generic/lib/relational/bitselect.inc libclc/generic/lib/relational/bitselect.inc index 3a78a8c7b748..b0d64bddffdf 100644 --- libclc/generic/lib/relational/bitselect.inc +++ libclc/generic/lib/relational/bitselect.inc @@ -20,6 +20,7 @@ * THE SOFTWARE. 
*/ -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE bitselect(__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_GENTYPE z) { - return ((x) ^ ((z) & ((y) ^ (x)))); +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE bitselect(__CLC_GENTYPE x, __CLC_GENTYPE y, + __CLC_GENTYPE z) { + return __clc_bitselect(x, y, z); } diff --git libclc/generic/lib/relational/select.cl libclc/generic/lib/relational/select.cl index 094f4f9f29fa..663f9d7ccf99 100644 --- libclc/generic/lib/relational/select.cl +++ libclc/generic/lib/relational/select.cl @@ -1,7 +1,11 @@ #include <clc/clc.h> +#include <clc/relational/clc_select.h> #include <clc/utils.h> -#define __CLC_BODY <select.inc> +#define __CLC_SELECT_FN select +#define __CLC_SELECT_DEF(x, y, z) return __clc_select(x, y, z) + +#define __CLC_BODY <clc/relational/clc_select_impl.inc> #include <clc/math/gentype.inc> -#define __CLC_BODY <select.inc> +#define __CLC_BODY <clc/relational/clc_select_impl.inc> #include <clc/integer/gentype.inc> diff --git libcxx/docs/Status/Cxx20Papers.csv libcxx/docs/Status/Cxx20Papers.csv index 66cb1012eceb..524c6d0ac8be 100644 --- libcxx/docs/Status/Cxx20Papers.csv +++ libcxx/docs/Status/Cxx20Papers.csv @@ -79,7 +79,7 @@ "`P1236R1 <https://wg21.link/P1236R1>`__","Alternative Wording for P0907R4 Signed Integers are Two's Complement","2018-11 (San Diego)","","","" "`P1248R1 <https://wg21.link/P1248R1>`__","Remove CommonReference requirement from StrictWeakOrdering (a.k.a Fixing Relations)","2018-11 (San Diego)","|Complete|","13","" "`P1285R0 <https://wg21.link/P1285R0>`__","Improving Completeness Requirements for Type Traits","2018-11 (San Diego)","","","" -"`P1353R0 <https://wg21.link/P1353R0>`__","Missing feature test macros","2018-11 (San Diego)","|In Progress|","","" +"`P1353R0 <https://wg21.link/P1353R0>`__","Missing feature test macros","2018-11 (San Diego)","|Complete|","19","" "","","","","","" "`P0339R6 <https://wg21.link/P0339R6>`__","polymorphic_allocator<> as a vocabulary type","2019-02 (Kona)","|Complete|","16","" "`P0340R3 <https://wg21.link/P0340R3>`__","Making std::underlying_type SFINAE-friendly","2019-02 (Kona)","|Complete|","9","" diff --git libcxx/include/__algorithm/make_projected.h libcxx/include/__algorithm/make_projected.h index 4b54c504413e..4a2582293875 100644 --- libcxx/include/__algorithm/make_projected.h +++ libcxx/include/__algorithm/make_projected.h @@ -34,16 +34,16 @@ struct _ProjectedPred { : __pred(__pred_arg), __proj(__proj_arg) {} template <class _Tp> - typename __invoke_of<_Pred&, decltype(std::__invoke(std::declval<_Proj&>(), std::declval<_Tp>()))>::type - _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI - operator()(_Tp&& __v) const { + __invoke_result_t<_Pred&, decltype(std::__invoke(std::declval<_Proj&>(), std::declval<_Tp>()))> _LIBCPP_CONSTEXPR + _LIBCPP_HIDE_FROM_ABI + operator()(_Tp&& __v) const { return std::__invoke(__pred, std::__invoke(__proj, std::forward<_Tp>(__v))); } template <class _T1, class _T2> - typename __invoke_of<_Pred&, - decltype(std::__invoke(std::declval<_Proj&>(), std::declval<_T1>())), - decltype(std::__invoke(std::declval<_Proj&>(), std::declval<_T2>()))>::type _LIBCPP_CONSTEXPR + __invoke_result_t<_Pred&, + decltype(std::__invoke(std::declval<_Proj&>(), std::declval<_T1>())), + decltype(std::__invoke(std::declval<_Proj&>(), std::declval<_T2>()))> _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI operator()(_T1&& __lhs, _T2&& __rhs) const { return std::__invoke( diff --git libcxx/include/__algorithm/radix_sort.h libcxx/include/__algorithm/radix_sort.h index 95f04a8bb31f..de6927995e74 100644 --- 
libcxx/include/__algorithm/radix_sort.h +++ libcxx/include/__algorithm/radix_sort.h @@ -88,10 +88,10 @@ __partial_sum_max(_InputIterator __first, _InputIterator __last, _OutputIterator template <class _Value, class _Map, class _Radix> struct __radix_sort_traits { - using __image_type _LIBCPP_NODEBUG = decay_t<typename __invoke_of<_Map, _Value>::type>; + using __image_type _LIBCPP_NODEBUG = decay_t<__invoke_result_t<_Map, _Value>>; static_assert(is_unsigned<__image_type>::value); - using __radix_type _LIBCPP_NODEBUG = decay_t<typename __invoke_of<_Radix, __image_type>::type>; + using __radix_type _LIBCPP_NODEBUG = decay_t<__invoke_result_t<_Radix, __image_type>>; static_assert(is_integral<__radix_type>::value); static constexpr auto __radix_value_range = numeric_limits<__radix_type>::max() + 1; @@ -101,7 +101,7 @@ struct __radix_sort_traits { template <class _Value, class _Map> struct __counting_sort_traits { - using __image_type _LIBCPP_NODEBUG = decay_t<typename __invoke_of<_Map, _Value>::type>; + using __image_type _LIBCPP_NODEBUG = decay_t<__invoke_result_t<_Map, _Value>>; static_assert(is_unsigned<__image_type>::value); static constexpr const auto __value_range = numeric_limits<__image_type>::max() + 1; @@ -158,7 +158,7 @@ _LIBCPP_HIDE_FROM_ABI bool __collect_impl( using __value_type = __iter_value_type<_ForwardIterator>; constexpr auto __radix_value_range = __radix_sort_traits<__value_type, _Map, _Radix>::__radix_value_range; - auto __previous = numeric_limits<typename __invoke_of<_Map, __value_type>::type>::min(); + auto __previous = numeric_limits<__invoke_result_t<_Map, __value_type>>::min(); auto __is_sorted = true; std::for_each(__first, __last, [&__counters, &__map, &__radix, &__previous, &__is_sorted](const auto& __value) { auto __current = __map(__value); diff --git libcxx/include/__atomic/atomic_sync.h libcxx/include/__atomic/atomic_sync.h index ab9bc59fdcfe..0dae448d649b 100644 --- libcxx/include/__atomic/atomic_sync.h +++ libcxx/include/__atomic/atomic_sync.h @@ -16,7 +16,6 @@ #include <__config> #include <__memory/addressof.h> #include <__thread/poll_with_backoff.h> -#include <__thread/support.h> #include <__type_traits/conjunction.h> #include <__type_traits/decay.h> #include <__type_traits/invoke.h> @@ -108,15 +107,13 @@ struct __atomic_wait_backoff_impl { _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI bool operator()(chrono::nanoseconds __elapsed) const { - if (__elapsed > chrono::microseconds(64)) { + if (__elapsed > chrono::microseconds(4)) { auto __contention_address = __waitable_traits::__atomic_contention_address(__a_); __cxx_contention_t __monitor_val; if (__update_monitor_val_and_poll(__contention_address, __monitor_val)) return true; std::__libcpp_atomic_wait(__contention_address, __monitor_val); - } else if (__elapsed > chrono::microseconds(4)) - __libcpp_thread_yield(); - else { + } else { } // poll return false; } diff --git libcxx/include/__functional/bind.h libcxx/include/__functional/bind.h index e31ad2979035..a3c327ab40cc 100644 --- libcxx/include/__functional/bind.h +++ libcxx/include/__functional/bind.h @@ -82,13 +82,13 @@ inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp& __mu(reference_w } template <class _Ti, class... _Uj, size_t... 
_Indx> -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 typename __invoke_of<_Ti&, _Uj...>::type +inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __invoke_result_t<_Ti&, _Uj...> __mu_expand(_Ti& __ti, tuple<_Uj...>& __uj, __tuple_indices<_Indx...>) { return __ti(std::forward<_Uj>(std::get<_Indx>(__uj))...); } template <class _Ti, class... _Uj, __enable_if_t<is_bind_expression<_Ti>::value, int> = 0> -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 typename __invoke_of<_Ti&, _Uj...>::type +inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __invoke_result_t<_Ti&, _Uj...> __mu(_Ti& __ti, tuple<_Uj...>& __uj) { typedef typename __make_tuple_indices<sizeof...(_Uj)>::type __indices; return std::__mu_expand(__ti, __uj, __indices()); @@ -130,12 +130,12 @@ struct __mu_return_invokable // false template <class _Ti, class... _Uj> struct __mu_return_invokable<true, _Ti, _Uj...> { - typedef typename __invoke_of<_Ti&, _Uj...>::type type; + using type = __invoke_result_t<_Ti&, _Uj...>; }; template <class _Ti, class... _Uj> struct __mu_return_impl<_Ti, false, true, false, tuple<_Uj...> > - : public __mu_return_invokable<__invokable<_Ti&, _Uj...>::value, _Ti, _Uj...> {}; + : public __mu_return_invokable<__is_invocable_v<_Ti&, _Uj...>, _Ti, _Uj...> {}; template <class _Ti, class _TupleUj> struct __mu_return_impl<_Ti, false, false, true, _TupleUj> { @@ -168,12 +168,12 @@ struct __is_valid_bind_return { template <class _Fp, class... _BoundArgs, class _TupleUj> struct __is_valid_bind_return<_Fp, tuple<_BoundArgs...>, _TupleUj> { - static const bool value = __invokable<_Fp, typename __mu_return<_BoundArgs, _TupleUj>::type...>::value; + static const bool value = __is_invocable_v<_Fp, typename __mu_return<_BoundArgs, _TupleUj>::type...>; }; template <class _Fp, class... _BoundArgs, class _TupleUj> struct __is_valid_bind_return<_Fp, const tuple<_BoundArgs...>, _TupleUj> { - static const bool value = __invokable<_Fp, typename __mu_return<const _BoundArgs, _TupleUj>::type...>::value; + static const bool value = __is_invocable_v<_Fp, typename __mu_return<const _BoundArgs, _TupleUj>::type...>; }; template <class _Fp, class _BoundArgs, class _TupleUj, bool = __is_valid_bind_return<_Fp, _BoundArgs, _TupleUj>::value> @@ -181,12 +181,12 @@ struct __bind_return; template <class _Fp, class... _BoundArgs, class _TupleUj> struct __bind_return<_Fp, tuple<_BoundArgs...>, _TupleUj, true> { - typedef typename __invoke_of< _Fp&, typename __mu_return< _BoundArgs, _TupleUj >::type... >::type type; + using type = __invoke_result_t< _Fp&, typename __mu_return< _BoundArgs, _TupleUj >::type... >; }; template <class _Fp, class... _BoundArgs, class _TupleUj> struct __bind_return<_Fp, const tuple<_BoundArgs...>, _TupleUj, true> { - typedef typename __invoke_of< _Fp&, typename __mu_return< const _BoundArgs, _TupleUj >::type... >::type type; + using type = __invoke_result_t< _Fp&, typename __mu_return< const _BoundArgs, _TupleUj >::type... >; }; template <class _Fp, class _BoundArgs, size_t... _Indx, class _Args> @@ -256,8 +256,7 @@ public: is_void<_Rp>::value, int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 result_type operator()(_Args&&... __args) { - typedef __invoke_void_return_wrapper<_Rp> _Invoker; - return _Invoker::__call(static_cast<base&>(*this), std::forward<_Args>(__args)...); + return std::__invoke_r<_Rp>(static_cast<base&>(*this), std::forward<_Args>(__args)...); } template <class... 
_Args, @@ -266,8 +265,7 @@ public: is_void<_Rp>::value, int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 result_type operator()(_Args&&... __args) const { - typedef __invoke_void_return_wrapper<_Rp> _Invoker; - return _Invoker::__call(static_cast<base const&>(*this), std::forward<_Args>(__args)...); + return std::__invoke_r<_Rp>(static_cast<base const&>(*this), std::forward<_Args>(__args)...); } }; diff --git libcxx/include/__functional/function.h libcxx/include/__functional/function.h index 08cb731be972..2a1293cfcc26 100644 --- libcxx/include/__functional/function.h +++ libcxx/include/__functional/function.h @@ -164,8 +164,7 @@ public: : __func_(std::move(__f)), __alloc_(std::move(__a)) {} _LIBCPP_HIDE_FROM_ABI _Rp operator()(_ArgTypes&&... __arg) { - typedef __invoke_void_return_wrapper<_Rp> _Invoker; - return _Invoker::__call(__func_, std::forward<_ArgTypes>(__arg)...); + return std::__invoke_r<_Rp>(__func_, std::forward<_ArgTypes>(__arg)...); } _LIBCPP_HIDE_FROM_ABI __alloc_func* __clone() const { @@ -213,8 +212,7 @@ public: _LIBCPP_HIDE_FROM_ABI explicit __default_alloc_func(const _Target& __f) : __f_(__f) {} _LIBCPP_HIDE_FROM_ABI _Rp operator()(_ArgTypes&&... __arg) { - typedef __invoke_void_return_wrapper<_Rp> _Invoker; - return _Invoker::__call(__f_, std::forward<_ArgTypes>(__arg)...); + return std::__invoke_r<_Rp>(__f_, std::forward<_ArgTypes>(__arg)...); } _LIBCPP_HIDE_FROM_ABI __default_alloc_func* __clone() const { @@ -841,12 +839,12 @@ class _LIBCPP_TEMPLATE_VIS function<_Rp(_ArgTypes...)> __func __f_; template <class _Fp, - bool = _And< _IsNotSame<__remove_cvref_t<_Fp>, function>, __invokable<_Fp, _ArgTypes...> >::value> + bool = _And<_IsNotSame<__remove_cvref_t<_Fp>, function>, __is_invocable<_Fp, _ArgTypes...> >::value> struct __callable; template <class _Fp> struct __callable<_Fp, true> { static const bool value = - is_void<_Rp>::value || __is_core_convertible<typename __invoke_of<_Fp, _ArgTypes...>::type, _Rp>::value; + is_void<_Rp>::value || __is_core_convertible<__invoke_result_t<_Fp, _ArgTypes...>, _Rp>::value; }; template <class _Fp> struct __callable<_Fp, false> { diff --git libcxx/include/__functional/hash.h libcxx/include/__functional/hash.h index 1f67b6a83776..28b2635ab125 100644 --- libcxx/include/__functional/hash.h +++ libcxx/include/__functional/hash.h @@ -522,7 +522,7 @@ template <class _Key, class _Hash> using __check_hash_requirements _LIBCPP_NODEBUG = integral_constant<bool, is_copy_constructible<_Hash>::value && is_move_constructible<_Hash>::value && - __invokable_r<size_t, _Hash, _Key const&>::value >; + __is_invocable_r_v<size_t, _Hash, _Key const&> >; template <class _Key, class _Hash = hash<_Key> > using __has_enabled_hash _LIBCPP_NODEBUG = diff --git libcxx/include/__functional/mem_fn.h libcxx/include/__functional/mem_fn.h index f246edb334bb..690393988c5a 100644 --- libcxx/include/__functional/mem_fn.h +++ libcxx/include/__functional/mem_fn.h @@ -36,8 +36,8 @@ public: // invoke template <class... _ArgTypes> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 typename __invoke_of<const _Tp&, _ArgTypes...>::type - operator()(_ArgTypes&&... __args) const _NOEXCEPT_(__nothrow_invokable<const _Tp&, _ArgTypes...>::value) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __invoke_result_t<const _Tp&, _ArgTypes...> + operator()(_ArgTypes&&... 
__args) const _NOEXCEPT_(__is_nothrow_invocable_v<const _Tp&, _ArgTypes...>) {
    return std::__invoke(__f_, std::forward<_ArgTypes>(__args)...);
  }
};
diff --git libcxx/include/__functional/reference_wrapper.h libcxx/include/__functional/reference_wrapper.h
index a4a66a50cf84..d6cd6428f22d 100644
--- libcxx/include/__functional/reference_wrapper.h
+++ libcxx/include/__functional/reference_wrapper.h
@@ -57,7 +57,7 @@ public:
   // invoke
   template <class... _ArgTypes>
-  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 typename __invoke_of<type&, _ArgTypes...>::type
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __invoke_result_t<type&, _ArgTypes...>
   operator()(_ArgTypes&&... __args) const
 #if _LIBCPP_STD_VER >= 17
   // Since is_nothrow_invocable requires C++17 LWG3764 is not backported
diff --git libcxx/include/__hash_table libcxx/include/__hash_table
index 7788f687746f..9a82ec51daee 100644
--- libcxx/include/__hash_table
+++ libcxx/include/__hash_table
@@ -650,9 +650,9 @@ struct __enforce_unordered_container_requirements {
 template <class _Key, class _Hash, class _Equal>
 #ifndef _LIBCPP_CXX03_LANG
-_LIBCPP_DIAGNOSE_WARNING(!__invokable<_Equal const&, _Key const&, _Key const&>::value,
+_LIBCPP_DIAGNOSE_WARNING(!__is_invocable_v<_Equal const&, _Key const&, _Key const&>,
                          "the specified comparator type does not provide a viable const call operator")
-_LIBCPP_DIAGNOSE_WARNING(!__invokable<_Hash const&, _Key const&>::value,
+_LIBCPP_DIAGNOSE_WARNING(!__is_invocable_v<_Hash const&, _Key const&>,
                          "the specified hash functor does not provide a viable const call operator")
 #endif
 typename __enforce_unordered_container_requirements<_Key, _Hash, _Equal>::type
diff --git libcxx/include/__tree libcxx/include/__tree
index dfb205c51e17..acad6c33f878 100644
--- libcxx/include/__tree
+++ libcxx/include/__tree
@@ -876,7 +876,7 @@ private:
 template <class _Tp, class _Compare>
 #ifndef _LIBCPP_CXX03_LANG
-_LIBCPP_DIAGNOSE_WARNING(!__invokable<_Compare const&, _Tp const&, _Tp const&>::value,
+_LIBCPP_DIAGNOSE_WARNING(!__is_invocable_v<_Compare const&, _Tp const&, _Tp const&>,
                          "the specified comparator type does not provide a viable const call operator")
 #endif
 int __diagnose_non_const_comparator();
diff --git libcxx/include/__type_traits/invoke.h libcxx/include/__type_traits/invoke.h
index 6f641b9a81b8..013293bec49b 100644
--- libcxx/include/__type_traits/invoke.h
+++ libcxx/include/__type_traits/invoke.h
@@ -29,6 +29,36 @@
 #  pragma GCC system_header
 #endif

+// This file defines the following libc++-internal API (back-ported to C++03):
+//
+// template <class... Args>
+// decltype(auto) __invoke(Args&&... args) noexcept(noexcept(std::invoke(std::forward<Args>(args)...))) {
+//   return std::invoke(std::forward<Args>(args)...);
+// }
+//
+// template <class Ret, class... Args>
+// Ret __invoke_r(Args&&... args) {
+//   return std::invoke_r<Ret>(std::forward<Args>(args)...);
+// }
+//
+// template <class Ret, class Func, class... Args>
+// inline const bool __is_invocable_r_v = is_invocable_r_v<Ret, Func, Args...>;
+//
+// template <class Func, class... Args>
+// struct __is_invocable : is_invocable<Func, Args...> {};
+//
+// template <class Func, class... Args>
+// inline const bool __is_invocable_v = is_invocable_v<Func, Args...>;
+//
+// template <class Func, class... Args>
+// inline const bool __is_nothrow_invocable_v = is_nothrow_invocable_v<Func, Args...>;
+//
+// template <class Func, class... Args>
+// struct __invoke_result : invoke_result<Func, Args...> {};
+//
+// template <class Func, class...
Args> +// using __invoke_result_t = invoke_result_t<Func, Args...>; + _LIBCPP_BEGIN_NAMESPACE_STD template <class _DecayedFp> @@ -167,7 +197,7 @@ struct __invokable_r { static const bool value = type::value; }; template <class _Fp, class... _Args> -using __invokable _LIBCPP_NODEBUG = __invokable_r<void, _Fp, _Args...>; +using __is_invocable _LIBCPP_NODEBUG = __invokable_r<void, _Fp, _Args...>; template <bool _IsInvokable, bool _IsCVVoid, class _Ret, class _Fp, class... _Args> struct __nothrow_invokable_r_imp { @@ -204,11 +234,7 @@ using __nothrow_invokable_r _LIBCPP_NODEBUG = template <class _Fp, class... _Args> using __nothrow_invokable _LIBCPP_NODEBUG = - __nothrow_invokable_r_imp<__invokable<_Fp, _Args...>::value, true, void, _Fp, _Args...>; - -template <class _Fp, class... _Args> -struct __invoke_of - : public enable_if<__invokable<_Fp, _Args...>::value, typename __invokable_r<void, _Fp, _Args...>::_Result> {}; + __nothrow_invokable_r_imp<__is_invocable<_Fp, _Args...>::value, true, void, _Fp, _Args...>; template <class _Ret, bool = is_void<_Ret>::value> struct __invoke_void_return_wrapper { @@ -226,31 +252,51 @@ struct __invoke_void_return_wrapper<_Ret, true> { } }; +template <class _Func, class... _Args> +inline const bool __is_invocable_v = __is_invocable<_Func, _Args...>::value; + +template <class _Ret, class _Func, class... _Args> +inline const bool __is_invocable_r_v = __invokable_r<_Ret, _Func, _Args...>::value; + +template <class _Func, class... _Args> +inline const bool __is_nothrow_invocable_v = __nothrow_invokable<_Func, _Args...>::value; + +template <class _Func, class... _Args> +struct __invoke_result + : enable_if<__is_invocable_v<_Func, _Args...>, typename __invokable_r<void, _Func, _Args...>::_Result> {}; + +template <class _Func, class... _Args> +using __invoke_result_t _LIBCPP_NODEBUG = typename __invoke_result<_Func, _Args...>::type; + +template <class _Ret, class... _Args> +_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Ret __invoke_r(_Args&&... __args) { + return __invoke_void_return_wrapper<_Ret>::__call(std::forward<_Args>(__args)...); +} + #if _LIBCPP_STD_VER >= 17 // is_invocable template <class _Fn, class... _Args> -struct _LIBCPP_TEMPLATE_VIS is_invocable : integral_constant<bool, __invokable<_Fn, _Args...>::value> {}; +struct _LIBCPP_TEMPLATE_VIS is_invocable : bool_constant<__is_invocable_v<_Fn, _Args...>> {}; template <class _Ret, class _Fn, class... _Args> -struct _LIBCPP_TEMPLATE_VIS is_invocable_r : integral_constant<bool, __invokable_r<_Ret, _Fn, _Args...>::value> {}; +struct _LIBCPP_TEMPLATE_VIS is_invocable_r : bool_constant<__is_invocable_r_v<_Ret, _Fn, _Args...>> {}; template <class _Fn, class... _Args> -inline constexpr bool is_invocable_v = is_invocable<_Fn, _Args...>::value; +inline constexpr bool is_invocable_v = __is_invocable_v<_Fn, _Args...>; template <class _Ret, class _Fn, class... _Args> -inline constexpr bool is_invocable_r_v = is_invocable_r<_Ret, _Fn, _Args...>::value; +inline constexpr bool is_invocable_r_v = __is_invocable_r_v<_Ret, _Fn, _Args...>; // is_nothrow_invocable template <class _Fn, class... _Args> -struct _LIBCPP_TEMPLATE_VIS is_nothrow_invocable : integral_constant<bool, __nothrow_invokable<_Fn, _Args...>::value> { -}; +struct _LIBCPP_TEMPLATE_VIS is_nothrow_invocable : bool_constant<__nothrow_invokable<_Fn, _Args...>::value> {}; template <class _Ret, class _Fn, class... 
_Args> -struct _LIBCPP_TEMPLATE_VIS is_nothrow_invocable_r - : integral_constant<bool, __nothrow_invokable_r<_Ret, _Fn, _Args...>::value> {}; +struct _LIBCPP_TEMPLATE_VIS is_nothrow_invocable_r : bool_constant<__nothrow_invokable_r<_Ret, _Fn, _Args...>::value> { +}; template <class _Fn, class... _Args> inline constexpr bool is_nothrow_invocable_v = is_nothrow_invocable<_Fn, _Args...>::value; @@ -259,7 +305,7 @@ template <class _Ret, class _Fn, class... _Args> inline constexpr bool is_nothrow_invocable_r_v = is_nothrow_invocable_r<_Ret, _Fn, _Args...>::value; template <class _Fn, class... _Args> -struct _LIBCPP_TEMPLATE_VIS invoke_result : __invoke_of<_Fn, _Args...> {}; +struct _LIBCPP_TEMPLATE_VIS invoke_result : __invoke_result<_Fn, _Args...> {}; template <class _Fn, class... _Args> using invoke_result_t = typename invoke_result<_Fn, _Args...>::type; diff --git libcxx/include/__type_traits/result_of.h libcxx/include/__type_traits/result_of.h index 73a194475206..217ca70b4cd2 100644 --- libcxx/include/__type_traits/result_of.h +++ libcxx/include/__type_traits/result_of.h @@ -22,10 +22,10 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER <= 17 || defined(_LIBCPP_ENABLE_CXX20_REMOVED_TYPE_TRAITS) template <class _Callable> -class _LIBCPP_DEPRECATED_IN_CXX17 result_of; +struct _LIBCPP_DEPRECATED_IN_CXX17 result_of; template <class _Fp, class... _Args> -class _LIBCPP_TEMPLATE_VIS result_of<_Fp(_Args...)> : public __invoke_of<_Fp, _Args...> {}; +struct _LIBCPP_TEMPLATE_VIS result_of<_Fp(_Args...)> : __invoke_result<_Fp, _Args...> {}; # if _LIBCPP_STD_VER >= 14 template <class _Tp> diff --git libcxx/include/__vector/vector.h libcxx/include/__vector/vector.h index 28e9495a314a..d94aca6788c8 100644 --- libcxx/include/__vector/vector.h +++ libcxx/include/__vector/vector.h @@ -242,7 +242,7 @@ private: _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void operator()() { if (__vec_.__begin_ != nullptr) { - __vec_.__clear(); + __vec_.clear(); __vec_.__annotate_delete(); __alloc_traits::deallocate(__vec_.__alloc_, __vec_.__begin_, __vec_.capacity()); } @@ -525,7 +525,7 @@ public: _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void clear() _NOEXCEPT { size_type __old_size = size(); - __clear(); + __base_destruct_at_end(this->__begin_); __annotate_shrink(__old_size); } @@ -737,10 +737,6 @@ private: ++__tx.__pos_; } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __clear() _NOEXCEPT { - __base_destruct_at_end(this->__begin_); - } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __base_destruct_at_end(pointer __new_last) _NOEXCEPT { pointer __soon_to_be_end = this->__end_; while (__new_last != __soon_to_be_end) @@ -764,7 +760,7 @@ private: _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const vector& __c, true_type) { if (this->__alloc_ != __c.__alloc_) { - __clear(); + clear(); __annotate_delete(); __alloc_traits::deallocate(this->__alloc_, this->__begin_, capacity()); this->__begin_ = this->__end_ = this->__cap_ = nullptr; diff --git libcxx/include/future libcxx/include/future index 72f3ed5ca5d2..db1f624244b8 100644 --- libcxx/include/future +++ libcxx/include/future @@ -1840,7 +1840,7 @@ class _LIBCPP_HIDDEN __async_func { tuple<_Fp, _Args...> __f_; public: - typedef typename __invoke_of<_Fp, _Args...>::type _Rp; + using _Rp _LIBCPP_NODEBUG = __invoke_result_t<_Fp, _Args...>; _LIBCPP_HIDE_FROM_ABI explicit __async_func(_Fp&& __f, _Args&&... __args) : __f_(std::move(__f), std::move(__args)...) 
{} @@ -1864,7 +1864,7 @@ inline _LIBCPP_HIDE_FROM_ABI bool __does_policy_contain(launch __policy, launch } template <class _Fp, class... _Args> -[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI future<typename __invoke_of<__decay_t<_Fp>, __decay_t<_Args>...>::type> +[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI future<__invoke_result_t<__decay_t<_Fp>, __decay_t<_Args>...> > async(launch __policy, _Fp&& __f, _Args&&... __args) { typedef __async_func<__decay_t<_Fp>, __decay_t<_Args>...> _BF; typedef typename _BF::_Rp _Rp; @@ -1889,7 +1889,7 @@ async(launch __policy, _Fp&& __f, _Args&&... __args) { } template <class _Fp, class... _Args> -[[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI future<typename __invoke_of<__decay_t<_Fp>, __decay_t<_Args>...>::type> +[[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI future<__invoke_result_t<__decay_t<_Fp>, __decay_t<_Args>...> > async(_Fp&& __f, _Args&&... __args) { return std::async(launch::any, std::forward<_Fp>(__f), std::forward<_Args>(__args)...); } diff --git libcxx/include/unordered_map libcxx/include/unordered_map index 0ae413849177..76623d024275 100644 --- libcxx/include/unordered_map +++ libcxx/include/unordered_map @@ -1842,7 +1842,7 @@ struct __container_traits<unordered_map<_Key, _Tp, _Hash, _Pred, _Alloc> > { // other than the container's hash function from within an insert or emplace function // inserting a single element, the insertion has no effect. static _LIBCPP_CONSTEXPR const bool __emplacement_has_strong_exception_safety_guarantee = - __nothrow_invokable<_Hash, const _Key&>::value; + __is_nothrow_invocable_v<_Hash, const _Key&>; }; template <class _Key, @@ -2542,7 +2542,7 @@ struct __container_traits<unordered_multimap<_Key, _Tp, _Hash, _Pred, _Alloc> > // other than the container's hash function from within an insert or emplace function // inserting a single element, the insertion has no effect. static _LIBCPP_CONSTEXPR const bool __emplacement_has_strong_exception_safety_guarantee = - __nothrow_invokable<_Hash, const _Key&>::value; + __is_nothrow_invocable_v<_Hash, const _Key&>; }; _LIBCPP_END_NAMESPACE_STD diff --git libcxx/include/unordered_set libcxx/include/unordered_set index 87f0a9f438ef..87d98435329f 100644 --- libcxx/include/unordered_set +++ libcxx/include/unordered_set @@ -1195,7 +1195,7 @@ struct __container_traits<unordered_set<_Value, _Hash, _Pred, _Alloc> > { // other than the container's hash function from within an insert or emplace function // inserting a single element, the insertion has no effect. static _LIBCPP_CONSTEXPR const bool __emplacement_has_strong_exception_safety_guarantee = - __nothrow_invokable<_Hash, const _Value&>::value; + __is_nothrow_invocable_v<_Hash, const _Value&>; }; template <class _Value, class _Hash = hash<_Value>, class _Pred = equal_to<_Value>, class _Alloc = allocator<_Value> > @@ -1815,7 +1815,7 @@ struct __container_traits<unordered_multiset<_Value, _Hash, _Pred, _Alloc> > { // other than the container's hash function from within an insert or emplace function // inserting a single element, the insertion has no effect. 
static _LIBCPP_CONSTEXPR const bool __emplacement_has_strong_exception_safety_guarantee = - __nothrow_invokable<_Hash, const _Value&>::value; + __is_nothrow_invocable_v<_Hash, const _Value&>; }; _LIBCPP_END_NAMESPACE_STD diff --git libcxx/test/benchmarks/atomic_wait.bench.cpp libcxx/test/benchmarks/atomic_wait.bench.cpp deleted file mode 100644 index d19f5fbed8ad..000000000000 --- libcxx/test/benchmarks/atomic_wait.bench.cpp +++ /dev/null @@ -1,159 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// UNSUPPORTED: c++03, c++11, c++14, c++17 - -#include <atomic> -#include <cstdint> -#include <numeric> -#include <stop_token> -#include <thread> - -#include "benchmark/benchmark.h" -#include "make_test_thread.h" - -using namespace std::chrono_literals; - -void BM_atomic_wait_one_thread_one_atomic_wait(benchmark::State& state) { - std::atomic<std::uint64_t> a; - auto thread_func = [&](std::stop_token st) { - while (!st.stop_requested()) { - a.fetch_add(1, std::memory_order_relaxed); - a.notify_all(); - } - }; - - std::uint64_t total_loop_test_param = state.range(0); - - auto thread = support::make_test_jthread(thread_func); - - for (auto _ : state) { - for (std::uint64_t i = 0; i < total_loop_test_param; ++i) { - auto old = a.load(std::memory_order_relaxed); - a.wait(old); - } - } -} -BENCHMARK(BM_atomic_wait_one_thread_one_atomic_wait)->RangeMultiplier(2)->Range(1 << 10, 1 << 24); - -void BM_atomic_wait_multi_thread_one_atomic_wait(benchmark::State& state) { - std::atomic<std::uint64_t> a; - auto notify_func = [&](std::stop_token st) { - while (!st.stop_requested()) { - a.fetch_add(1, std::memory_order_relaxed); - a.notify_all(); - } - }; - - std::uint64_t total_loop_test_param = state.range(0); - constexpr auto num_waiting_threads = 15; - std::vector<std::jthread> wait_threads; - wait_threads.reserve(num_waiting_threads); - - auto notify_thread = support::make_test_jthread(notify_func); - - std::atomic<std::uint64_t> start_flag = 0; - std::atomic<std::uint64_t> done_count = 0; - auto wait_func = [&a, &start_flag, &done_count, total_loop_test_param](std::stop_token st) { - auto old_start = 0; - while (!st.stop_requested()) { - start_flag.wait(old_start); - old_start = start_flag.load(); - for (std::uint64_t i = 0; i < total_loop_test_param; ++i) { - auto old = a.load(std::memory_order_relaxed); - a.wait(old); - } - done_count.fetch_add(1); - } - }; - - for (size_t i = 0; i < num_waiting_threads; ++i) { - wait_threads.emplace_back(support::make_test_jthread(wait_func)); - } - - for (auto _ : state) { - done_count = 0; - start_flag.fetch_add(1); - start_flag.notify_all(); - while (done_count < num_waiting_threads) { - std::this_thread::yield(); - } - } - for (auto& t : wait_threads) { - t.request_stop(); - } - start_flag.fetch_add(1); - start_flag.notify_all(); - for (auto& t : wait_threads) { - t.join(); - } -} -BENCHMARK(BM_atomic_wait_multi_thread_one_atomic_wait)->RangeMultiplier(2)->Range(1 << 10, 1 << 20); - -void BM_atomic_wait_multi_thread_wait_different_atomics(benchmark::State& state) { - const std::uint64_t total_loop_test_param = state.range(0); - constexpr std::uint64_t num_atomics = 7; - std::vector<std::atomic<std::uint64_t>> atomics(num_atomics); - - 
auto notify_func = [&](std::stop_token st, size_t idx) { - while (!st.stop_requested()) { - atomics[idx].fetch_add(1, std::memory_order_relaxed); - atomics[idx].notify_all(); - } - }; - - std::atomic<std::uint64_t> start_flag = 0; - std::atomic<std::uint64_t> done_count = 0; - - auto wait_func = [&, total_loop_test_param](std::stop_token st, size_t idx) { - auto old_start = 0; - while (!st.stop_requested()) { - start_flag.wait(old_start); - old_start = start_flag.load(); - for (std::uint64_t i = 0; i < total_loop_test_param; ++i) { - auto old = atomics[idx].load(std::memory_order_relaxed); - atomics[idx].wait(old); - } - done_count.fetch_add(1); - } - }; - - std::vector<std::jthread> notify_threads; - notify_threads.reserve(num_atomics); - - std::vector<std::jthread> wait_threads; - wait_threads.reserve(num_atomics); - - for (size_t i = 0; i < num_atomics; ++i) { - notify_threads.emplace_back(support::make_test_jthread(notify_func, i)); - } - - for (size_t i = 0; i < num_atomics; ++i) { - wait_threads.emplace_back(support::make_test_jthread(wait_func, i)); - } - - for (auto _ : state) { - done_count = 0; - start_flag.fetch_add(1); - start_flag.notify_all(); - while (done_count < num_atomics) { - std::this_thread::yield(); - } - } - for (auto& t : wait_threads) { - t.request_stop(); - } - start_flag.fetch_add(1); - start_flag.notify_all(); - for (auto& t : wait_threads) { - t.join(); - } -} -BENCHMARK(BM_atomic_wait_multi_thread_wait_different_atomics)->RangeMultiplier(2)->Range(1 << 10, 1 << 20); - -BENCHMARK_MAIN(); diff --git libcxx/test/benchmarks/atomic_wait_1_waiter_1_notifier.bench.cpp libcxx/test/benchmarks/atomic_wait_1_waiter_1_notifier.bench.cpp new file mode 100644 index 000000000000..c3d7e6511925 --- /dev/null +++ libcxx/test/benchmarks/atomic_wait_1_waiter_1_notifier.bench.cpp @@ -0,0 +1,74 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 + +#include "atomic_wait_helper.h" + +#include <atomic> +#include <array> +#include <chrono> +#include <cstdint> +#include <numeric> +#include <stop_token> +#include <thread> + +#include "benchmark/benchmark.h" +#include "make_test_thread.h" + +using namespace std::chrono_literals; + +template <class NotifyPolicy, class NumPrioTasks> +void BM_1_atomic_1_waiter_1_notifier(benchmark::State& state) { + [[maybe_unused]] std::array<HighPrioTask, NumPrioTasks::value> tasks{}; + std::atomic<std::uint64_t> a; + auto thread_func = [&](std::stop_token st) { NotifyPolicy::notify(a, st); }; + + std::uint64_t total_loop_test_param = state.range(0); + + auto thread = support::make_test_jthread(thread_func); + + for (auto _ : state) { + for (std::uint64_t i = 0; i < total_loop_test_param; ++i) { + auto old = a.load(std::memory_order_relaxed); + a.wait(old); + } + } +} + +BENCHMARK(BM_1_atomic_1_waiter_1_notifier<KeepNotifying, NumHighPrioTasks<0>>) + ->RangeMultiplier(2) + ->Range(1 << 16, 1 << 18); +BENCHMARK(BM_1_atomic_1_waiter_1_notifier<NotifyEveryNus<50>, NumHighPrioTasks<0>>) + ->RangeMultiplier(2) + ->Range(1 << 10, 1 << 12); +BENCHMARK(BM_1_atomic_1_waiter_1_notifier<NotifyEveryNus<100>, NumHighPrioTasks<0>>) + ->RangeMultiplier(2) + ->Range(1 << 10, 1 << 12); + +BENCHMARK(BM_1_atomic_1_waiter_1_notifier<KeepNotifying, NumHighPrioTasks<4>>) + ->RangeMultiplier(2) + ->Range(1 << 16, 1 << 18); +BENCHMARK(BM_1_atomic_1_waiter_1_notifier<NotifyEveryNus<50>, NumHighPrioTasks<4>>) + ->RangeMultiplier(2) + ->Range(1 << 10, 1 << 12); +BENCHMARK(BM_1_atomic_1_waiter_1_notifier<NotifyEveryNus<100>, NumHighPrioTasks<4>>) + ->RangeMultiplier(2) + ->Range(1 << 10, 1 << 12); + +BENCHMARK(BM_1_atomic_1_waiter_1_notifier<KeepNotifying, NumHighPrioTasks<7>>) + ->RangeMultiplier(2) + ->Range(1 << 4, 1 << 6); +BENCHMARK(BM_1_atomic_1_waiter_1_notifier<NotifyEveryNus<50>, NumHighPrioTasks<7>>) + ->RangeMultiplier(2) + ->Range(1 << 3, 1 << 5); +BENCHMARK(BM_1_atomic_1_waiter_1_notifier<NotifyEveryNus<100>, NumHighPrioTasks<7>>) + ->RangeMultiplier(2) + ->Range(1 << 3, 1 << 5); + +BENCHMARK_MAIN(); diff --git libcxx/test/benchmarks/atomic_wait_N_waiter_N_notifier.bench.cpp libcxx/test/benchmarks/atomic_wait_N_waiter_N_notifier.bench.cpp new file mode 100644 index 000000000000..d9b9aa212f60 --- /dev/null +++ libcxx/test/benchmarks/atomic_wait_N_waiter_N_notifier.bench.cpp @@ -0,0 +1,167 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 + +#include "atomic_wait_helper.h" + +#include <atomic> +#include <cstdint> +#include <numeric> +#include <stop_token> +#include <pthread.h> +#include <sched.h> +#include <thread> +#include <chrono> +#include <array> + +#include "benchmark/benchmark.h" +#include "make_test_thread.h" + +using namespace std::chrono_literals; + +template <class NotifyPolicy, class NumberOfAtomics, class NumPrioTasks> +void BM_N_atomics_N_waiter_N_notifier(benchmark::State& state) { + [[maybe_unused]] std::array<HighPrioTask, NumPrioTasks::value> tasks{}; + const std::uint64_t total_loop_test_param = state.range(0); + constexpr std::uint64_t num_atomics = NumberOfAtomics::value; + std::vector<std::atomic<std::uint64_t>> atomics(num_atomics); + + auto notify_func = [&](std::stop_token st, size_t idx) { + while (!st.stop_requested()) { + NotifyPolicy::notify(atomics[idx], st); + } + }; + + std::atomic<std::uint64_t> start_flag = 0; + std::atomic<std::uint64_t> done_count = 0; + + auto wait_func = [&, total_loop_test_param](std::stop_token st, size_t idx) { + auto old_start = 0; + while (!st.stop_requested()) { + start_flag.wait(old_start); + old_start = start_flag.load(); + for (std::uint64_t i = 0; i < total_loop_test_param; ++i) { + auto old = atomics[idx].load(std::memory_order_relaxed); + atomics[idx].wait(old); + } + done_count.fetch_add(1); + } + }; + + std::vector<std::jthread> notify_threads; + notify_threads.reserve(num_atomics); + + std::vector<std::jthread> wait_threads; + wait_threads.reserve(num_atomics); + + for (size_t i = 0; i < num_atomics; ++i) { + notify_threads.emplace_back(support::make_test_jthread(notify_func, i)); + } + + for (size_t i = 0; i < num_atomics; ++i) { + wait_threads.emplace_back(support::make_test_jthread(wait_func, i)); + } + + for (auto _ : state) { + done_count = 0; + start_flag.fetch_add(1); + start_flag.notify_all(); + while (done_count < num_atomics) { + std::this_thread::yield(); + } + } + for (auto& t : wait_threads) { + t.request_stop(); + } + start_flag.fetch_add(1); + start_flag.notify_all(); + for (auto& t : wait_threads) { + t.join(); + } +} + +BENCHMARK(BM_N_atomics_N_waiter_N_notifier<KeepNotifying, NumberOfAtomics<2>, NumHighPrioTasks<0>>) + ->RangeMultiplier(2) + ->Range(1 << 12, 1 << 14); +BENCHMARK(BM_N_atomics_N_waiter_N_notifier<KeepNotifying, NumberOfAtomics<3>, NumHighPrioTasks<0>>) + ->RangeMultiplier(2) + ->Range(1 << 10, 1 << 12); +BENCHMARK(BM_N_atomics_N_waiter_N_notifier<KeepNotifying, NumberOfAtomics<5>, NumHighPrioTasks<0>>) + ->RangeMultiplier(2) + ->Range(1 << 10, 1 << 12); +BENCHMARK(BM_N_atomics_N_waiter_N_notifier<KeepNotifying, NumberOfAtomics<7>, NumHighPrioTasks<0>>) + ->RangeMultiplier(2) + ->Range(1 << 8, 1 << 10); + +BENCHMARK(BM_N_atomics_N_waiter_N_notifier<NotifyEveryNus<50>, NumberOfAtomics<2>, NumHighPrioTasks<0>>) + ->RangeMultiplier(2) + ->Range(1 << 10, 1 << 12); +BENCHMARK(BM_N_atomics_N_waiter_N_notifier<NotifyEveryNus<50>, NumberOfAtomics<3>, NumHighPrioTasks<0>>) + ->RangeMultiplier(2) + ->Range(1 << 8, 1 << 10); +BENCHMARK(BM_N_atomics_N_waiter_N_notifier<NotifyEveryNus<50>, NumberOfAtomics<5>, NumHighPrioTasks<0>>) + ->RangeMultiplier(2) + ->Range(1 << 8, 1 << 10); +BENCHMARK(BM_N_atomics_N_waiter_N_notifier<NotifyEveryNus<50>, NumberOfAtomics<7>, NumHighPrioTasks<0>>) + ->RangeMultiplier(2) + ->Range(1 << 6, 1 << 8); + 
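A note on the registration matrix above and below: each BENCHMARK line pins one combination of notify policy (KeepNotifying or NotifyEveryNus<N>), atomic count, and background SCHED_FIFO load, and the iteration Range shrinks as contention grows so individual runs stay bounded. To execute a single slice of the matrix, Google Benchmark's standard regex filter works on the instantiated names; the binary name shown is illustrative:

  ./atomic_wait_N_waiter_N_notifier.bench --benchmark_filter='NumHighPrioTasks<4>'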
+BENCHMARK(BM_N_atomics_N_waiter_N_notifier<NotifyEveryNus<100>, NumberOfAtomics<2>, NumHighPrioTasks<0>>) + ->RangeMultiplier(2) + ->Range(1 << 8, 1 << 10); +BENCHMARK(BM_N_atomics_N_waiter_N_notifier<NotifyEveryNus<100>, NumberOfAtomics<3>, NumHighPrioTasks<0>>) + ->RangeMultiplier(2) + ->Range(1 << 8, 1 << 10); +BENCHMARK(BM_N_atomics_N_waiter_N_notifier<NotifyEveryNus<100>, NumberOfAtomics<5>, NumHighPrioTasks<0>>) + ->RangeMultiplier(2) + ->Range(1 << 7, 1 << 9); +BENCHMARK(BM_N_atomics_N_waiter_N_notifier<NotifyEveryNus<100>, NumberOfAtomics<7>, NumHighPrioTasks<0>>) + ->RangeMultiplier(2) + ->Range(1 << 6, 1 << 8); + +BENCHMARK(BM_N_atomics_N_waiter_N_notifier<KeepNotifying, NumberOfAtomics<2>, NumHighPrioTasks<4>>) + ->RangeMultiplier(2) + ->Range(1 << 7, 1 << 9); +BENCHMARK(BM_N_atomics_N_waiter_N_notifier<KeepNotifying, NumberOfAtomics<3>, NumHighPrioTasks<4>>) + ->RangeMultiplier(2) + ->Range(1 << 7, 1 << 9); +BENCHMARK(BM_N_atomics_N_waiter_N_notifier<KeepNotifying, NumberOfAtomics<5>, NumHighPrioTasks<4>>) + ->RangeMultiplier(2) + ->Range(1 << 6, 1 << 8); +BENCHMARK(BM_N_atomics_N_waiter_N_notifier<KeepNotifying, NumberOfAtomics<7>, NumHighPrioTasks<4>>) + ->RangeMultiplier(2) + ->Range(1 << 4, 1 << 6); + +BENCHMARK(BM_N_atomics_N_waiter_N_notifier<NotifyEveryNus<50>, NumberOfAtomics<2>, NumHighPrioTasks<4>>) + ->RangeMultiplier(2) + ->Range(1 << 7, 1 << 9); +BENCHMARK(BM_N_atomics_N_waiter_N_notifier<NotifyEveryNus<50>, NumberOfAtomics<3>, NumHighPrioTasks<4>>) + ->RangeMultiplier(2) + ->Range(1 << 7, 1 << 9); +BENCHMARK(BM_N_atomics_N_waiter_N_notifier<NotifyEveryNus<50>, NumberOfAtomics<5>, NumHighPrioTasks<4>>) + ->RangeMultiplier(2) + ->Range(1 << 5, 1 << 7); +BENCHMARK(BM_N_atomics_N_waiter_N_notifier<NotifyEveryNus<50>, NumberOfAtomics<7>, NumHighPrioTasks<4>>) + ->RangeMultiplier(2) + ->Range(1 << 3, 1 << 5); + +BENCHMARK(BM_N_atomics_N_waiter_N_notifier<NotifyEveryNus<100>, NumberOfAtomics<2>, NumHighPrioTasks<4>>) + ->RangeMultiplier(2) + ->Range(1 << 6, 1 << 8); +BENCHMARK(BM_N_atomics_N_waiter_N_notifier<NotifyEveryNus<100>, NumberOfAtomics<3>, NumHighPrioTasks<4>>) + ->RangeMultiplier(2) + ->Range(1 << 6, 1 << 8); +BENCHMARK(BM_N_atomics_N_waiter_N_notifier<NotifyEveryNus<100>, NumberOfAtomics<5>, NumHighPrioTasks<4>>) + ->RangeMultiplier(2) + ->Range(1 << 5, 1 << 7); +BENCHMARK(BM_N_atomics_N_waiter_N_notifier<NotifyEveryNus<100>, NumberOfAtomics<7>, NumHighPrioTasks<4>>) + ->RangeMultiplier(2) + ->Range(1 << 3, 1 << 5); + +BENCHMARK_MAIN(); diff --git libcxx/test/benchmarks/atomic_wait_helper.h libcxx/test/benchmarks/atomic_wait_helper.h new file mode 100644 index 000000000000..cfdacf9e0168 --- /dev/null +++ libcxx/test/benchmarks/atomic_wait_helper.h @@ -0,0 +1,92 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TEST_BENCHMARK_ATOMIC_WAIT_HELPER_H
+#define TEST_BENCHMARK_ATOMIC_WAIT_HELPER_H
+
+#include <atomic>
+#include <chrono>
+#include <stdexcept>
+#include <stop_token>
+#include <thread>
+
+struct HighPrioTask {
+  sched_param param;
+  pthread_attr_t attr_t;
+  pthread_t thread;
+  std::atomic_bool stopped{false};
+
+  HighPrioTask(const HighPrioTask&) = delete;
+
+  HighPrioTask() {
+    pthread_attr_init(&attr_t);
+    pthread_attr_setschedpolicy(&attr_t, SCHED_FIFO);
+    param.sched_priority = sched_get_priority_max(SCHED_FIFO);
+    pthread_attr_setschedparam(&attr_t, &param);
+    pthread_attr_setinheritsched(&attr_t, PTHREAD_EXPLICIT_SCHED);
+
+    auto thread_fun = [](void* arg) -> void* {
+      auto* stop = reinterpret_cast<std::atomic_bool*>(arg);
+      while (!stop->load(std::memory_order_relaxed)) {
+        // spin
+      }
+      return nullptr;
+    };
+
+    if (pthread_create(&thread, &attr_t, thread_fun, &stopped) != 0) {
+      throw std::runtime_error("failed to create thread");
+    }
+  }
+
+  ~HighPrioTask() {
+    stopped = true;
+    pthread_attr_destroy(&attr_t);
+    pthread_join(thread, nullptr);
+  }
+};
+
+template <std::size_t N>
+struct NumHighPrioTasks {
+  static constexpr auto value = N;
+};
+
+template <std::size_t N>
+struct NumWaitingThreads {
+  static constexpr auto value = N;
+};
+
+template <std::size_t N>
+struct NumberOfAtomics {
+  static constexpr auto value = N;
+};
+
+struct KeepNotifying {
+  template <class Atomic>
+  static void notify(Atomic& a, std::stop_token st) {
+    while (!st.stop_requested()) {
+      a.fetch_add(1, std::memory_order_relaxed);
+      a.notify_all();
+    }
+  }
+};
+
+template <std::size_t N>
+struct NotifyEveryNus {
+  template <class Atomic>
+  static void notify(Atomic& a, std::stop_token st) {
+    while (!st.stop_requested()) {
+      auto start = std::chrono::system_clock::now();
+      a.fetch_add(1, std::memory_order_relaxed);
+      a.notify_all();
+      while (std::chrono::system_clock::now() - start < std::chrono::microseconds{N}) {
+      }
+    }
+  }
+};
+
+#endif // TEST_BENCHMARK_ATOMIC_WAIT_HELPER_H
\ No newline at end of file
diff --git libcxx/test/benchmarks/atomic_wait_multi_waiter_1_notifier.bench.cpp libcxx/test/benchmarks/atomic_wait_multi_waiter_1_notifier.bench.cpp
new file mode 100644
index 000000000000..a14a6a2ad9c9
--- /dev/null
+++ libcxx/test/benchmarks/atomic_wait_multi_waiter_1_notifier.bench.cpp
@@ -0,0 +1,167 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 + +#include "atomic_wait_helper.h" + +#include <atomic> +#include <cstdint> +#include <numeric> +#include <stop_token> +#include <thread> +#include <chrono> +#include <array> + +#include "benchmark/benchmark.h" +#include "make_test_thread.h" + +using namespace std::chrono_literals; + +template <class NotifyPolicy, class NumWaitingThreads, class NumPrioTasks> +void BM_1_atomic_multi_waiter_1_notifier(benchmark::State& state) { + [[maybe_unused]] std::array<HighPrioTask, NumPrioTasks::value> tasks{}; + + std::atomic<std::uint64_t> a; + auto notify_func = [&](std::stop_token st) { NotifyPolicy::notify(a, st); }; + + std::uint64_t total_loop_test_param = state.range(0); + constexpr auto num_waiting_threads = NumWaitingThreads::value; + std::vector<std::jthread> wait_threads; + wait_threads.reserve(num_waiting_threads); + + auto notify_thread = support::make_test_jthread(notify_func); + + std::atomic<std::uint64_t> start_flag = 0; + std::atomic<std::uint64_t> done_count = 0; + auto wait_func = [&a, &start_flag, &done_count, total_loop_test_param](std::stop_token st) { + auto old_start = 0; + while (!st.stop_requested()) { + start_flag.wait(old_start); + old_start = start_flag.load(); + for (std::uint64_t i = 0; i < total_loop_test_param; ++i) { + auto old = a.load(std::memory_order_relaxed); + a.wait(old); + } + done_count.fetch_add(1); + } + }; + + for (size_t i = 0; i < num_waiting_threads; ++i) { + wait_threads.emplace_back(support::make_test_jthread(wait_func)); + } + + for (auto _ : state) { + done_count = 0; + start_flag.fetch_add(1); + start_flag.notify_all(); + while (done_count < num_waiting_threads) { + std::this_thread::yield(); + } + } + for (auto& t : wait_threads) { + t.request_stop(); + } + start_flag.fetch_add(1); + start_flag.notify_all(); + for (auto& t : wait_threads) { + t.join(); + } +} + +BENCHMARK(BM_1_atomic_multi_waiter_1_notifier<KeepNotifying, NumWaitingThreads<3>, NumHighPrioTasks<0>>) + ->RangeMultiplier(2) + ->Range(1 << 14, 1 << 16); +BENCHMARK(BM_1_atomic_multi_waiter_1_notifier<KeepNotifying, NumWaitingThreads<7>, NumHighPrioTasks<0>>) + ->RangeMultiplier(2) + ->Range(1 << 12, 1 << 14); +BENCHMARK(BM_1_atomic_multi_waiter_1_notifier<KeepNotifying, NumWaitingThreads<15>, NumHighPrioTasks<0>>) + ->RangeMultiplier(2) + ->Range(1 << 10, 1 << 12); + +BENCHMARK(BM_1_atomic_multi_waiter_1_notifier<NotifyEveryNus<50>, NumWaitingThreads<3>, NumHighPrioTasks<0>>) + ->RangeMultiplier(2) + ->Range(1 << 10, 1 << 12); +BENCHMARK(BM_1_atomic_multi_waiter_1_notifier<NotifyEveryNus<50>, NumWaitingThreads<7>, NumHighPrioTasks<0>>) + ->RangeMultiplier(2) + ->Range(1 << 8, 1 << 10); +BENCHMARK(BM_1_atomic_multi_waiter_1_notifier<NotifyEveryNus<50>, NumWaitingThreads<15>, NumHighPrioTasks<0>>) + ->RangeMultiplier(2) + ->Range(1 << 6, 1 << 8); + +BENCHMARK(BM_1_atomic_multi_waiter_1_notifier<NotifyEveryNus<100>, NumWaitingThreads<3>, NumHighPrioTasks<0>>) + ->RangeMultiplier(2) + ->Range(1 << 8, 1 << 10); +BENCHMARK(BM_1_atomic_multi_waiter_1_notifier<NotifyEveryNus<100>, NumWaitingThreads<7>, NumHighPrioTasks<0>>) + ->RangeMultiplier(2) + ->Range(1 << 6, 1 << 8); +BENCHMARK(BM_1_atomic_multi_waiter_1_notifier<NotifyEveryNus<100>, NumWaitingThreads<15>, NumHighPrioTasks<0>>) + ->RangeMultiplier(2) + ->Range(1 << 4, 1 << 6); + +BENCHMARK(BM_1_atomic_multi_waiter_1_notifier<KeepNotifying, 
NumWaitingThreads<3>, NumHighPrioTasks<4>>) + ->RangeMultiplier(2) + ->Range(1 << 8, 1 << 10); +BENCHMARK(BM_1_atomic_multi_waiter_1_notifier<KeepNotifying, NumWaitingThreads<7>, NumHighPrioTasks<4>>) + ->RangeMultiplier(2) + ->Range(1 << 6, 1 << 8); +BENCHMARK(BM_1_atomic_multi_waiter_1_notifier<KeepNotifying, NumWaitingThreads<15>, NumHighPrioTasks<4>>) + ->RangeMultiplier(2) + ->Range(1 << 4, 1 << 6); + +BENCHMARK(BM_1_atomic_multi_waiter_1_notifier<NotifyEveryNus<50>, NumWaitingThreads<3>, NumHighPrioTasks<4>>) + ->RangeMultiplier(2) + ->Range(1 << 8, 1 << 10); +BENCHMARK(BM_1_atomic_multi_waiter_1_notifier<NotifyEveryNus<50>, NumWaitingThreads<7>, NumHighPrioTasks<4>>) + ->RangeMultiplier(2) + ->Range(1 << 6, 1 << 8); +BENCHMARK(BM_1_atomic_multi_waiter_1_notifier<NotifyEveryNus<50>, NumWaitingThreads<15>, NumHighPrioTasks<4>>) + ->RangeMultiplier(2) + ->Range(1 << 4, 1 << 6); + +BENCHMARK(BM_1_atomic_multi_waiter_1_notifier<NotifyEveryNus<100>, NumWaitingThreads<3>, NumHighPrioTasks<4>>) + ->RangeMultiplier(2) + ->Range(1 << 8, 1 << 10); +BENCHMARK(BM_1_atomic_multi_waiter_1_notifier<NotifyEveryNus<100>, NumWaitingThreads<7>, NumHighPrioTasks<4>>) + ->RangeMultiplier(2) + ->Range(1 << 6, 1 << 8); +BENCHMARK(BM_1_atomic_multi_waiter_1_notifier<NotifyEveryNus<100>, NumWaitingThreads<15>, NumHighPrioTasks<4>>) + ->RangeMultiplier(2) + ->Range(1 << 4, 1 << 6); + +BENCHMARK(BM_1_atomic_multi_waiter_1_notifier<KeepNotifying, NumWaitingThreads<3>, NumHighPrioTasks<7>>) + ->RangeMultiplier(2) + ->Range(1 << 4, 1 << 6); +BENCHMARK(BM_1_atomic_multi_waiter_1_notifier<KeepNotifying, NumWaitingThreads<7>, NumHighPrioTasks<7>>) + ->RangeMultiplier(2) + ->Range(1 << 3, 1 << 5); +BENCHMARK(BM_1_atomic_multi_waiter_1_notifier<KeepNotifying, NumWaitingThreads<15>, NumHighPrioTasks<7>>) + ->RangeMultiplier(2) + ->Range(1 << 2, 1 << 4); + +BENCHMARK(BM_1_atomic_multi_waiter_1_notifier<NotifyEveryNus<50>, NumWaitingThreads<3>, NumHighPrioTasks<7>>) + ->RangeMultiplier(2) + ->Range(1 << 3, 1 << 5); +BENCHMARK(BM_1_atomic_multi_waiter_1_notifier<NotifyEveryNus<50>, NumWaitingThreads<7>, NumHighPrioTasks<7>>) + ->RangeMultiplier(2) + ->Range(1 << 2, 1 << 4); +BENCHMARK(BM_1_atomic_multi_waiter_1_notifier<NotifyEveryNus<50>, NumWaitingThreads<15>, NumHighPrioTasks<7>>) + ->RangeMultiplier(2) + ->Range(1 << 1, 1 << 3); + +BENCHMARK(BM_1_atomic_multi_waiter_1_notifier<NotifyEveryNus<100>, NumWaitingThreads<3>, NumHighPrioTasks<7>>) + ->RangeMultiplier(2) + ->Range(1 << 3, 1 << 5); +BENCHMARK(BM_1_atomic_multi_waiter_1_notifier<NotifyEveryNus<100>, NumWaitingThreads<7>, NumHighPrioTasks<7>>) + ->RangeMultiplier(2) + ->Range(1 << 2, 1 << 4); +BENCHMARK(BM_1_atomic_multi_waiter_1_notifier<NotifyEveryNus<100>, NumWaitingThreads<15>, NumHighPrioTasks<7>>) + ->RangeMultiplier(2) + ->Range(1 << 1, 1 << 3); + +BENCHMARK_MAIN(); diff --git libcxx/test/libcxx/containers/associative/non_const_comparator.verify.cpp libcxx/test/libcxx/containers/associative/non_const_comparator.verify.cpp index 3bb59f1920f6..cb7a044abd8c 100644 --- libcxx/test/libcxx/containers/associative/non_const_comparator.verify.cpp +++ libcxx/test/libcxx/containers/associative/non_const_comparator.verify.cpp @@ -12,9 +12,9 @@ // Test that libc++ generates a warning diagnostic when the container is // provided a non-const callable comparator. 
+#include <__type_traits/invoke.h> #include <map> #include <set> -#include <type_traits> // for __invokable struct BadCompare { template <class T, class U> @@ -24,8 +24,8 @@ struct BadCompare { }; void f() { - static_assert(!std::__invokable<BadCompare const&, int const&, int const&>::value, ""); - static_assert(std::__invokable<BadCompare&, int const&, int const&>::value, ""); + static_assert(!std::__is_invocable_v<BadCompare const&, int const&, int const&>, ""); + static_assert(std::__is_invocable_v<BadCompare&, int const&, int const&>, ""); // expected-warning@set:* 2 {{the specified comparator type does not provide a viable const call operator}} // expected-warning@map:* 2 {{the specified comparator type does not provide a viable const call operator}} diff --git libcxx/test/libcxx/containers/unord/non_const_comparator.verify.cpp libcxx/test/libcxx/containers/unord/non_const_comparator.verify.cpp index a5d529dacef4..c3418302d031 100644 --- libcxx/test/libcxx/containers/unord/non_const_comparator.verify.cpp +++ libcxx/test/libcxx/containers/unord/non_const_comparator.verify.cpp @@ -12,6 +12,7 @@ // Test that libc++ generates a warning diagnostic when the container is // provided a non-const callable comparator or a non-const hasher. +#include <__type_traits/invoke.h> #include <unordered_set> #include <unordered_map> @@ -30,8 +31,8 @@ struct BadEqual { }; void f() { - static_assert(!std::__invokable<BadEqual const&, int const&, int const&>::value, ""); - static_assert(std::__invokable<BadEqual&, int const&, int const&>::value, ""); + static_assert(!std::__is_invocable_v<BadEqual const&, int const&, int const&>, ""); + static_assert(std::__is_invocable_v<BadEqual&, int const&, int const&>, ""); // expected-warning@unordered_set:* 2 {{the specified comparator type does not provide a viable const call operator}} // expected-warning@unordered_map:* 2 {{the specified comparator type does not provide a viable const call operator}} diff --git libcxx/test/libcxx/utilities/function.objects/func.require/bullet_1_2_3.pass.cpp libcxx/test/libcxx/utilities/function.objects/func.require/bullet_1_2_3.pass.cpp index 4d1010522f2f..48460d1488fd 100644 --- libcxx/test/libcxx/utilities/function.objects/func.require/bullet_1_2_3.pass.cpp +++ libcxx/test/libcxx/utilities/function.objects/func.require/bullet_1_2_3.pass.cpp @@ -262,8 +262,8 @@ struct ReferenceWrapper { constexpr operator Type&() const noexcept { return *ptr; } template <class... _ArgTypes> - constexpr typename std::__invoke_of<Type&, _ArgTypes...>::type operator() (_ArgTypes&&... __args) const { - return std::__invoke(get(), std::forward<_ArgTypes>(__args)...); + constexpr std::__invoke_result_t<Type&, _ArgTypes...> operator()(_ArgTypes&&... __args) const { + return std::__invoke(get(), std::forward<_ArgTypes>(__args)...); } }; diff --git libcxx/test/std/input.output/file.streams/fstreams/ifstream.members/offset_range.pass.cpp libcxx/test/std/input.output/file.streams/fstreams/ifstream.members/offset_range.pass.cpp index 5afd4465db31..6ffe750564c2 100644 --- libcxx/test/std/input.output/file.streams/fstreams/ifstream.members/offset_range.pass.cpp +++ libcxx/test/std/input.output/file.streams/fstreams/ifstream.members/offset_range.pass.cpp @@ -28,6 +28,11 @@ // // XFAIL: target=powerpc-{{.+}}-aix{{.*}} +// By default, off_t is typically a 32-bit integer on ARMv7 Linux systems, +// meaning it can represent file sizes up to 2GB (2^31 bytes) only. 
+// +// UNSUPPORTED: target=armv7-unknown-linux-gnueabihf + #include <fstream> #include <iostream> #include <cassert> diff --git lld/COFF/Chunks.cpp lld/COFF/Chunks.cpp index 115e3457db69..ff3c89884c24 100644 --- lld/COFF/Chunks.cpp +++ lld/COFF/Chunks.cpp @@ -1183,7 +1183,7 @@ size_t Arm64XDynamicRelocEntry::getSize() const { void Arm64XDynamicRelocEntry::writeTo(uint8_t *buf) const { auto out = reinterpret_cast<ulittle16_t *>(buf); - *out = (offset & 0xfff) | (type << 12); + *out = (offset.get() & 0xfff) | (type << 12); switch (type) { case IMAGE_DVRT_ARM64X_FIXUP_TYPE_VALUE: @@ -1211,14 +1211,19 @@ void Arm64XDynamicRelocEntry::writeTo(uint8_t *buf) const { void DynamicRelocsChunk::finalize() { llvm::stable_sort(arm64xRelocs, [=](const Arm64XDynamicRelocEntry &a, const Arm64XDynamicRelocEntry &b) { - return a.offset < b.offset; + return a.offset.get() < b.offset.get(); }); - size = sizeof(coff_dynamic_reloc_table) + sizeof(coff_dynamic_relocation64) + - sizeof(coff_base_reloc_block_header); + size = sizeof(coff_dynamic_reloc_table) + sizeof(coff_dynamic_relocation64); + uint32_t prevPage = 0xfff; for (const Arm64XDynamicRelocEntry &entry : arm64xRelocs) { - assert(!(entry.offset & ~0xfff)); // Not yet supported. + uint32_t page = entry.offset.get() & ~0xfff; + if (page != prevPage) { + size = alignTo(size, sizeof(uint32_t)) + + sizeof(coff_base_reloc_block_header); + prevPage = page; + } size += entry.getSize(); } @@ -1235,17 +1240,31 @@ void DynamicRelocsChunk::writeTo(uint8_t *buf) const { header->Symbol = IMAGE_DYNAMIC_RELOCATION_ARM64X; buf += sizeof(*header); - auto pageHeader = reinterpret_cast<coff_base_reloc_block_header *>(buf); - pageHeader->BlockSize = sizeof(*pageHeader); + coff_base_reloc_block_header *pageHeader = nullptr; + size_t relocSize = 0; for (const Arm64XDynamicRelocEntry &entry : arm64xRelocs) { - entry.writeTo(buf + pageHeader->BlockSize); - pageHeader->BlockSize += entry.getSize(); + uint32_t page = entry.offset.get() & ~0xfff; + if (!pageHeader || page != pageHeader->PageRVA) { + relocSize = alignTo(relocSize, sizeof(uint32_t)); + if (pageHeader) + pageHeader->BlockSize = + buf + relocSize - reinterpret_cast<uint8_t *>(pageHeader); + pageHeader = + reinterpret_cast<coff_base_reloc_block_header *>(buf + relocSize); + pageHeader->PageRVA = page; + relocSize += sizeof(*pageHeader); + } + + entry.writeTo(buf + relocSize); + relocSize += entry.getSize(); } - pageHeader->BlockSize = alignTo(pageHeader->BlockSize, sizeof(uint32_t)); + relocSize = alignTo(relocSize, sizeof(uint32_t)); + pageHeader->BlockSize = + buf + relocSize - reinterpret_cast<uint8_t *>(pageHeader); - header->BaseRelocSize = pageHeader->BlockSize; - table->Size += header->BaseRelocSize; - assert(size == sizeof(*table) + sizeof(*header) + header->BaseRelocSize); + header->BaseRelocSize = relocSize; + table->Size += relocSize; + assert(size == sizeof(*table) + sizeof(*header) + relocSize); } } // namespace lld::coff diff --git lld/COFF/Chunks.h lld/COFF/Chunks.h index 46fd8e21dce6..7ba58e336451 100644 --- lld/COFF/Chunks.h +++ lld/COFF/Chunks.h @@ -851,13 +851,13 @@ private: class Arm64XDynamicRelocEntry { public: Arm64XDynamicRelocEntry(llvm::COFF::Arm64XFixupType type, uint8_t size, - uint32_t offset, Arm64XRelocVal value) + Arm64XRelocVal offset, Arm64XRelocVal value) : offset(offset), value(value), type(type), size(size) {} size_t getSize() const; void writeTo(uint8_t *buf) const; - uint32_t offset; + Arm64XRelocVal offset; Arm64XRelocVal value; private: @@ -873,8 +873,8 @@ public: void 
writeTo(uint8_t *buf) const override; void finalize(); - void add(llvm::COFF::Arm64XFixupType type, uint8_t size, uint32_t offset, - Arm64XRelocVal value) { + void add(llvm::COFF::Arm64XFixupType type, uint8_t size, + Arm64XRelocVal offset, Arm64XRelocVal value) { arm64xRelocs.emplace_back(type, size, offset, value); } diff --git lld/COFF/Config.h lld/COFF/Config.h index 924560fef023..cd280aa09964 100644 --- lld/COFF/Config.h +++ lld/COFF/Config.h @@ -115,6 +115,7 @@ struct Configuration { enum ManifestKind { Default, SideBySide, Embed, No }; bool is64() const { return llvm::COFF::is64Bit(machine); } + std::unique_ptr<MemoryBuffer> dosStub; llvm::COFF::MachineTypes machine = IMAGE_FILE_MACHINE_UNKNOWN; bool machineInferred = false; size_t wordsize; @@ -161,8 +162,6 @@ struct Configuration { bool dll = false; StringRef implib; bool noimplib = false; - std::vector<Export> exports; - bool hadExplicitExports; std::set<std::string> delayLoads; std::map<std::string, int> dllOrder; Symbol *delayLoadHelper = nullptr; diff --git lld/COFF/DLL.cpp lld/COFF/DLL.cpp index 875ada9d6053..6a3f8eb21e84 100644 --- lld/COFF/DLL.cpp +++ lld/COFF/DLL.cpp @@ -639,22 +639,22 @@ public: class AddressTableChunk : public NonSectionChunk { public: - explicit AddressTableChunk(COFFLinkerContext &ctx, size_t baseOrdinal, + explicit AddressTableChunk(SymbolTable &symtab, size_t baseOrdinal, size_t maxOrdinal) : baseOrdinal(baseOrdinal), size((maxOrdinal - baseOrdinal) + 1), - ctx(ctx) {} + symtab(symtab) {} size_t getSize() const override { return size * 4; } void writeTo(uint8_t *buf) const override { memset(buf, 0, getSize()); - for (const Export &e : ctx.config.exports) { + for (const Export &e : symtab.exports) { assert(e.ordinal >= baseOrdinal && "Export symbol has invalid ordinal"); // Subtract the OrdinalBase to get the index. uint8_t *p = buf + (e.ordinal - baseOrdinal) * 4; uint32_t bit = 0; // Pointer to thumb code must have the LSB set, so adjust it. 
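+      // (For example, a Thumb function at RVA 0x1000 is stored in the export + // address table as 0x1001.)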
- if (ctx.config.machine == ARMNT && !e.data) + if (symtab.machine == ARMNT && !e.data) bit = 1; if (e.forwardChunk) { write32le(p, e.forwardChunk->getRVA() | bit); @@ -669,7 +669,7 @@ public: private: size_t baseOrdinal; size_t size; - const COFFLinkerContext &ctx; + const SymbolTable &symtab; }; class NamePointersChunk : public NonSectionChunk { @@ -690,13 +690,13 @@ private: class ExportOrdinalChunk : public NonSectionChunk { public: - explicit ExportOrdinalChunk(const COFFLinkerContext &ctx, size_t baseOrdinal, + explicit ExportOrdinalChunk(const SymbolTable &symtab, size_t baseOrdinal, size_t tableSize) - : baseOrdinal(baseOrdinal), size(tableSize), ctx(ctx) {} + : baseOrdinal(baseOrdinal), size(tableSize), symtab(symtab) {} size_t getSize() const override { return size * 2; } void writeTo(uint8_t *buf) const override { - for (const Export &e : ctx.config.exports) { + for (const Export &e : symtab.exports) { if (e.noname) continue; assert(e.ordinal >= baseOrdinal && "Export symbol has invalid ordinal"); @@ -709,7 +709,7 @@ public: private: size_t baseOrdinal; size_t size; - const COFFLinkerContext &ctx; + const SymbolTable &symtab; }; } // anonymous namespace @@ -920,9 +920,9 @@ Chunk *DelayLoadContents::newThunkChunk(DefinedImportData *s, } } -EdataContents::EdataContents(COFFLinkerContext &ctx) : ctx(ctx) { +void createEdataChunks(SymbolTable &symtab, std::vector<Chunk *> &chunks) { unsigned baseOrdinal = 1 << 16, maxOrdinal = 0; - for (Export &e : ctx.config.exports) { + for (Export &e : symtab.exports) { baseOrdinal = std::min(baseOrdinal, (unsigned)e.ordinal); maxOrdinal = std::max(maxOrdinal, (unsigned)e.ordinal); } @@ -930,15 +930,16 @@ EdataContents::EdataContents(COFFLinkerContext &ctx) : ctx(ctx) { // https://learn.microsoft.com/en-us/cpp/build/reference/export-exports-a-function?view=msvc-170 assert(baseOrdinal >= 1); - auto *dllName = make<StringChunk>(sys::path::filename(ctx.config.outputFile)); - auto *addressTab = make<AddressTableChunk>(ctx, baseOrdinal, maxOrdinal); + auto *dllName = + make<StringChunk>(sys::path::filename(symtab.ctx.config.outputFile)); + auto *addressTab = make<AddressTableChunk>(symtab, baseOrdinal, maxOrdinal); std::vector<Chunk *> names; - for (Export &e : ctx.config.exports) + for (Export &e : symtab.exports) if (!e.noname) names.push_back(make<StringChunk>(e.exportName)); std::vector<Chunk *> forwards; - for (Export &e : ctx.config.exports) { + for (Export &e : symtab.exports) { if (e.forwardTo.empty()) continue; e.forwardChunk = make<StringChunk>(e.forwardTo); @@ -946,7 +947,8 @@ EdataContents::EdataContents(COFFLinkerContext &ctx) : ctx(ctx) { } auto *nameTab = make<NamePointersChunk>(names); - auto *ordinalTab = make<ExportOrdinalChunk>(ctx, baseOrdinal, names.size()); + auto *ordinalTab = + make<ExportOrdinalChunk>(symtab, baseOrdinal, names.size()); auto *dir = make<ExportDirectoryChunk>(baseOrdinal, maxOrdinal, names.size(), dllName, addressTab, nameTab, ordinalTab); diff --git lld/COFF/DLL.h lld/COFF/DLL.h index f7d2b57a20a0..724a323d62d2 100644 --- lld/COFF/DLL.h +++ lld/COFF/DLL.h @@ -77,20 +77,8 @@ private: COFFLinkerContext &ctx; }; -// Windows-specific. -// EdataContents creates all chunks for the DLL export table. 
-class EdataContents { -public: - EdataContents(COFFLinkerContext &ctx); - std::vector<Chunk *> chunks; - - uint64_t getRVA() { return chunks[0]->getRVA(); } - uint64_t getSize() { - return chunks.back()->getRVA() + chunks.back()->getSize() - getRVA(); - } - - COFFLinkerContext &ctx; -}; +// Create all chunks for the DLL export table. +void createEdataChunks(SymbolTable &symtab, std::vector<Chunk *> &chunks); } // namespace lld::coff diff --git lld/COFF/Driver.cpp lld/COFF/Driver.cpp index 898c6c17d206..4e0678282eed 100644 --- lld/COFF/Driver.cpp +++ lld/COFF/Driver.cpp @@ -458,7 +458,7 @@ void LinkerDriver::parseDirectives(InputFile *file) { // declarations, many object files may end up with having the // same /EXPORT options. In order to save cost of parsing them, // we dedup them first. - if (!directivesExports.insert(e).second) + if (!file->symtab.directivesExports.insert(e).second) continue; Export exp = parseExport(e); @@ -469,7 +469,7 @@ void LinkerDriver::parseDirectives(InputFile *file) { exp.extName = saver().save("_" + exp.extName); } exp.source = ExportSource::Directives; - ctx.config.exports.push_back(exp); + file->symtab.exports.push_back(exp); } // Handle /include: in bulk. @@ -956,7 +956,7 @@ std::string LinkerDriver::getImportName(bool asLib) { void LinkerDriver::createImportLibrary(bool asLib) { llvm::TimeTraceScope timeScope("Create import library"); std::vector<COFFShortExport> exports; - for (Export &e1 : ctx.config.exports) { + for (Export &e1 : ctx.symtab.exports) { COFFShortExport e2; e2.Name = std::string(e1.name); e2.SymbolName = std::string(e1.symbolName); @@ -1069,7 +1069,7 @@ void LinkerDriver::parseModuleDefs(StringRef path) { e2.isPrivate = e1.Private; e2.constant = e1.Constant; e2.source = ExportSource::ModuleDefinition; - ctx.config.exports.push_back(e2); + ctx.symtab.exports.push_back(e2); } } @@ -1222,8 +1222,10 @@ static void findKeepUniqueSections(COFFLinkerContext &ctx) { // Exported symbols could be address-significant in other executables or DSOs, // so we conservatively mark them as address-significant. - for (Export &r : ctx.config.exports) - markAddrsig(r.sym); + ctx.forEachSymtab([](SymbolTable &symtab) { + for (Export &r : symtab.exports) + markAddrsig(r.sym); + }); // Visit the address-significance table in each object file and mark each // referenced symbol as address-significant. @@ -1376,13 +1378,13 @@ void LinkerDriver::maybeCreateECExportThunk(StringRef name, Symbol *&sym) { void LinkerDriver::createECExportThunks() { // Check if EXP+ symbols have corresponding $hp_target symbols and use them // to create export thunks when available. - for (Symbol *s : ctx.symtab.expSymbols) { + for (Symbol *s : ctx.symtabEC->expSymbols) { if (!s->isUsedInRegularObj) continue; assert(s->getName().starts_with("EXP+")); std::string targetName = (s->getName().substr(strlen("EXP+")) + "$hp_target").str(); - Symbol *sym = ctx.symtab.find(targetName); + Symbol *sym = ctx.symtabEC->find(targetName); if (!sym) continue; Defined *targetSym; @@ -1407,7 +1409,7 @@ void LinkerDriver::createECExportThunks() { if (ctx.symtabEC->entry) maybeCreateECExportThunk(ctx.symtabEC->entry->getName(), ctx.symtabEC->entry); - for (Export &e : ctx.config.exports) { + for (Export &e : ctx.symtabEC->exports) { if (!e.data) maybeCreateECExportThunk(e.extName.empty() ? 
e.name : e.extName, e.sym); } @@ -1430,7 +1432,7 @@ void LinkerDriver::maybeExportMinGWSymbols(const opt::InputArgList &args) { if (!ctx.config.dll) return; - if (!ctx.config.exports.empty()) + if (!ctx.symtab.exports.empty()) return; if (args.hasArg(OPT_exclude_all_symbols)) return; @@ -1466,7 +1468,7 @@ void LinkerDriver::maybeExportMinGWSymbols(const opt::InputArgList &args) { if (!(c->getOutputCharacteristics() & IMAGE_SCN_MEM_EXECUTE)) e.data = true; s->isUsedInRegularObj = true; - ctx.config.exports.push_back(e); + ctx.symtab.exports.push_back(e); }); } @@ -2298,6 +2300,10 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) { config->noSEH = args.hasArg(OPT_noseh); } + // Handle /stub + if (auto *arg = args.getLastArg(OPT_stub)) + parseDosStub(arg->getValue()); + // Handle /functionpadmin for (auto *arg : args.filtered(OPT_functionpadmin, OPT_functionpadmin_opt)) parseFunctionPadMin(arg); @@ -2339,7 +2345,7 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) { if (!e.extName.empty() && !isDecorated(e.extName)) e.extName = saver().save("_" + e.extName); } - config->exports.push_back(e); + mainSymtab.exports.push_back(e); } } @@ -2351,7 +2357,7 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) { // Handle generation of import library from a def file. if (!args.hasArg(OPT_INPUT, OPT_wholearchive_file)) { - fixupExports(); + ctx.forEachSymtab([](SymbolTable &symtab) { symtab.fixupExports(); }); if (!config->noimplib) createImportLibrary(/*asLib=*/true); return; @@ -2537,16 +2543,16 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) { // search for its mangled names. if (symtab.entry) symtab.mangleMaybe(symtab.entry); - }); - // Windows specific -- Make sure we resolve all dllexported symbols. - for (Export &e : config->exports) { - if (!e.forwardTo.empty()) - continue; - e.sym = ctx.symtab.addGCRoot(e.name, !e.data); - if (e.source != ExportSource::Directives) - e.symbolName = ctx.symtab.mangleMaybe(e.sym); - } + // Windows specific -- Make sure we resolve all dllexported symbols. + for (Export &e : symtab.exports) { + if (!e.forwardTo.empty()) + continue; + e.sym = symtab.addGCRoot(e.name, !e.data); + if (e.source != ExportSource::Directives) + e.symbolName = symtab.mangleMaybe(e.sym); + } + }); // Add weak aliases. Weak aliases is a mechanism to give remaining // undefined symbols final chance to be resolved successfully. @@ -2647,7 +2653,9 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) { if (errorCount()) return; - config->hadExplicitExports = !config->exports.empty(); + ctx.forEachSymtab([](SymbolTable &symtab) { + symtab.hadExplicitExports = !symtab.exports.empty(); + }); if (config->mingw) { // In MinGW, all symbols are automatically exported if no symbols // are chosen to be exported. @@ -2712,17 +2720,18 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) { // Windows specific -- when we are creating a .dll file, we also // need to create a .lib file. In MinGW mode, we only do that when the // -implib option is given explicitly, for compatibility with GNU ld. 
-  if (!config->exports.empty() || config->dll) { +  if (!ctx.symtab.exports.empty() || config->dll) { llvm::TimeTraceScope timeScope("Create .lib exports"); -    fixupExports(); +    ctx.forEachSymtab([](SymbolTable &symtab) { symtab.fixupExports(); }); if (!config->noimplib && (!config->mingw || !config->implib.empty())) createImportLibrary(/*asLib=*/false); -    assignExportOrdinals(); +    ctx.forEachSymtab( +        [](SymbolTable &symtab) { symtab.assignExportOrdinals(); }); } // Handle /output-def (MinGW specific). if (auto *arg = args.getLastArg(OPT_output_def)) -    writeDefFile(ctx, arg->getValue(), config->exports); +    writeDefFile(ctx, arg->getValue(), ctx.symtab.exports); // Set extra alignment for .comm symbols for (auto pair : config->alignComm) { diff --git lld/COFF/Driver.h lld/COFF/Driver.h index 8ce2e13129ba..12724cbd1eef 100644 --- lld/COFF/Driver.h +++ lld/COFF/Driver.h @@ -182,7 +182,6 @@ private: std::list<std::function<void()>> taskQueue; std::vector<MemoryBufferRef> resources; -  llvm::DenseSet<StringRef> directivesExports; llvm::DenseSet<StringRef> excludedSymbols; COFFLinkerContext &ctx; @@ -218,6 +217,9 @@ private: void parseSection(StringRef); void parseAligncomm(StringRef); +  // Parses an MS-DOS stub file +  void parseDosStub(StringRef path); + // Parses a string in the form of "[:<integer>]" void parseFunctionPadMin(llvm::opt::Arg *a); @@ -246,8 +248,6 @@ private: // Used for dllexported symbols. Export parseExport(StringRef arg); -  void fixupExports(); -  void assignExportOrdinals(); // Parses a string in the form of "key=value" and check // if value matches previous values for the key. diff --git lld/COFF/DriverUtils.cpp lld/COFF/DriverUtils.cpp index 1148be09fb10..4a70c826691d 100644 --- lld/COFF/DriverUtils.cpp +++ lld/COFF/DriverUtils.cpp @@ -246,6 +246,22 @@ void LinkerDriver::parseAligncomm(StringRef s) { std::max(ctx.config.alignComm[std::string(name)], 1 << v); } +void LinkerDriver::parseDosStub(StringRef path) { +  std::unique_ptr<MemoryBuffer> stub = +      CHECK(MemoryBuffer::getFile(path), "could not open " + path); +  size_t bufferSize = stub->getBufferSize(); +  const char *bufferStart = stub->getBufferStart(); +  // MS link.exe compatibility: +  // 1. stub must be greater than or equal to 64 bytes +  // 2. stub must start with a valid DOS signature 'MZ' +  if (bufferSize < 64) +    Err(ctx) << "/stub: stub must be greater than or equal to 64 bytes: " +             << path; +  if (bufferStart[0] != 'M' || bufferStart[1] != 'Z') +    Err(ctx) << "/stub: invalid DOS signature: " << path; +  ctx.config.dosStub = std::move(stub); +} + // Parses /functionpadmin option argument. void LinkerDriver::parseFunctionPadMin(llvm::opt::Arg *a) { StringRef arg = a->getNumValues() ? a->getValue() : ""; @@ -640,142 +656,6 @@ err: llvm_unreachable(""); } -// Convert stdcall/fastcall style symbols into unsuffixed symbols, -// with or without a leading underscore. (MinGW specific.) -static StringRef killAt(StringRef sym, bool prefix) { -  if (sym.empty()) -    return sym; -  // Strip any trailing stdcall suffix -  sym = sym.substr(0, sym.find('@', 1)); -  if (!sym.starts_with("@")) { -    if (prefix && !sym.starts_with("_")) -      return saver().save("_" + sym); -    return sym; -  } -  // For fastcall, remove the leading @ and replace it with an -  // underscore, if prefixes are used.
- sym = sym.substr(1); - if (prefix) - sym = saver().save("_" + sym); - return sym; -} - -static StringRef exportSourceName(ExportSource s) { - switch (s) { - case ExportSource::Directives: - return "source file (directives)"; - case ExportSource::Export: - return "/export"; - case ExportSource::ModuleDefinition: - return "/def"; - default: - llvm_unreachable("unknown ExportSource"); - } -} - -// Performs error checking on all /export arguments. -// It also sets ordinals. -void LinkerDriver::fixupExports() { - llvm::TimeTraceScope timeScope("Fixup exports"); - // Symbol ordinals must be unique. - std::set<uint16_t> ords; - for (Export &e : ctx.config.exports) { - if (e.ordinal == 0) - continue; - if (!ords.insert(e.ordinal).second) - Fatal(ctx) << "duplicate export ordinal: " << e.name; - } - - for (Export &e : ctx.config.exports) { - if (!e.exportAs.empty()) { - e.exportName = e.exportAs; - continue; - } - - StringRef sym = - !e.forwardTo.empty() || e.extName.empty() ? e.name : e.extName; - if (ctx.config.machine == I386 && sym.starts_with("_")) { - // In MSVC mode, a fully decorated stdcall function is exported - // as-is with the leading underscore (with type IMPORT_NAME). - // In MinGW mode, a decorated stdcall function gets the underscore - // removed, just like normal cdecl functions. - if (ctx.config.mingw || !sym.contains('@')) { - e.exportName = sym.substr(1); - continue; - } - } - if (isArm64EC(ctx.config.machine) && !e.data && !e.constant) { - if (std::optional<std::string> demangledName = - getArm64ECDemangledFunctionName(sym)) { - e.exportName = saver().save(*demangledName); - continue; - } - } - e.exportName = sym; - } - - if (ctx.config.killAt && ctx.config.machine == I386) { - for (Export &e : ctx.config.exports) { - e.name = killAt(e.name, true); - e.exportName = killAt(e.exportName, false); - e.extName = killAt(e.extName, true); - e.symbolName = killAt(e.symbolName, true); - } - } - - // Uniquefy by name. - DenseMap<StringRef, std::pair<Export *, unsigned>> map( - ctx.config.exports.size()); - std::vector<Export> v; - for (Export &e : ctx.config.exports) { - auto pair = map.insert(std::make_pair(e.exportName, std::make_pair(&e, 0))); - bool inserted = pair.second; - if (inserted) { - pair.first->second.second = v.size(); - v.push_back(e); - continue; - } - Export *existing = pair.first->second.first; - if (e == *existing || e.name != existing->name) - continue; - // If the existing export comes from .OBJ directives, we are allowed to - // overwrite it with /DEF: or /EXPORT without any warning, as MSVC link.exe - // does. - if (existing->source == ExportSource::Directives) { - *existing = e; - v[pair.first->second.second] = e; - continue; - } - if (existing->source == e.source) { - Warn(ctx) << "duplicate " << exportSourceName(existing->source) - << " option: " << e.name; - } else { - Warn(ctx) << "duplicate export: " << e.name << " first seen in " - << exportSourceName(existing->source) << ", now in " - << exportSourceName(e.source); - } - } - ctx.config.exports = std::move(v); - - // Sort by name. - llvm::sort(ctx.config.exports, [](const Export &a, const Export &b) { - return a.exportName < b.exportName; - }); -} - -void LinkerDriver::assignExportOrdinals() { - // Assign unique ordinals if default (= 0). 
- uint32_t max = 0; - for (Export &e : ctx.config.exports) - max = std::max(max, (uint32_t)e.ordinal); - for (Export &e : ctx.config.exports) - if (e.ordinal == 0) - e.ordinal = ++max; - if (max > std::numeric_limits<uint16_t>::max()) - Fatal(ctx) << "too many exported symbols (got " << max << ", max " - << Twine(std::numeric_limits<uint16_t>::max()) << ")"; -} - // Parses a string in the form of "key=value" and check // if value matches previous values for the same key. void LinkerDriver::checkFailIfMismatch(StringRef arg, InputFile *source) { diff --git lld/COFF/MapFile.cpp lld/COFF/MapFile.cpp index af87587d143d..eb98bb484f9f 100644 --- lld/COFF/MapFile.cpp +++ lld/COFF/MapFile.cpp @@ -326,7 +326,7 @@ void lld::coff::writeMapFile(COFFLinkerContext &ctx) { os << " Exports\n"; os << "\n"; os << " ordinal name\n\n"; - for (Export &e : ctx.config.exports) { + for (Export &e : ctx.symtab.exports) { os << format(" %7d", e.ordinal) << " " << e.name << "\n"; if (!e.extName.empty() && e.extName != e.name) os << " exported name: " << e.extName << "\n"; diff --git lld/COFF/SymbolTable.cpp lld/COFF/SymbolTable.cpp index bf965e8a2332..ecccc7d6ed70 100644 --- lld/COFF/SymbolTable.cpp +++ lld/COFF/SymbolTable.cpp @@ -1118,6 +1118,141 @@ void SymbolTable::addUndefinedGlob(StringRef arg) { addGCRoot(sym->getName()); } +// Convert stdcall/fastcall style symbols into unsuffixed symbols, +// with or without a leading underscore. (MinGW specific.) +static StringRef killAt(StringRef sym, bool prefix) { + if (sym.empty()) + return sym; + // Strip any trailing stdcall suffix + sym = sym.substr(0, sym.find('@', 1)); + if (!sym.starts_with("@")) { + if (prefix && !sym.starts_with("_")) + return saver().save("_" + sym); + return sym; + } + // For fastcall, remove the leading @ and replace it with an + // underscore, if prefixes are used. + sym = sym.substr(1); + if (prefix) + sym = saver().save("_" + sym); + return sym; +} + +static StringRef exportSourceName(ExportSource s) { + switch (s) { + case ExportSource::Directives: + return "source file (directives)"; + case ExportSource::Export: + return "/export"; + case ExportSource::ModuleDefinition: + return "/def"; + default: + llvm_unreachable("unknown ExportSource"); + } +} + +// Performs error checking on all /export arguments. +// It also sets ordinals. +void SymbolTable::fixupExports() { + llvm::TimeTraceScope timeScope("Fixup exports"); + // Symbol ordinals must be unique. + std::set<uint16_t> ords; + for (Export &e : exports) { + if (e.ordinal == 0) + continue; + if (!ords.insert(e.ordinal).second) + Fatal(ctx) << "duplicate export ordinal: " << e.name; + } + + for (Export &e : exports) { + if (!e.exportAs.empty()) { + e.exportName = e.exportAs; + continue; + } + + StringRef sym = + !e.forwardTo.empty() || e.extName.empty() ? e.name : e.extName; + if (machine == I386 && sym.starts_with("_")) { + // In MSVC mode, a fully decorated stdcall function is exported + // as-is with the leading underscore (with type IMPORT_NAME). + // In MinGW mode, a decorated stdcall function gets the underscore + // removed, just like normal cdecl functions. 
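+      // For example, "_foo" is exported as "foo" in both modes, while a + // decorated "_foo@4" becomes "foo@4" in MinGW mode but stays "_foo@4" in + // MSVC mode.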
+      if (ctx.config.mingw || !sym.contains('@')) { +        e.exportName = sym.substr(1); +        continue; +      } +    } +    if (isEC() && !e.data && !e.constant) { +      if (std::optional<std::string> demangledName = +              getArm64ECDemangledFunctionName(sym)) { +        e.exportName = saver().save(*demangledName); +        continue; +      } +    } +    e.exportName = sym; +  } + +  if (ctx.config.killAt && machine == I386) { +    for (Export &e : exports) { +      e.name = killAt(e.name, true); +      e.exportName = killAt(e.exportName, false); +      e.extName = killAt(e.extName, true); +      e.symbolName = killAt(e.symbolName, true); +    } +  } + +  // Uniquify by name. +  DenseMap<StringRef, std::pair<Export *, unsigned>> map(exports.size()); +  std::vector<Export> v; +  for (Export &e : exports) { +    auto pair = map.insert(std::make_pair(e.exportName, std::make_pair(&e, 0))); +    bool inserted = pair.second; +    if (inserted) { +      pair.first->second.second = v.size(); +      v.push_back(e); +      continue; +    } +    Export *existing = pair.first->second.first; +    if (e == *existing || e.name != existing->name) +      continue; +    // If the existing export comes from .OBJ directives, we are allowed to +    // overwrite it with /DEF: or /EXPORT without any warning, as MSVC link.exe +    // does. +    if (existing->source == ExportSource::Directives) { +      *existing = e; +      v[pair.first->second.second] = e; +      continue; +    } +    if (existing->source == e.source) { +      Warn(ctx) << "duplicate " << exportSourceName(existing->source) +                << " option: " << e.name; +    } else { +      Warn(ctx) << "duplicate export: " << e.name << " first seen in " +                << exportSourceName(existing->source) << ", now in " +                << exportSourceName(e.source); +    } +  } +  exports = std::move(v); + +  // Sort by name. +  llvm::sort(exports, [](const Export &a, const Export &b) { +    return a.exportName < b.exportName; +  }); +} + +void SymbolTable::assignExportOrdinals() { +  // Assign unique ordinals if default (= 0). +  uint32_t max = 0; +  for (Export &e : exports) +    max = std::max(max, (uint32_t)e.ordinal); +  for (Export &e : exports) +    if (e.ordinal == 0) +      e.ordinal = ++max; +  if (max > std::numeric_limits<uint16_t>::max()) +    Fatal(ctx) << "too many exported symbols (got " << max << ", max " +               << Twine(std::numeric_limits<uint16_t>::max()) << ")"; +} + Symbol *SymbolTable::addUndefined(StringRef name) { return addUndefined(name, nullptr, false); } diff --git lld/COFF/SymbolTable.h lld/COFF/SymbolTable.h index 66bca0d63e5f..a0acf5db4690 100644 --- lld/COFF/SymbolTable.h +++ lld/COFF/SymbolTable.h @@ -150,6 +150,14 @@ public: // A list of EC EXP+ symbols. std::vector<Symbol *> expSymbols; +  // A list of DLL exports. +  std::vector<Export> exports; +  llvm::DenseSet<StringRef> directivesExports; +  bool hadExplicitExports; + +  void fixupExports(); +  void assignExportOrdinals(); + // Iterates symbols in non-determinstic hash table order. template <typename T> void forEachSymbol(T callback) { for (auto &pair : symMap) diff --git lld/COFF/Symbols.cpp lld/COFF/Symbols.cpp index 148822fdb68f..fce50d41a663 100644 --- lld/COFF/Symbols.cpp +++ lld/COFF/Symbols.cpp @@ -100,7 +100,6 @@ bool Symbol::isLive() const { return true; } -// MinGW specific.
void Symbol::replaceKeepingName(Symbol *other, size_t size) { StringRef origName = getName(); memcpy(this, other, size); diff --git lld/COFF/Writer.cpp lld/COFF/Writer.cpp index 8247f131dcf0..3d95d219a493 100644 --- lld/COFF/Writer.cpp +++ lld/COFF/Writer.cpp @@ -76,14 +76,8 @@ static unsigned char dosProgram[] = { }; static_assert(sizeof(dosProgram) % 8 == 0, "DOSProgram size must be multiple of 8"); - -static const int dosStubSize = sizeof(dos_header) + sizeof(dosProgram); -static_assert(dosStubSize % 8 == 0, "DOSStub size must be multiple of 8"); -static const uint32_t coffHeaderOffset = dosStubSize + sizeof(PEMagic); -static const uint32_t peHeaderOffset = - coffHeaderOffset + sizeof(coff_file_header); -static const uint32_t dataDirOffset64 = - peHeaderOffset + sizeof(pe32plus_header); +static_assert((sizeof(dos_header) + sizeof(dosProgram)) % 8 == 0, + "DOSStub size must be multiple of 8"); static const int numberOfDataDirectory = 16; @@ -210,10 +204,11 @@ struct ChunkRange { class Writer { public: Writer(COFFLinkerContext &c) - : buffer(c.e.outputBuffer), delayIdata(c), edata(c), ctx(c) {} + : buffer(c.e.outputBuffer), delayIdata(c), ctx(c) {} void run(); private: + void calculateStubDependentSizes(); void createSections(); void createMiscChunks(); void createImportTables(); @@ -298,7 +293,6 @@ private: Chunk *iatStart = nullptr; uint64_t iatSize = 0; DelayLoadContents delayIdata; - EdataContents edata; bool setNoSEHCharacteristic = false; uint32_t tlsAlignment = 0; @@ -315,6 +309,11 @@ private: uint64_t sizeOfImage; uint64_t sizeOfHeaders; + uint32_t dosStubSize; + uint32_t coffHeaderOffset; + uint32_t peHeaderOffset; + uint32_t dataDirOffset64; + OutputSection *textSec; OutputSection *hexpthkSec; OutputSection *rdataSec; @@ -728,10 +727,8 @@ void Writer::writePEChecksum() { uint32_t *buf = (uint32_t *)buffer->getBufferStart(); uint32_t size = (uint32_t)(buffer->getBufferSize()); - coff_file_header *coffHeader = - (coff_file_header *)((uint8_t *)buf + dosStubSize + sizeof(PEMagic)); - pe32_header *peHeader = - (pe32_header *)((uint8_t *)coffHeader + sizeof(coff_file_header)); + pe32_header *peHeader = (pe32_header *)((uint8_t *)buf + coffHeaderOffset + + sizeof(coff_file_header)); uint64_t sum = 0; uint32_t count = size; @@ -762,6 +759,7 @@ void Writer::run() { llvm::TimeTraceScope timeScope("Write PE"); ScopedTimer t1(ctx.codeLayoutTimer); + calculateStubDependentSizes(); if (ctx.config.machine == ARM64X) ctx.dynamicRelocs = make<DynamicRelocsChunk>(); createImportTables(); @@ -1035,6 +1033,17 @@ void Writer::sortSections() { sortBySectionOrder(it.second->chunks); } +void Writer::calculateStubDependentSizes() { + if (ctx.config.dosStub) + dosStubSize = alignTo(ctx.config.dosStub->getBufferSize(), 8); + else + dosStubSize = sizeof(dos_header) + sizeof(dosProgram); + + coffHeaderOffset = dosStubSize + sizeof(PEMagic); + peHeaderOffset = coffHeaderOffset + sizeof(coff_file_header); + dataDirOffset64 = peHeaderOffset + sizeof(pe32plus_header); +} + // Create output section objects and add them to OutputSections. void Writer::createSections() { llvm::TimeTraceScope timeScope("Output sections"); @@ -1322,10 +1331,12 @@ void Writer::createExportTable() { if (!edataSec->chunks.empty()) { // Allow using a custom built export table from input object files, instead // of having the linker synthesize the tables. 
-    if (ctx.config.hadExplicitExports) +    if (ctx.symtab.hadExplicitExports) Warn(ctx) << "literal .edata sections override exports"; -  } else if (!ctx.config.exports.empty()) { -    for (Chunk *c : edata.chunks) +  } else if (!ctx.symtab.exports.empty()) { +    std::vector<Chunk *> edataChunks; +    createEdataChunks(ctx.symtab, edataChunks); +    for (Chunk *c : edataChunks) edataSec->addChunk(c); } if (!edataSec->chunks.empty()) { @@ -1333,7 +1344,7 @@ edataEnd = edataSec->chunks.back(); } // Warn on exported deleting destructor. -  for (auto e : ctx.config.exports) +  for (auto e : ctx.symtab.exports) if (e.sym && e.sym->getName().starts_with("??_G")) Warn(ctx) << "export of deleting dtor: " << e.sym; } @@ -1668,21 +1679,37 @@ template <typename PEHeaderTy> void Writer::writeHeader() { // When run under Windows, the loader looks at AddressOfNewExeHeader and uses // the PE header instead. Configuration *config = &ctx.config; + uint8_t *buf = buffer->getBufferStart(); auto *dos = reinterpret_cast<dos_header *>(buf); -  buf += sizeof(dos_header); -  dos->Magic[0] = 'M'; -  dos->Magic[1] = 'Z'; -  dos->UsedBytesInTheLastPage = dosStubSize % 512; -  dos->FileSizeInPages = divideCeil(dosStubSize, 512); -  dos->HeaderSizeInParagraphs = sizeof(dos_header) / 16; - -  dos->AddressOfRelocationTable = sizeof(dos_header); -  dos->AddressOfNewExeHeader = dosStubSize; // Write DOS program. -  memcpy(buf, dosProgram, sizeof(dosProgram)); -  buf += sizeof(dosProgram); +  if (config->dosStub) { +    memcpy(buf, config->dosStub->getBufferStart(), +           config->dosStub->getBufferSize()); +    // MS link.exe accepts an invalid `e_lfanew` (AddressOfNewExeHeader) and +    // updates it automatically. Replicate the same behaviour. +    dos->AddressOfNewExeHeader = alignTo(config->dosStub->getBufferSize(), 8); +    // Unlike MS link.exe, LLD accepts non-8-byte-aligned stubs. +    // In that case, we add the zero padding ourselves. +    buf += alignTo(config->dosStub->getBufferSize(), 8); +  } else { +    buf += sizeof(dos_header); +    dos->Magic[0] = 'M'; +    dos->Magic[1] = 'Z'; +    dos->UsedBytesInTheLastPage = dosStubSize % 512; +    dos->FileSizeInPages = divideCeil(dosStubSize, 512); +    dos->HeaderSizeInParagraphs = sizeof(dos_header) / 16; + +    dos->AddressOfRelocationTable = sizeof(dos_header); +    dos->AddressOfNewExeHeader = dosStubSize; + +    memcpy(buf, dosProgram, sizeof(dosProgram)); +    buf += sizeof(dosProgram); +  } + +  // Make sure the DOS stub is aligned to 8 bytes at this point. + assert((buf - buffer->getBufferStart()) % 8 == 0); // Write PE magic memcpy(buf, PEMagic, sizeof(PEMagic)); @@ -2034,11 +2061,11 @@ void Writer::createGuardCFTables() { ctx.forEachSymtab([&](SymbolTable &symtab) { if (symtab.entry) maybeAddAddressTakenFunction(addressTakenSyms, symtab.entry); -  }); -  // Mark exported symbols in executable sections as address-taken. -  for (Export &e : config->exports) -    maybeAddAddressTakenFunction(addressTakenSyms, e.sym); +    // Mark exported symbols in executable sections as address-taken. +    for (Export &e : symtab.exports) +      maybeAddAddressTakenFunction(addressTakenSyms, e.sym); +  }); // For each entry in the .giats table, check if it has a corresponding load // thunk (e.g. because the DLL that defines it will be delay-loaded) and, if @@ -2352,6 +2379,20 @@ void Writer::setECSymbols() { delayIatCopySym, "__hybrid_auxiliary_delayload_iat_copy", delayIdata.getAuxIatCopy().empty() ?
nullptr : delayIdata.getAuxIatCopy().front()); + + if (ctx.hybridSymtab) { + // For the hybrid image, set the alternate entry point to the EC entry + // point. In the hybrid view, it is swapped to the native entry point + // using ARM64X relocations. + if (auto altEntrySym = cast_or_null<Defined>(ctx.hybridSymtab->entry)) { + // If the entry is an EC export thunk, use its target instead. + if (auto thunkChunk = + dyn_cast<ECExportThunkChunk>(altEntrySym->getChunk())) + altEntrySym = thunkChunk->target; + symtab->findUnderscore("__arm64x_native_entrypoint") + ->replaceKeepingName(altEntrySym, sizeof(SymbolUnion)); + } + } } // Write section contents to a mmap'ed file. @@ -2586,12 +2627,23 @@ void Writer::createDynamicRelocs() { coffHeaderOffset + offsetof(coff_file_header, Machine), AMD64); - if (ctx.symtab.entry != ctx.hybridSymtab->entry) + if (ctx.symtab.entry != ctx.hybridSymtab->entry) { ctx.dynamicRelocs->add(IMAGE_DVRT_ARM64X_FIXUP_TYPE_VALUE, sizeof(uint32_t), peHeaderOffset + offsetof(pe32plus_header, AddressOfEntryPoint), cast_or_null<Defined>(ctx.hybridSymtab->entry)); + // Swap the alternate entry point in the CHPE metadata. + Symbol *s = ctx.hybridSymtab->findUnderscore("__chpe_metadata"); + if (auto chpeSym = cast_or_null<DefinedRegular>(s)) + ctx.dynamicRelocs->add( + IMAGE_DVRT_ARM64X_FIXUP_TYPE_VALUE, sizeof(uint32_t), + Arm64XRelocVal(chpeSym, offsetof(chpe_metadata, AlternateEntryPoint)), + cast_or_null<Defined>(ctx.symtab.entry)); + else + Warn(ctx) << "'__chpe_metadata' is missing for ARM64X target"; + } + // Set the hybrid load config to the EC load config. ctx.dynamicRelocs->add(IMAGE_DVRT_ARM64X_FIXUP_TYPE_VALUE, sizeof(uint32_t), dataDirOffset64 + diff --git lld/MachO/Options.td lld/MachO/Options.td index 39191af7dc16..4c89f96c3eba 100644 --- lld/MachO/Options.td +++ lld/MachO/Options.td @@ -133,7 +133,7 @@ def print_symbol_order_eq: Joined<["--"], "print-symbol-order=">, def irpgo_profile: Separate<["--"], "irpgo-profile">, Group<grp_lld>; def irpgo_profile_eq: Joined<["--"], "irpgo-profile=">, Alias<!cast<Separate>(irpgo_profile)>, MetaVarName<"<profile>">, - HelpText<"Read the IRPGO <profile> for use with -bp-startup-sort and other profile-guided optimizations">, + HelpText<"Read the IRPGO <profile> for use with --bp-startup-sort and other profile-guided optimizations">, Group<grp_lld>; def bp_startup_sort: Joined<["--"], "bp-startup-sort=">, MetaVarName<"[none,function]">, diff --git lld/test/COFF/Inputs/stub63mz lld/test/COFF/Inputs/stub63mz new file mode 100644 index 000000000000..2a8954d2d691 Binary files /dev/null and lld/test/COFF/Inputs/stub63mz differ diff --git lld/test/COFF/Inputs/stub64mz lld/test/COFF/Inputs/stub64mz new file mode 100644 index 000000000000..aaeb005adb54 Binary files /dev/null and lld/test/COFF/Inputs/stub64mz differ diff --git lld/test/COFF/Inputs/stub64zz lld/test/COFF/Inputs/stub64zz new file mode 100644 index 000000000000..fa58df18aabe Binary files /dev/null and lld/test/COFF/Inputs/stub64zz differ diff --git lld/test/COFF/Inputs/stub68mz lld/test/COFF/Inputs/stub68mz new file mode 100644 index 000000000000..42b722594653 Binary files /dev/null and lld/test/COFF/Inputs/stub68mz differ diff --git lld/test/COFF/arm64x-entry.test lld/test/COFF/arm64x-entry.test index d5363c66544a..1c2e7e7a0c93 100644 --- lld/test/COFF/arm64x-entry.test +++ lld/test/COFF/arm64x-entry.test @@ -3,12 +3,14 @@ RUN: split-file %s %t.dir && cd %t.dir RUN: llvm-mc -filetype=obj -triple=arm64ec-windows arm64ec-dllmain.s -o arm64ec-dllmain.obj RUN: llvm-mc 
-filetype=obj -triple=aarch64-windows arm64-dllmain.s -o arm64-dllmain.obj +RUN: llvm-mc -filetype=obj -triple=x86_64-windows amd64-dllmain.s -o amd64-dllmain.obj RUN: llvm-mc -filetype=obj -triple=arm64ec-windows arm64ec-func.s -o arm64ec-func.obj RUN: llvm-mc -filetype=obj -triple=aarch64-windows arm64-func.s -o arm64-func.obj RUN: llvm-mc -filetype=obj -triple=arm64ec-windows arm64-drectve.s -o arm64ec-drectve.obj RUN: llvm-mc -filetype=obj -triple=aarch64-windows arm64-drectve.s -o arm64-drectve.obj RUN: llvm-mc -filetype=obj -triple=arm64ec-windows %S/Inputs/loadconfig-arm64ec.s -o loadconfig-arm64ec.obj RUN: llvm-mc -filetype=obj -triple=aarch64-windows %S/Inputs/loadconfig-arm64.s -o loadconfig-arm64.obj +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows loadconfig-min.s -o loadconfig-min.obj RUN: lld-link -machine:arm64x -dll -out:out.dll arm64ec-dllmain.obj arm64-dllmain.obj \ RUN: loadconfig-arm64.obj loadconfig-arm64ec.obj @@ -34,10 +36,12 @@ DISASM-NEXT: 180003009: e9 f2 ef ff ff jmp 0x180002000 <.text+ DISASM-NEXT: 18000300e: cc int3 DISASM-NEXT: 18000300f: cc int3 -RUN: llvm-readobj --headers out.dll | FileCheck --check-prefix=READOBJ %s +RUN: llvm-readobj --headers --coff-load-config out.dll | FileCheck --check-prefix=READOBJ %s READOBJ: AddressOfEntryPoint: 0x1000 +READOBJ: AlternateEntryPoint: 0x2000 READOBJ: HybridObject { READOBJ: AddressOfEntryPoint: 0x3000 +READOBJ: AlternateEntryPoint: 0x1000 READOBJ: } RUN: lld-link -machine:arm64x -dll -out:out2.dll arm64ec-func.obj arm64-func.obj \ @@ -55,6 +59,20 @@ RUN: loadconfig-arm64.obj loadconfig-arm64ec.obj -entry:func RUN: llvm-objdump -d out4.dll | FileCheck --check-prefix=DISASM %s RUN: llvm-readobj --headers --coff-load-config out4.dll | FileCheck --check-prefix=READOBJ %s +RUN: lld-link -machine:arm64x -dll -out:out-x86.dll amd64-dllmain.obj arm64-dllmain.obj \ +RUN: loadconfig-arm64.obj loadconfig-arm64ec.obj +RUN: llvm-readobj --headers --coff-load-config out-x86.dll | FileCheck --check-prefix=READOBJ-X86 %s +READOBJ-X86: AddressOfEntryPoint: 0x1000 +READOBJ-X86: AlternateEntryPoint: 0x2000 +READOBJ-X86: HybridObject { +READOBJ-X86: AddressOfEntryPoint: 0x2000 +READOBJ-X86: AlternateEntryPoint: 0x1000 +READOBJ-X86: } + +RUN: lld-link -machine:arm64x -dll -out:out-warn.dll arm64ec-dllmain.obj arm64-dllmain.obj \ +RUN: loadconfig-arm64.obj loadconfig-min.obj 2>&1 | FileCheck --check-prefix=WARN %s +WARN: lld-link: warning: '__chpe_metadata' is missing for ARM64X target + #--- arm64-dllmain.s .section .text,"xr",discard,_DllMainCRTStartup .globl _DllMainCRTStartup @@ -87,6 +105,56 @@ func: mov w0, #2 ret +#--- amd64-dllmain.s + .section .text,"xr",discard,_DllMainCRTStartup + .globl _DllMainCRTStartup + .p2align 2 +_DllMainCRTStartup: + movl $3, %eax + retq + #--- arm64-drectve.s .section .drectve .ascii "-entry:func" + +#--- loadconfig-min.s + .section .rdata,"dr" + .globl _load_config_used + .p2align 3, 0 +_load_config_used: + .word 0x140 + .fill 0xc4,1,0 + .xword chpe_metadata + .fill 0x70,1,0 + + .p2align 3, 0 +chpe_metadata: + .word 2 + .rva __hybrid_code_map + .word __hybrid_code_map_count + .rva __x64_code_ranges_to_entry_points + .rva __arm64x_redirection_metadata + .word 0 // __os_arm64x_dispatch_call_no_redirect + .word 0 // __os_arm64x_dispatch_ret + .word 0 // __os_arm64x_check_call + .word 0 // __os_arm64x_check_icall + .word 0 // __os_arm64x_check_icall_cfg + .rva __arm64x_native_entrypoint + .rva __hybrid_auxiliary_iat + .word __x64_code_ranges_to_entry_points_count + .word 
__arm64x_redirection_metadata_count + .word 0 // __os_arm64x_get_x64_information + .word 0 // __os_arm64x_set_x64_information + .rva __arm64x_extra_rfe_table + .word __arm64x_extra_rfe_table_size + .word 0 // __os_arm64x_dispatch_fptr + .rva __hybrid_auxiliary_iat_copy + .rva __hybrid_auxiliary_delayload_iat + .rva __hybrid_auxiliary_delayload_iat_copy + .word __hybrid_image_info_bitfield + .word 0 // __os_arm64x_helper3 + .word 0 // __os_arm64x_helper4 + .word 0 // __os_arm64x_helper5 + .word 0 // __os_arm64x_helper6 + .word 0 // __os_arm64x_helper7 + .word 0 // __os_arm64x_helper8 diff --git lld/test/COFF/arm64x-export.test lld/test/COFF/arm64x-export.test new file mode 100644 index 000000000000..e5d0307e570e --- /dev/null +++ lld/test/COFF/arm64x-export.test @@ -0,0 +1,121 @@ +REQUIRES: aarch64, x86 +RUN: split-file %s %t.dir && cd %t.dir + +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows arm64ec-func.s -o arm64ec-func.obj +RUN: llvm-mc -filetype=obj -triple=aarch64-windows arm64-func.s -o arm64-func.obj +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows func-drectve.s -o arm64ec-drectve.obj +RUN: llvm-mc -filetype=obj -triple=aarch64-windows func-drectve.s -o arm64-drectve.obj +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows %S/Inputs/loadconfig-arm64ec.s -o loadconfig-arm64ec.obj +RUN: llvm-mc -filetype=obj -triple=aarch64-windows %S/Inputs/loadconfig-arm64.s -o loadconfig-arm64.obj + + +# A command-line export applies only to EC exports. + +RUN: lld-link -machine:arm64x -dll -out:out-cmd.dll arm64ec-func.obj arm64-func.obj \ +RUN: loadconfig-arm64.obj loadconfig-arm64ec.obj -noentry -export:func + +RUN: llvm-objdump -d out-cmd.dll | FileCheck --check-prefix=DISASM-EC %s +DISASM-EC: Disassembly of section .text: +DISASM-EC-EMPTY: +DISASM-EC-NEXT: 0000000180001000 <.text>: +DISASM-EC-NEXT: 180001000: 52800040 mov w0, #0x2 // =2 +DISASM-EC-NEXT: 180001004: d65f03c0 ret +DISASM-EC-EMPTY: +DISASM-EC-NEXT: Disassembly of section .hexpthk: +DISASM-EC-EMPTY: +DISASM-EC-NEXT: 0000000180002000 <.hexpthk>: +DISASM-EC-NEXT: 180002000: 48 8b c4 movq %rsp, %rax +DISASM-EC-NEXT: 180002003: 48 89 58 20 movq %rbx, 0x20(%rax) +DISASM-EC-NEXT: 180002007: 55 pushq %rbp +DISASM-EC-NEXT: 180002008: 5d popq %rbp +DISASM-EC-NEXT: 180002009: e9 f2 ef ff ff jmp 0x180001000 <.text> +DISASM-EC-NEXT: 18000200e: cc int3 +DISASM-EC-NEXT: 18000200f: cc int3 + +RUN: llvm-readobj --headers --coff-exports out-cmd.dll | FileCheck --check-prefix=EXPORTS-EC %s +EXPORTS-EC: ExportTableRVA: 0x0 +EXPORTS-EC-NEXT: ExportTableSize: 0x0 +EXPORTS-EC-NOT: Name: func + +# Export using the EC .drectve section. + +RUN: lld-link -machine:arm64x -dll -out:out-drectve-ec.dll arm64ec-func.obj arm64-func.obj \ +RUN: loadconfig-arm64.obj loadconfig-arm64ec.obj arm64ec-drectve.obj -noentry +RUN: llvm-objdump -d out-drectve-ec.dll | FileCheck --check-prefix=DISASM-EC %s +RUN: llvm-readobj --headers --coff-exports out-drectve-ec.dll | FileCheck --check-prefix=EXPORTS-EC %s + +# Export using the native .drectve section. 
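+# (On ARM64X, each symbol table keeps its own export list, so an export listed +# only in the native .drectve should appear only in the native view's export +# table.)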
+ +RUN: lld-link -machine:arm64x -dll -out:out-drectve-native.dll arm64ec-func.obj arm64-func.obj \ +RUN: loadconfig-arm64.obj loadconfig-arm64ec.obj arm64-drectve.obj -noentry + +RUN: llvm-objdump -d out-drectve-native.dll | FileCheck --check-prefix=DISASM-NATIVE %s +DISASM-NATIVE: Disassembly of section .text: +DISASM-NATIVE-EMPTY: +DISASM-NATIVE-NEXT: 0000000180001000 <func>: +DISASM-NATIVE-NEXT: 180001000: 52800020 mov w0, #0x1 // =1 +DISASM-NATIVE-NEXT: 180001004: d65f03c0 ret + +RUN: llvm-readobj --headers --coff-exports out-drectve-native.dll | FileCheck --check-prefix=EXPORTS-NATIVE %s +EXPORTS-NATIVE: ExportTableRVA: 0x2{{.*}} +EXPORTS-NATIVE-NEXT: ExportTableSize: 0x4{{.*}} +EXPORTS-NATIVE: Export { +EXPORTS-NATIVE-NEXT: Ordinal: 1 +EXPORTS-NATIVE-NEXT: Name: func +EXPORTS-NATIVE-NEXT: RVA: 0x1000 +EXPORTS-NATIVE-NEXT: } + +# Export using both the native and EC .drectve sections. + +RUN: lld-link -machine:arm64x -dll -out:out-both.dll arm64ec-func.obj arm64-func.obj \ +RUN: loadconfig-arm64.obj loadconfig-arm64ec.obj arm64-drectve.obj arm64ec-drectve.obj -noentry + +RUN: llvm-objdump -d out-both.dll | FileCheck --check-prefix=DISASM-BOTH %s +DISASM-BOTH: Disassembly of section .text: +DISASM-BOTH-EMPTY: +DISASM-BOTH-NEXT: 0000000180001000 <func>: +DISASM-BOTH-NEXT: 180001000: 52800020 mov w0, #0x1 // =1 +DISASM-BOTH-NEXT: 180001004: d65f03c0 ret +DISASM-BOTH-NEXT: ... +DISASM-BOTH-NEXT: 180002000: 52800040 mov w0, #0x2 // =2 +DISASM-BOTH-NEXT: 180002004: d65f03c0 ret +DISASM-BOTH-EMPTY: +DISASM-BOTH-NEXT: Disassembly of section .hexpthk: +DISASM-BOTH-EMPTY: +DISASM-BOTH-NEXT: 0000000180003000 <.hexpthk>: +DISASM-BOTH-NEXT: 180003000: 48 8b c4 movq %rsp, %rax +DISASM-BOTH-NEXT: 180003003: 48 89 58 20 movq %rbx, 0x20(%rax) +DISASM-BOTH-NEXT: 180003007: 55 pushq %rbp +DISASM-BOTH-NEXT: 180003008: 5d popq %rbp +DISASM-BOTH-NEXT: 180003009: e9 f2 ef ff ff jmp 0x180002000 <func+0x1000> +DISASM-BOTH-NEXT: 18000300e: cc int3 +DISASM-BOTH-NEXT: 18000300f: cc int3 + +RUN: llvm-readobj --headers --coff-exports out-both.dll | FileCheck --check-prefix=EXPORTS-BOTH %s +EXPORTS-BOTH: ExportTableRVA: 0x4{{.*}} +EXPORTS-BOTH-NEXT: ExportTableSize: 0x4{{.*}} +EXPORTS-BOTH: Export { +EXPORTS-BOTH-NEXT: Ordinal: 1 +EXPORTS-BOTH-NEXT: Name: func +EXPORTS-BOTH-NEXT: RVA: 0x1000 +EXPORTS-BOTH-NEXT: } + +#--- arm64-func.s + .section .text,"xr",discard,func + .globl func + .p2align 2 +func: + mov w0, #1 + ret + +#--- arm64ec-func.s + .section .text,"xr",discard,func + .globl func + .p2align 2 +func: + mov w0, #2 + ret + +#--- func-drectve.s +.section .drectve + .ascii "-export:func" diff --git lld/test/COFF/stub.test lld/test/COFF/stub.test new file mode 100644 index 000000000000..84de6ed84c95 --- /dev/null +++ lld/test/COFF/stub.test @@ -0,0 +1,55 @@ +# RUN: yaml2obj %p/Inputs/ret42.yaml -o %t.obj + +# RUN: lld-link /out:%t.exe /entry:main /stub:%p/Inputs/stub64mz %t.obj +# RUN: llvm-readobj --file-headers %t.exe | FileCheck -check-prefix=CHECK1 %s + +CHECK1: Magic: MZ +CHECK1: UsedBytesInTheLastPage: 144 +CHECK1: FileSizeInPages: 3 +CHECK1: NumberOfRelocationItems: 0 +CHECK1: HeaderSizeInParagraphs: 4 +CHECK1: MinimumExtraParagraphs: 0 +CHECK1: MaximumExtraParagraphs: 65535 +CHECK1: InitialRelativeSS: 0 +CHECK1: InitialSP: 184 +CHECK1: Checksum: 0 +CHECK1: InitialIP: 0 +CHECK1: InitialRelativeCS: 0 +CHECK1: AddressOfRelocationTable: 64 +CHECK1: OverlayNumber: 0 +CHECK1: OEMid: 0 +CHECK1: OEMinfo: 0 +CHECK1: AddressOfNewExeHeader: 64 + +## Invalid DOS signature (must be `MZ`) +# RUN: not lld-link 
/out:%t.exe /entry:main /stub:%p/Inputs/stub64zz %t.obj 2>&1 | FileCheck -check-prefix=CHECK2 %s + +CHECK2: lld-link: error: /stub: invalid DOS signature: {{.*}} + +## Unlike the MS linker, we accept non-8-byte-aligned stubs and add the padding ourselves +# RUN: lld-link /out:%t.exe /entry:main /stub:%p/Inputs/stub68mz %t.obj +# RUN: llvm-readobj --file-headers %t.exe | FileCheck -check-prefix=CHECK3 %s + +CHECK3: Magic: MZ +CHECK3: UsedBytesInTheLastPage: 144 +CHECK3: FileSizeInPages: 3 +CHECK3: NumberOfRelocationItems: 0 +CHECK3: HeaderSizeInParagraphs: 4 +CHECK3: MinimumExtraParagraphs: 0 +CHECK3: MaximumExtraParagraphs: 65535 +CHECK3: InitialRelativeSS: 0 +CHECK3: InitialSP: 184 +CHECK3: Checksum: 0 +CHECK3: InitialIP: 0 +CHECK3: InitialRelativeCS: 0 +CHECK3: AddressOfRelocationTable: 64 +CHECK3: OverlayNumber: 0 +CHECK3: OEMid: 0 +CHECK3: OEMinfo: 0 +## 68 is unaligned and rounded up to 72 by LLD +CHECK3: AddressOfNewExeHeader: 72 + +## Too-small stub (must be at least 64 bytes long) and also unaligned +# RUN: not lld-link /out:%t.exe /entry:main /stub:%p/Inputs/stub63mz %t.obj 2>&1 | FileCheck -check-prefix=CHECK4 %s + +CHECK4: lld-link: error: /stub: stub must be greater than or equal to 64 bytes: {{.*}} diff --git lld/wasm/SyntheticSections.cpp lld/wasm/SyntheticSections.cpp index 715fba1ee6da..7fb44b9f0c00 100644 --- lld/wasm/SyntheticSections.cpp +++ lld/wasm/SyntheticSections.cpp @@ -594,7 +594,7 @@ void ElemSection::writeBody() { } writeInitExpr(os, initExpr); -    if (flags & WASM_ELEM_SEGMENT_MASK_HAS_ELEM_KIND) { +    if (flags & WASM_ELEM_SEGMENT_MASK_HAS_ELEM_DESC) { // We only write active function table initializers, for which the elem kind // is specified to be written as 0x00 and interpreted to mean "funcref". const uint8_t elemKind = 0; diff --git lldb/bindings/headers.swig lldb/bindings/headers.swig index c0dde905f986..5e7c54d1eb83 100644 --- lldb/bindings/headers.swig +++ lldb/bindings/headers.swig @@ -52,6 +52,7 @@ #include "lldb/API/SBProcess.h" #include "lldb/API/SBProcessInfo.h" #include "lldb/API/SBProcessInfoList.h" +#include "lldb/API/SBProgress.h" #include "lldb/API/SBQueue.h" #include "lldb/API/SBQueueItem.h" #include "lldb/API/SBReproducer.h" diff --git lldb/bindings/interface/SBProgressDocstrings.i lldb/bindings/interface/SBProgressDocstrings.i new file mode 100644 index 000000000000..2997fe619fcc --- /dev/null +++ lldb/bindings/interface/SBProgressDocstrings.i @@ -0,0 +1,14 @@ +%feature("docstring", +"A Progress indicator helper class. + +Any potentially long running sections of code in LLDB should report +progress so that clients are aware of delays that might appear during +debugging. Delays commonly include indexing debug information, parsing +symbol tables for object files, downloading symbols from remote +repositories, and many more things.
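+
+A rough usage sketch from Python (the title, detail string, and unit count
+here are illustrative):
+
+    progress = lldb.SBProgress('Doing work', 'Detail string', 100, debugger)
+    for i in range(100):
+        # ... perform one unit of work ...
+        progress.Increment(1, 'step %d' % i)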
+ +The Progress class helps make sure that progress is correctly reported +and will always send an initial progress update, updates when +Progress::Increment() is called, and also will make sure that a progress +completed update is reported even if the user doesn't explicitly cause one +to be sent.") lldb::SBProgress; diff --git lldb/bindings/interfaces.swig lldb/bindings/interfaces.swig index 8a6fed95f0b7..08df9a1a8d53 100644 --- lldb/bindings/interfaces.swig +++ lldb/bindings/interfaces.swig @@ -54,6 +54,7 @@ %include "./interface/SBPlatformDocstrings.i" %include "./interface/SBProcessDocstrings.i" %include "./interface/SBProcessInfoDocstrings.i" +%include "./interface/SBProgressDocstrings.i" %include "./interface/SBQueueDocstrings.i" %include "./interface/SBQueueItemDocstrings.i" %include "./interface/SBReproducerDocstrings.i" @@ -133,6 +134,7 @@ %include "lldb/API/SBProcess.h" %include "lldb/API/SBProcessInfo.h" %include "lldb/API/SBProcessInfoList.h" +%include "lldb/API/SBProgress.h" %include "lldb/API/SBQueue.h" %include "lldb/API/SBQueueItem.h" %include "lldb/API/SBReproducer.h" diff --git lldb/docs/use/formatting.rst lldb/docs/use/formatting.rst index 970bacfd8807..7b3f01eebc89 100644 --- lldb/docs/use/formatting.rst +++ lldb/docs/use/formatting.rst @@ -113,11 +113,11 @@ A complete list of currently supported format string variables is listed below: +---------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ``module.file.basename`` | The basename of the current module (shared library or executable) | +---------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| ``module.file.fullpath`` | The basename of the current module (shared library or executable) | +| ``module.file.fullpath`` | The path of the current module (shared library or executable) | +---------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ``process.file.basename`` | The basename of the file for the process | +---------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| ``process.file.fullpath`` | The fullname of the file for the process | +| ``process.file.fullpath`` | The path of the file for the process | 
+---------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ``process.id`` | The process ID native to the system on which the inferior runs. | +---------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ @@ -141,6 +141,10 @@ A complete list of currently supported format string variables is listed below: +---------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ``target.arch`` | The architecture of the current target | +---------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| ``target.file.basename`` | The basename of the current target | ++---------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| ``target.file.fullpath`` | The path of the current target | ++---------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ``script.target:python_func`` | Use a Python function to generate a piece of textual output | +---------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ``script.process:python_func`` | Use a Python function to generate a piece of textual output | diff --git lldb/include/lldb/API/SBDebugger.h lldb/include/lldb/API/SBDebugger.h index 787bd040dd15..eb371e33c495 100644 --- lldb/include/lldb/API/SBDebugger.h +++ lldb/include/lldb/API/SBDebugger.h @@ -203,7 +203,7 @@ public: lldb::SBCommandInterpreter GetCommandInterpreter(); void HandleCommand(const char *command); - + void RequestInterrupt(); void CancelInterruptRequest(); bool InterruptRequested(); @@ -517,6 +517,7 @@ private: friend class SBPlatform; friend class SBTarget; friend class SBTrace; + friend class SBProgress; lldb::SBTarget FindTargetWithLLDBProcess(const lldb::ProcessSP 
&processSP); diff --git lldb/include/lldb/API/SBProgress.h lldb/include/lldb/API/SBProgress.h new file mode 100644 index 000000000000..d2eaf0a743cb --- /dev/null +++ lldb/include/lldb/API/SBProgress.h @@ -0,0 +1,66 @@ +//===-- SBProgress.h --------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_API_SBPROGRESS_H +#define LLDB_API_SBPROGRESS_H + +#include "lldb/API/SBDebugger.h" +#include "lldb/API/SBDefines.h" + +namespace lldb { + +/// A Progress indicator helper class. +/// +/// Any potentially long running sections of code in LLDB should report +/// progress so that clients are aware of delays that might appear during +/// debugging. Delays commonly include indexing debug information, parsing +/// symbol tables for object files, downloading symbols from remote +/// repositories, and many more things. +/// +/// The Progress class helps make sure that progress is correctly reported +/// and will always send an initial progress update, updates when +/// Progress::Increment() is called, and also will make sure that a progress +/// completed update is reported even if the user doesn't explicitly cause one +/// to be sent. +class LLDB_API SBProgress { +public: + /// Construct a progress object with a title, details and a given debugger. + /// \param title + /// The title of the progress object. + /// \param details + /// The details of the progress object. + /// \param debugger + /// The debugger for this progress object to report to. + SBProgress(const char *title, const char *details, SBDebugger &debugger); + + /// Construct a progress object with a title, details, the total units of work + /// to be done, and a given debugger. + /// \param title + /// The title of the progress object. + /// \param details + /// The details of the progress object. + /// \param total_units + /// The total number of units of work to be done. + /// \param debugger + /// The debugger for this progress object to report to. 
+  SBProgress(const char *title, const char *details, uint64_t total_units, +             SBDebugger &debugger); + +  ~SBProgress(); + +  /// Increment the progress by the given number of completed units and, +  /// optionally, update the detail string for subsequent reports. +  /// \param amount +  ///   The number of units of work completed by this call. +  /// \param description +  ///   If non-null, the new detail string for this progress update. +  void Increment(uint64_t amount, const char *description = nullptr); + +protected: +  lldb_private::Progress &ref() const; + +private: +  std::unique_ptr<lldb_private::Progress> m_opaque_up; +}; // SBProgress +} // namespace lldb + +#endif // LLDB_API_SBPROGRESS_H diff --git lldb/include/lldb/Core/FormatEntity.h lldb/include/lldb/Core/FormatEntity.h index 36f6df4118c2..c9d5af1f3167 100644 --- lldb/include/lldb/Core/FormatEntity.h +++ lldb/include/lldb/Core/FormatEntity.h @@ -67,6 +67,7 @@ struct Entry { ScriptThread, ThreadInfo, TargetArch, +    TargetFile, ScriptTarget, ModuleFile, File, diff --git lldb/include/lldb/lldb-forward.h lldb/include/lldb/lldb-forward.h index d09edeeccaff..fc7456a4b9a3 100644 --- lldb/include/lldb/lldb-forward.h +++ lldb/include/lldb/lldb-forward.h @@ -233,6 +233,7 @@ class Symtab; class SyntheticChildren; class SyntheticChildrenFrontEnd; class SystemRuntime; +class Progress; class Target; class TargetList; class TargetProperties; diff --git lldb/source/API/CMakeLists.txt lldb/source/API/CMakeLists.txt index d8308841c05d..147b30f3b002 100644 --- lldb/source/API/CMakeLists.txt +++ lldb/source/API/CMakeLists.txt @@ -83,6 +83,7 @@ add_lldb_library(liblldb SHARED ${option_framework} SBModule.cpp SBModuleSpec.cpp SBPlatform.cpp +  SBProgress.cpp SBProcess.cpp SBProcessInfo.cpp SBProcessInfoList.cpp diff --git lldb/source/API/SBProgress.cpp lldb/source/API/SBProgress.cpp new file mode 100644 index 000000000000..d6ed5f0d15fc --- /dev/null +++ lldb/source/API/SBProgress.cpp @@ -0,0 +1,43 @@ +//===-- SBProgress.cpp -----------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "lldb/API/SBProgress.h" +#include "lldb/Core/Progress.h" +#include "lldb/Utility/Instrumentation.h" + +using namespace lldb; + +SBProgress::SBProgress(const char *title, const char *details, + SBDebugger &debugger) { + LLDB_INSTRUMENT_VA(this, title, details, debugger); + + m_opaque_up = std::make_unique<lldb_private::Progress>( + title, details, /*total=*/std::nullopt, debugger.get(), + /*minimum_report_time=*/std::nullopt, + lldb_private::Progress::Origin::eExternal); +} + +SBProgress::SBProgress(const char *title, const char *details, + uint64_t total_units, SBDebugger &debugger) { + LLDB_INSTRUMENT_VA(this, title, details, total_units, debugger); + + m_opaque_up = std::make_unique<lldb_private::Progress>( + title, details, total_units, debugger.get(), + /*minimum_report_time=*/std::nullopt, + lldb_private::Progress::Origin::eExternal); +} + +SBProgress::~SBProgress() = default; + +void SBProgress::Increment(uint64_t amount, const char *description) { + LLDB_INSTRUMENT_VA(amount, description); + + m_opaque_up->Increment(amount, description); +} + +lldb_private::Progress &SBProgress::ref() const { return *m_opaque_up; } diff --git lldb/source/Core/Debugger.cpp lldb/source/Core/Debugger.cpp index 6ceb209269c9..2df2aeb20aa2 100644 --- lldb/source/Core/Debugger.cpp +++ lldb/source/Core/Debugger.cpp @@ -1952,7 +1952,8 @@ lldb::thread_result_t Debugger::DefaultEventHandler() { listener_sp->StartListeningForEvents( &m_broadcaster, lldb::eBroadcastBitProgress | lldb::eBroadcastBitWarning | lldb::eBroadcastBitError | - lldb::eBroadcastSymbolChange); + lldb::eBroadcastSymbolChange | + lldb::eBroadcastBitExternalProgress); // Let the thread that spawned us know that we have started up and that we // are now listening to all required events so no events get missed diff --git lldb/source/Core/FormatEntity.cpp lldb/source/Core/FormatEntity.cpp index e13284832cf5..fb7043ac74b8 100644 --- lldb/source/Core/FormatEntity.cpp +++ lldb/source/Core/FormatEntity.cpp @@ -162,7 +162,9 @@ constexpr Definition g_thread_child_entries[] = { Definition("completed-expression", EntryType::ThreadCompletedExpression)}; constexpr Definition g_target_child_entries[] = { - Definition("arch", EntryType::TargetArch)}; + Definition("arch", EntryType::TargetArch), + Entry::DefinitionWithChildren("file", EntryType::TargetFile, + g_file_child_entries)}; #define _TO_STR2(_val) #_val #define _TO_STR(_val) _TO_STR2(_val) @@ -322,6 +324,7 @@ const char *FormatEntity::Entry::TypeToCString(Type t) { ENUM_TO_CSTR(ScriptThread); ENUM_TO_CSTR(ThreadInfo); ENUM_TO_CSTR(TargetArch); + ENUM_TO_CSTR(TargetFile); ENUM_TO_CSTR(ScriptTarget); ENUM_TO_CSTR(ModuleFile); ENUM_TO_CSTR(File); @@ -1469,6 +1472,17 @@ bool FormatEntity::Format(const Entry &entry, Stream &s, } return false; + case Entry::Type::TargetFile: + if (exe_ctx) { + if (Target *target = exe_ctx->GetTargetPtr()) { + if (Module *exe_module = target->GetExecutableModulePointer()) { + if (DumpFile(s, exe_module->GetFileSpec(), (FileKind)entry.number)) + return true; + } + } + } + return false; + case Entry::Type::ScriptTarget: if (exe_ctx) { Target *target = exe_ctx->GetTargetPtr(); diff --git lldb/source/Plugins/Process/Utility/LinuxSignals.cpp lldb/source/Plugins/Process/Utility/LinuxSignals.cpp index 3f25dbc6abbb..eaecc84df15d 100644 --- lldb/source/Plugins/Process/Utility/LinuxSignals.cpp +++ 
lldb/source/Plugins/Process/Utility/LinuxSignals.cpp @@ -20,6 +20,9 @@ #ifndef SEGV_MTESERR #define SEGV_MTESERR 9 #endif +#ifndef SEGV_CPERR +#define SEGV_CPERR 10 +#endif #define ADD_SIGCODE(signal_name, signal_value, code_name, code_value, ...) \ static_assert(signal_name == signal_value, \ @@ -82,6 +85,7 @@ void LinuxSignals::Reset() { ADD_SIGCODE(SIGSEGV, 11, SEGV_BNDERR, 3, "failed address bounds checks", SignalCodePrintOption::Bounds); ADD_SIGCODE(SIGSEGV, 11, SEGV_MTEAERR, 8, "async tag check fault"); ADD_SIGCODE(SIGSEGV, 11, SEGV_MTESERR, 9, "sync tag check fault", SignalCodePrintOption::Address); + ADD_SIGCODE(SIGSEGV, 11, SEGV_CPERR, 10, "control protection fault"); // Some platforms will occasionally send nonstandard spurious SI_KERNEL // codes. One way to get this is via unaligned SIMD loads. Treat it as invalid address. ADD_SIGCODE(SIGSEGV, 11, SI_KERNEL, 0x80, "invalid address", SignalCodePrintOption::Address); diff --git lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp index fb3af44abfa8..68fa1d13943a 100644 --- lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp +++ lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp @@ -3127,7 +3127,6 @@ size_t DWARFASTParserClang::ParseChildParameters( } } - bool skip = false; if (is_artificial) { // In order to determine if a C++ member function is "const" we // have to look at the const-ness of "this"... @@ -3150,10 +3149,7 @@ size_t DWARFASTParserClang::ParseChildParameters( } } } - skip = true; - } - - if (!skip) { + } else { Type *type = die.ResolveTypeUID(param_type_die_form.Reference()); if (type) { function_param_types.push_back(type->GetForwardCompilerType()); diff --git lldb/test/API/commands/expression/import-std-module/vector-dbg-info-content/TestDbgInfoContentVectorFromStdModule.py lldb/test/API/commands/expression/import-std-module/vector-dbg-info-content/TestDbgInfoContentVectorFromStdModule.py index 1c32222e64f1..759077302bfc 100644 --- lldb/test/API/commands/expression/import-std-module/vector-dbg-info-content/TestDbgInfoContentVectorFromStdModule.py +++ lldb/test/API/commands/expression/import-std-module/vector-dbg-info-content/TestDbgInfoContentVectorFromStdModule.py @@ -23,13 +23,6 @@ class TestDbgInfoContentVector(TestBase): self.runCmd("settings set target.import-std-module true") - if self.expectedCompiler(["clang"]) and self.expectedCompilerVersion( - [">", "16.0"] - ): - vector_type = "std::vector<Foo>" - else: - vector_type = "std::vector<Foo, std::allocator<Foo> >" - size_type = "size_type" value_type = "value_type" iterator = "iterator" @@ -41,13 +34,14 @@ class TestDbgInfoContentVector(TestBase): ValueCheck(name="current"), ] - self.expect_expr( - "a", - result_type=vector_type, - result_children=[ - ValueCheck(children=[ValueCheck(value="3")]), - ValueCheck(children=[ValueCheck(value="1")]), - ValueCheck(children=[ValueCheck(value="2")]), + self.expect( + "expr a", + patterns=[ + """\(std::vector<Foo(, std::allocator<Foo> )*>\) \$0 = size=3 \{ + \[0\] = \(a = 3\) + \[1\] = \(a = 1\) + \[2\] = \(a = 2\) +\}""" ], ) diff --git lldb/test/API/commands/expression/import-std-module/vector-of-vectors/TestVectorOfVectorsFromStdModule.py lldb/test/API/commands/expression/import-std-module/vector-of-vectors/TestVectorOfVectorsFromStdModule.py index a1f33271f39d..e18785ec1359 100644 --- lldb/test/API/commands/expression/import-std-module/vector-of-vectors/TestVectorOfVectorsFromStdModule.py +++ 
lldb/test/API/commands/expression/import-std-module/vector-of-vectors/TestVectorOfVectorsFromStdModule.py @@ -17,42 +17,26 @@ class TestVectorOfVectors(TestBase): self, "// Set break point at this line.", lldb.SBFileSpec("main.cpp") ) - if self.expectedCompiler(["clang"]) and self.expectedCompilerVersion( - [">", "16.0"] - ): - vector_type = "std::vector<int>" - vector_of_vector_type = "std::vector<std::vector<int> >" - else: - vector_type = "std::vector<int>" - vector_of_vector_type = ( - "std::vector<std::vector<int>, std::allocator<std::vector<int> > >" - ) - size_type = "size_type" value_type = "value_type" self.runCmd("settings set target.import-std-module true") - self.expect_expr( - "a", - result_type=vector_of_vector_type, - result_children=[ - ValueCheck( - type=vector_type, - children=[ - ValueCheck(value="1"), - ValueCheck(value="2"), - ValueCheck(value="3"), - ], - ), - ValueCheck( - type=vector_type, - children=[ - ValueCheck(value="3"), - ValueCheck(value="2"), - ValueCheck(value="1"), - ], - ), + self.expect( + "expr a", + patterns=[ + """\(std::vector<std::vector<int>(, std::allocator<std::vector<int> )* >\) \$0 = size=2 \{ + \[0\] = size=3 \{ + \[0\] = 1 + \[1\] = 2 + \[2\] = 3 + \} + \[1\] = size=3 \{ + \[0\] = 3 + \[1\] = 2 + \[2\] = 1 + \} +\}""" ], ) self.expect_expr("a.size()", result_type=size_type, result_value="2") diff --git lldb/test/API/linux/aarch64/gcs/TestAArch64LinuxGCS.py lldb/test/API/linux/aarch64/gcs/TestAArch64LinuxGCS.py index b425c9e548ee..0928ff8e14e0 100644 --- lldb/test/API/linux/aarch64/gcs/TestAArch64LinuxGCS.py +++ lldb/test/API/linux/aarch64/gcs/TestAArch64LinuxGCS.py @@ -61,3 +61,25 @@ class AArch64LinuxGCSTestCase(TestBase): # Note that we must let the debugee get killed here as it cannot exit # cleanly if GCS was manually enabled. + + @skipUnlessArch("aarch64") + @skipUnlessPlatform(["linux"]) + def test_gcs_fault(self): + if not self.isAArch64GCS(): + self.skipTest("Target must support GCS.") + + self.build() + self.runCmd("file " + self.getBuildArtifact("a.out"), CURRENT_EXECUTABLE_SET) + self.runCmd("run", RUN_SUCCEEDED) + + if self.process().GetState() == lldb.eStateExited: + self.fail("Test program failed to run.") + + self.expect( + "thread list", + "Expected stopped by SIGSEGV.", + substrs=[ + "stopped", + "stop reason = signal SIGSEGV: control protection fault", + ], + ) diff --git lldb/test/API/linux/aarch64/gcs/main.c lldb/test/API/linux/aarch64/gcs/main.c index 9633ed2838f9..32a9b07c2074 100644 --- lldb/test/API/linux/aarch64/gcs/main.c +++ lldb/test/API/linux/aarch64/gcs/main.c @@ -36,6 +36,19 @@ unsigned long get_gcs_status() { return mode; } +void gcs_signal() { + // If we enabled GCS manually, then we could just return from main to generate + // a signal. However, if the C library enabled it, then we'd just exit + // normally. Assume the latter, and try to return to some bogus address to + // generate the signal. + __asm__ __volatile__( + // Corrupt the link register. This could be many numbers but 16 is a + // nicely aligned value that is unlikely to result in a fault because the + // PC is misaligned, which would hide the GCS fault. + "add x30, x30, #16\n" + "ret\n"); +} + int main() { if (!(getauxval(AT_HWCAP2) & HWCAP2_GCS)) return 1; @@ -50,5 +63,7 @@ int main() { } // By now we should have one memory region where the GCS is stored. - return 0; // Set break point at this line. + gcs_signal(); // Set break point at this line.
+ + return 0; } diff --git lldb/test/API/python_api/sbprogress/TestSBProgress.py lldb/test/API/python_api/sbprogress/TestSBProgress.py new file mode 100644 index 000000000000..c456247da80c --- /dev/null +++ lldb/test/API/python_api/sbprogress/TestSBProgress.py @@ -0,0 +1,35 @@ +"""Test the SBProgress API.""" + +import lldb +from lldbsuite.test.lldbtest import * + + +class SBProgressTestCase(TestBase): + def test_with_external_bit_set(self): + """Test SBProgress events are listened to when the external bit is set.""" + + progress = lldb.SBProgress("Test SBProgress", "Test progress", self.dbg) + listener = lldb.SBListener("Test listener") + broadcaster = self.dbg.GetBroadcaster() + broadcaster.AddListener(listener, lldb.eBroadcastBitExternalProgress) + event = lldb.SBEvent() + + expected_string = "Test progress first increment" + progress.Increment(1, expected_string) + self.assertTrue(listener.PeekAtNextEvent(event)) + stream = lldb.SBStream() + event.GetDescription(stream) + self.assertIn(expected_string, stream.GetData()) + + def test_without_external_bit_set(self): + """Test SBProgress events are not listened to on the internal progress bit.""" + + progress = lldb.SBProgress("Test SBProgress", "Test progress", self.dbg) + listener = lldb.SBListener("Test listener") + broadcaster = self.dbg.GetBroadcaster() + broadcaster.AddListener(listener, lldb.eBroadcastBitProgress) + event = lldb.SBEvent() + + expected_string = "Test progress first increment" + progress.Increment(1, expected_string) + self.assertFalse(listener.PeekAtNextEvent(event)) diff --git lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py index f4f30b6677e5..580ad38ab51c 100644 --- lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py +++ lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py @@ -672,6 +672,7 @@ class TestDAP_variables(lldbdap_testcase.DAPTestCaseBase): self.do_test_indexedVariables(enableSyntheticChildDebugging=True) @skipIfWindows + @skipIfAsan # FIXME this fails with a non-asan issue on green dragon. 
def test_registers(self): """ Test that registers whose byte size is the size of a pointer on diff --git lldb/unittests/Core/FormatEntityTest.cpp lldb/unittests/Core/FormatEntityTest.cpp index 0a68c9340b77..5983c9de99ef 100644 --- lldb/unittests/Core/FormatEntityTest.cpp +++ lldb/unittests/Core/FormatEntityTest.cpp @@ -148,6 +148,9 @@ constexpr llvm::StringRef lookupStrings[] = { "${thread.return-value}", "${thread.completed-expression}", "${target.arch}", + "${target.file.basename}", + "${target.file.dirname}", + "${target.file.fullpath}", "${var.dummy-var-to-test-wildcard}"}; TEST(FormatEntity, LookupAllEntriesInTree) { diff --git llvm/CMakeLists.txt llvm/CMakeLists.txt index f14065ab0379..ad12100fdb5b 100644 --- llvm/CMakeLists.txt +++ llvm/CMakeLists.txt @@ -486,6 +486,7 @@ set(LLVM_ALL_TARGETS PowerPC RISCV Sparc + SPIRV SystemZ VE WebAssembly @@ -498,7 +499,6 @@ set(LLVM_ALL_EXPERIMENTAL_TARGETS CSKY DirectX M68k - SPIRV Xtensa ) diff --git llvm/Maintainers.md llvm/Maintainers.md index 10714b508ca6..534d81e68d02 100644 --- llvm/Maintainers.md +++ llvm/Maintainers.md @@ -87,6 +87,11 @@ flo@fhahn.com (email), [fhahn](https://github.com/fhahn) (GitHub) Alina Sbirlea \ asbirlea@google.com (email), [alinas](https://github.com/alinas) (GitHub) +#### LoopInterchange + +Madhur Amilkanthwar \ +madhura@nvidia.com (email), [madhur13490](https://github.com/madhur13490) (GitHub) + #### SandboxVectorizer Vasileios Porpodas \ @@ -279,8 +284,11 @@ koachan@protonmail.com (email), [koachan](https://github.com/koachan) (GitHub) #### SPIRV backend -Ilia Diachkov \ -ilia.diachkov@gmail.com (email), [iliya-diyachkov](https://github.com/iliya-diyachkov) (GitHub) +Vyacheslav Levytskyy \ +vyacheslav.levytskyy@intel.com, vyacheslav.levytskyy@gmail.com (email), [VyacheslavLevytskyy](https://github.com/VyacheslavLevytskyy) (GitHub) + +Nathan Gauër \ +brioche@google.com (email), [Keenuts](https://github.com/Keenuts) (GitHub) #### SystemZ backend @@ -328,10 +336,10 @@ jakub@nod-labs.com (email), [kuhar](https://github.com/kuhar) (GitHub) Peter Collingbourne \ peter@pcc.me.uk (email), [pcc](https://github.com/pcc) (GitHub) -#### CMake and library layering +#### CMake -Chandler Carruth \ -chandlerc@gmail.com, chandlerc@google.com (email), [chandlerc](https://github.com/chandlerc) (GitHub) +Petr Hosek \ +phosek@google.com (email), [petrhosek](https://github.com/petrhosek) (GitHub) #### Debug info and DWARF @@ -351,6 +359,11 @@ echristo@gmail.com (email), [echristo](https://github.com/echristo) (GitHub) Teresa Johnson \ tejohnson@google.com (email), [teresajohnson](https://github.com/teresajohnson) (GitHub) +#### Library layering + +Takumi Nakamura \ +geek4civic@gmail.com (email), [chapuni](https://github.com/chapuni) (GitHub) + #### MCJIT, Orc, RuntimeDyld, PerfJITEvents Lang Hames \ @@ -472,7 +485,7 @@ sabre@nondot.org (email), [lattner](https://github.com/lattner) (GitHub), clattn Paul C. 
Anagnostopoulos (paul@windfall.com, [Paul-C-Anagnostopoulos](https://github.com/Paul-C-Anagnostopoulos)) -- TableGen \ Justin Bogner (mail@justinbogner.com, [bogner](https://github.com/bogner)) -- SelectionDAG \ -Chandler Carruth (chandlerc@gmail.com, chandlerc@google.com, [chandlerc](https://github.com/chandlerc)) -- ADT, Support, Inlining \ +Chandler Carruth (chandlerc@gmail.com, chandlerc@google.com, [chandlerc](https://github.com/chandlerc)) -- ADT, Support, Inlining, CMake and library layering \ Peter Collingbourne (peter@pcc.me.uk, [pcc](https://github.com/pcc)) -- LTO \ Evan Cheng (evan.cheng@apple.com) -- Parts of code generator not covered by someone else \ Jake Ehrlich (jakehehrlich@google.com, [jakehehrlich](https://github.com/jakehehrlich)) -- llvm-objcopy and ObjCopy library \ diff --git llvm/bindings/ocaml/debuginfo/debuginfo_ocaml.c llvm/bindings/ocaml/debuginfo/debuginfo_ocaml.c index ba4a9cbde37a..023ebd6d60cd 100644 --- llvm/bindings/ocaml/debuginfo/debuginfo_ocaml.c +++ llvm/bindings/ocaml/debuginfo/debuginfo_ocaml.c @@ -616,9 +616,10 @@ value llvm_dibuild_create_member_pointer_type_bytecode(value *argv, int argn) { ); } -value llvm_dibuild_create_object_pointer_type(value Builder, value Type) { +value llvm_dibuild_create_object_pointer_type(value Builder, value Type, + value Implicit) { LLVMMetadataRef Metadata = LLVMDIBuilderCreateObjectPointerType( - DIBuilder_val(Builder), Metadata_val(Type)); + DIBuilder_val(Builder), Metadata_val(Type), Bool_val(Implicit)); return to_val(Metadata); } diff --git llvm/bindings/ocaml/debuginfo/llvm_debuginfo.ml llvm/bindings/ocaml/debuginfo/llvm_debuginfo.ml index 8bb5edb17a2c..1b882d94a30b 100644 --- llvm/bindings/ocaml/debuginfo/llvm_debuginfo.ml +++ llvm/bindings/ocaml/debuginfo/llvm_debuginfo.ml @@ -398,7 +398,7 @@ external dibuild_create_member_pointer_type : = "llvm_dibuild_create_member_pointer_type_bytecode" "llvm_dibuild_create_member_pointer_type_native" external dibuild_create_object_pointer_type : - lldibuilder -> Llvm.llmetadata -> Llvm.llmetadata + lldibuilder -> Llvm.llmetadata -> implicit:bool -> Llvm.llmetadata = "llvm_dibuild_create_object_pointer_type" external dibuild_create_qualified_type : diff --git llvm/bindings/ocaml/debuginfo/llvm_debuginfo.mli llvm/bindings/ocaml/debuginfo/llvm_debuginfo.mli index 7c7882ccce85..5c619a2646f5 100644 --- llvm/bindings/ocaml/debuginfo/llvm_debuginfo.mli +++ llvm/bindings/ocaml/debuginfo/llvm_debuginfo.mli @@ -471,10 +471,11 @@ val dibuild_create_member_pointer_type : a pointer to member. See LLVMDIBuilderCreateMemberPointerType *) val dibuild_create_object_pointer_type : - lldibuilder -> Llvm.llmetadata -> Llvm.llmetadata + lldibuilder -> Llvm.llmetadata -> implicit:bool -> Llvm.llmetadata (** [dibuild_create_object_pointer_type dib ty] Create a uniqued DIType* clone - with FlagObjectPointer and FlagArtificial set. [dib] is the dibuilder - value and [ty] the underlying type to which this pointer points. *) + with FlagObjectPointer. [dib] is the dibuilder + value and [ty] the underlying type to which this pointer points. If + [implicit] is true, also set FlagArtificial. 
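+
+    A minimal sketch (an illustrative aside, not part of this patch; [dib]
+    and [class_md] stand for an existing dibuilder and a class type's
+    metadata):
+    {[
+      let self_md =
+        dibuild_create_object_pointer_type dib class_md ~implicit:true
+    ]}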
*) val dibuild_create_qualified_type : lldibuilder -> tag:int -> Llvm.llmetadata -> Llvm.llmetadata diff --git llvm/cmake/modules/AddLLVM.cmake llvm/cmake/modules/AddLLVM.cmake index e046e3798e54..d3e9377c8d2f 100644 --- llvm/cmake/modules/AddLLVM.cmake +++ llvm/cmake/modules/AddLLVM.cmake @@ -1220,9 +1220,9 @@ function(add_llvm_pass_plugin name) endif() set_property(GLOBAL APPEND PROPERTY LLVM_STATIC_EXTENSIONS ${name}) elseif(NOT ARG_NO_MODULE) - add_llvm_library(${name} MODULE ${ARG_UNPARSED_ARGUMENTS}) + add_llvm_library(${name} MODULE NO_EXPORT ${ARG_UNPARSED_ARGUMENTS}) else() - add_llvm_library(${name} OBJECT ${ARG_UNPARSED_ARGUMENTS}) + add_llvm_library(${name} OBJECT NO_EXPORT ${ARG_UNPARSED_ARGUMENTS}) endif() message(STATUS "Registering ${name} as a pass plugin (static build: ${LLVM_${name_upper}_LINK_INTO_TOOLS})") diff --git llvm/docs/DeveloperPolicy.rst llvm/docs/DeveloperPolicy.rst index 6614d036a014..18b05d2e58e6 100644 --- llvm/docs/DeveloperPolicy.rst +++ llvm/docs/DeveloperPolicy.rst @@ -136,7 +136,7 @@ awareness of. For such changes, the following should be done: .. warning:: - Phabricator is deprecated is available in read-only mode, + Phabricator is deprecated and is available in read-only mode, for new code contributions use :ref:`GitHub Pull Requests <github-reviews>`. This section contains old information that needs to be updated. diff --git llvm/docs/GitHub.rst llvm/docs/GitHub.rst index 85766bfe94af..892b8abcc2d4 100644 --- llvm/docs/GitHub.rst +++ llvm/docs/GitHub.rst @@ -50,7 +50,7 @@ documentation refer to `GitHub's documentation <https://docs.github.com/pull-req .. note:: If you are using a Pull Request for purposes other than review (eg: precommit CI results, convenient web-based reverts, etc) - `skip-precommit-approval <https://github.com/llvm/llvm-project/labels?q=skip-precommit-approval>`_ + add the `skip-precommit-approval <https://github.com/llvm/llvm-project/labels?q=skip-precommit-approval>`_ label to the PR. GitHub Tools diff --git llvm/docs/MyFirstTypoFix.rst llvm/docs/MyFirstTypoFix.rst index 733b3eac141f..5856615bee8b 100644 --- llvm/docs/MyFirstTypoFix.rst +++ llvm/docs/MyFirstTypoFix.rst @@ -378,7 +378,7 @@ your branch with more commits and push to your GitHub fork of ``llvm-project``. It is best if you answer comments from the reviewer directly instead of expecting them to read through all the changes again. -For example you might comment "I have done this." or "I was able to this part +For example you might comment "I have done this." or "I was able to do this part but have a question about...". Review expectations diff --git llvm/docs/ReleaseNotes.md llvm/docs/ReleaseNotes.md index 8f88b824f965..48f962d212e8 100644 --- llvm/docs/ReleaseNotes.md +++ llvm/docs/ReleaseNotes.md @@ -47,6 +47,12 @@ for adding a new subsection. --> same semantics. The normalizer makes it easier to spot semantic differences when diffing two modules which have undergone different passes. +* The SPIR-V backend is now an official LLVM target, providing OpenCL and SYCL + conformance and establishing a foundation for broader applicability to other + APIs, including Vulkan, GLSL, and HLSL. This backend aims to offer a unified + approach for diverse compute and graphics workloads, providing a robust + alternative to the Khronos SPIR-V LLVM Translator. + * ... 
<!-- If you would like to document a larger change, then you can add a diff --git llvm/include/llvm-c/DebugInfo.h llvm/include/llvm-c/DebugInfo.h index 07f87d44088e..ac7ee5a7cc9a 100644 --- llvm/include/llvm-c/DebugInfo.h +++ llvm/include/llvm-c/DebugInfo.h @@ -870,13 +870,16 @@ LLVMDIBuilderCreateObjCProperty(LLVMDIBuilderRef Builder, LLVMMetadataRef Ty); /** - * Create a uniqued DIType* clone with FlagObjectPointer and FlagArtificial set. + * Create a uniqued DIType* clone with FlagObjectPointer. If \c Implicit + * is true, then also set FlagArtificial. * \param Builder The DIBuilder. * \param Type The underlying type to which this pointer points. + * \param Implicit Indicates whether this pointer was implicitly generated + * (i.e., not spelled out in source). */ -LLVMMetadataRef -LLVMDIBuilderCreateObjectPointerType(LLVMDIBuilderRef Builder, - LLVMMetadataRef Type); +LLVMMetadataRef LLVMDIBuilderCreateObjectPointerType(LLVMDIBuilderRef Builder, + LLVMMetadataRef Type, + LLVMBool Implicit); /** * Create debugging information entry for a qualified diff --git llvm/include/llvm/BinaryFormat/DXContainerConstants.def llvm/include/llvm/BinaryFormat/DXContainerConstants.def index 1aacbb2f65b2..96d4499c9cad 100644 --- llvm/include/llvm/BinaryFormat/DXContainerConstants.def +++ llvm/include/llvm/BinaryFormat/DXContainerConstants.def @@ -58,7 +58,7 @@ SHADER_FEATURE_FLAG(31, 36, NextUnusedBit, "Next reserved shader flag bit (not a DXIL_MODULE_FLAG( 0, DisableOptimizations, "D3D11_1_SB_GLOBAL_FLAG_SKIP_OPTIMIZATION") DXIL_MODULE_FLAG( 1, DisableMathRefactoring, "D3D10_SB_GLOBAL_FLAG_REFACTORING_ALLOWED") DXIL_MODULE_FLAG( 3, ForceEarlyDepthStencil, "D3D11_SB_GLOBAL_FLAG_FORCE_EARLY_DEPTH_STENCIL") -DXIL_MODULE_FLAG( 4, EnableRawAndStructuredBuffers, "D3D11_SB_GLOBAL_FLAG_ENABLE_RAW_AND_STRUCTURED_BUFFERS") +DXIL_MODULE_FLAG( 4, EnableRawAndStructuredBuffers, "Raw and Structured buffers") DXIL_MODULE_FLAG( 5, LowPrecisionPresent, "D3D11_1_SB_GLOBAL_FLAG_ENABLE_MINIMUM_PRECISION") DXIL_MODULE_FLAG( 8, AllResourcesBound, "D3D12_SB_GLOBAL_FLAG_ALL_RESOURCES_BOUND") DXIL_MODULE_FLAG(23, UseNativeLowPrecision, "Native 16bit types enabled") diff --git llvm/include/llvm/BinaryFormat/Wasm.h llvm/include/llvm/BinaryFormat/Wasm.h index 759e43212509..ede2d692a594 100644 --- llvm/include/llvm/BinaryFormat/Wasm.h +++ llvm/include/llvm/BinaryFormat/Wasm.h @@ -170,7 +170,7 @@ enum : unsigned { WASM_ELEM_SEGMENT_HAS_TABLE_NUMBER = 0x02, // if passive == 0 WASM_ELEM_SEGMENT_HAS_INIT_EXPRS = 0x04, }; -const unsigned WASM_ELEM_SEGMENT_MASK_HAS_ELEM_KIND = 0x3; +const unsigned WASM_ELEM_SEGMENT_MASK_HAS_ELEM_DESC = 0x3; // Feature policy prefixes used in the custom "target_features" section enum : uint8_t { @@ -415,6 +415,10 @@ struct WasmDataSegment { uint32_t Comdat; // from the "comdat info" section }; +// 3 different element segment modes are encodable. This class is currently +// only used during decoding (see WasmElemSegment below). +enum class ElemSegmentMode { Active, Passive, Declarative }; + // Represents a Wasm element segment, with some limitations compared to the spec: // 1) Does not model passive or declarative segments (Segment will end up with // an Offset field of i32.const 0) diff --git llvm/include/llvm/CodeGen/BasicTTIImpl.h llvm/include/llvm/CodeGen/BasicTTIImpl.h index 232106fd445e..e4e4e264180d 100644 --- llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -223,12 +223,11 @@ private: // // First, compute the cost of the individual memory operations.
InstructionCost AddrExtractCost = - IsGatherScatter - ? getScalarizationOverhead( - FixedVectorType::get( - PointerType::get(VT->getElementType(), 0), VF), - /*Insert=*/false, /*Extract=*/true, CostKind) - : 0; + IsGatherScatter ? getScalarizationOverhead( + FixedVectorType::get( + PointerType::get(VT->getContext(), 0), VF), + /*Insert=*/false, /*Extract=*/true, CostKind) + : 0; // The cost of the scalar loads/stores. InstructionCost MemoryOpCost = diff --git llvm/include/llvm/CodeGen/CallingConvLower.h llvm/include/llvm/CodeGen/CallingConvLower.h index d5a63c8dd627..85171138d1eb 100644 --- llvm/include/llvm/CodeGen/CallingConvLower.h +++ llvm/include/llvm/CodeGen/CallingConvLower.h @@ -254,7 +254,7 @@ public: /// isAllocated - Return true if the specified register (or an alias) is /// allocated. bool isAllocated(MCRegister Reg) const { - return UsedRegs[Reg / 32] & (1 << (Reg & 31)); + return UsedRegs[Reg.id() / 32] & (1 << (Reg.id() & 31)); } /// AnalyzeFormalArguments - Analyze an array of argument values, diff --git llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index 94e36e412b0c..9b78342c8fc3 100644 --- llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -840,8 +840,10 @@ public: bool matchRedundantBinOpInEquality(MachineInstr &MI, BuildFnTy &MatchInfo) const; - /// Match shifts greater or equal to the bitwidth of the operation. - bool matchShiftsTooBig(MachineInstr &MI) const; + /// Match shifts greater or equal to the range (the bitwidth of the result + /// datatype, or the effective bitwidth of the source value). + bool matchShiftsTooBig(MachineInstr &MI, + std::optional<int64_t> &MatchInfo) const; /// Match constant LHS ops that should be commuted. bool matchCommuteConstantToRHS(MachineInstr &MI) const; diff --git llvm/include/llvm/CodeGen/ISDOpcodes.h llvm/include/llvm/CodeGen/ISDOpcodes.h index 604dc9419025..fd8784a4c100 100644 --- llvm/include/llvm/CodeGen/ISDOpcodes.h +++ llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -1480,6 +1480,10 @@ enum NodeType { // Output: Output Chain EXPERIMENTAL_VECTOR_HISTOGRAM, + // Finds the index of the last active mask element + // Operands: Mask + VECTOR_FIND_LAST_ACTIVE, + // llvm.clear_cache intrinsic // Operands: Input Chain, Start Address, End Address // Outputs: Output Chain diff --git llvm/include/llvm/CodeGen/LivePhysRegs.h llvm/include/llvm/CodeGen/LivePhysRegs.h index d315e4ff6f3a..037905119eb2 100644 --- llvm/include/llvm/CodeGen/LivePhysRegs.h +++ llvm/include/llvm/CodeGen/LivePhysRegs.h @@ -93,7 +93,7 @@ public: assert(TRI && "LivePhysRegs is not initialized."); assert(Reg <= TRI->getNumRegs() && "Expected a physical register."); for (MCRegAliasIterator R(Reg, TRI, true); R.isValid(); ++R) - LiveRegs.erase(*R); + LiveRegs.erase((*R).id()); } /// Removes physical registers clobbered by the regmask operand \p MO. diff --git llvm/include/llvm/CodeGen/LiveRegMatrix.h llvm/include/llvm/CodeGen/LiveRegMatrix.h index 373f4402dd8d..ce7810802910 100644 --- llvm/include/llvm/CodeGen/LiveRegMatrix.h +++ llvm/include/llvm/CodeGen/LiveRegMatrix.h @@ -161,7 +161,7 @@ public: /// Use MCRegUnitIterator to enumerate all regunits in the desired PhysReg. /// This returns a reference to an internal Query data structure that is only /// valid until the next query() call.
- LiveIntervalUnion::Query &query(const LiveRange &LR, MCRegister RegUnit); + LiveIntervalUnion::Query &query(const LiveRange &LR, MCRegUnit RegUnit); /// Directly access the live interval unions per regunit. /// This returns an array indexed by the regunit number. diff --git llvm/include/llvm/CodeGen/MachineBasicBlock.h llvm/include/llvm/CodeGen/MachineBasicBlock.h index 7fe33c3913f2..0b803a972474 100644 --- llvm/include/llvm/CodeGen/MachineBasicBlock.h +++ llvm/include/llvm/CodeGen/MachineBasicBlock.h @@ -129,7 +129,7 @@ public: /// clearly as they both have an integer type. struct RegisterMaskPair { public: - MCPhysReg PhysReg; + MCRegister PhysReg; LaneBitmask LaneMask; RegisterMaskPair(MCPhysReg PhysReg, LaneBitmask LaneMask) diff --git llvm/include/llvm/CodeGen/MachineOperand.h llvm/include/llvm/CodeGen/MachineOperand.h index 63a172134538..be1b4fb7d54f 100644 --- llvm/include/llvm/CodeGen/MachineOperand.h +++ llvm/include/llvm/CodeGen/MachineOperand.h @@ -645,8 +645,9 @@ public: /// mask pointers. static bool clobbersPhysReg(const uint32_t *RegMask, MCRegister PhysReg) { // See TargetRegisterInfo.h. - assert(PhysReg < (1u << 30) && "Not a physical register"); - return !(RegMask[PhysReg / 32] & (1u << PhysReg % 32)); + assert((!PhysReg.isValid() || PhysReg.isPhysical()) && + "Not a physical register"); + return !(RegMask[PhysReg.id() / 32] & (1u << PhysReg.id() % 32)); } /// clobbersPhysReg - Returns true if this RegMask operand clobbers PhysReg. diff --git llvm/include/llvm/CodeGen/MachineRegisterInfo.h llvm/include/llvm/CodeGen/MachineRegisterInfo.h index 5ee3aef28a4f..91f68581df48 100644 --- llvm/include/llvm/CodeGen/MachineRegisterInfo.h +++ llvm/include/llvm/CodeGen/MachineRegisterInfo.h @@ -938,7 +938,7 @@ public: MCRegAliasIterator R(PhysReg, TRI, true); for (; R.isValid(); ++R) - ReservedRegs.set(*R); + ReservedRegs.set((*R).id()); } /// reservedRegsFrozen - Returns true after freezeReservedRegs() was called @@ -951,7 +951,7 @@ public: /// register. Any register can be reserved before freezeReservedRegs() is /// called. bool canReserveReg(MCRegister PhysReg) const { - return !reservedRegsFrozen() || ReservedRegs.test(PhysReg); + return !reservedRegsFrozen() || ReservedRegs.test(PhysReg.id()); } /// getReservedRegs - Returns a reference to the frozen set of reserved diff --git llvm/include/llvm/CodeGen/ReachingDefAnalysis.h llvm/include/llvm/CodeGen/ReachingDefAnalysis.h index 0c1e707e4ecb..2e976a88b4ce 100644 --- llvm/include/llvm/CodeGen/ReachingDefAnalysis.h +++ llvm/include/llvm/CodeGen/ReachingDefAnalysis.h @@ -177,24 +177,23 @@ public: /// Provides the instruction id of the closest reaching def instruction of /// Reg that reaches MI, relative to the beginning of MI's basic block. - int getReachingDef(MachineInstr *MI, MCRegister Reg) const; + int getReachingDef(MachineInstr *MI, Register Reg) const; /// Return whether A and B use the same def of Reg. - bool hasSameReachingDef(MachineInstr *A, MachineInstr *B, - MCRegister Reg) const; + bool hasSameReachingDef(MachineInstr *A, MachineInstr *B, Register Reg) const; /// Return whether the reaching def for MI also is live out of its parent /// block. - bool isReachingDefLiveOut(MachineInstr *MI, MCRegister Reg) const; + bool isReachingDefLiveOut(MachineInstr *MI, Register Reg) const; /// Return the local MI that produces the live out value for Reg, or /// nullptr for a non-live out or non-local def.
MachineInstr *getLocalLiveOutMIDef(MachineBasicBlock *MBB, - MCRegister Reg) const; + Register Reg) const; /// If a single MachineInstr creates the reaching definition, then return it. /// Otherwise return null. - MachineInstr *getUniqueReachingMIDef(MachineInstr *MI, MCRegister Reg) const; + MachineInstr *getUniqueReachingMIDef(MachineInstr *MI, Register Reg) const; /// If a single MachineInstr creates the reaching definition, for MIs operand /// at Idx, then return it. Otherwise return null. @@ -206,43 +205,42 @@ public: /// Provide whether the register has been defined in the same basic block as, /// and before, MI. - bool hasLocalDefBefore(MachineInstr *MI, MCRegister Reg) const; + bool hasLocalDefBefore(MachineInstr *MI, Register Reg) const; /// Return whether the given register is used after MI, whether it's a local /// use or a live out. - bool isRegUsedAfter(MachineInstr *MI, MCRegister Reg) const; + bool isRegUsedAfter(MachineInstr *MI, Register Reg) const; /// Return whether the given register is defined after MI. - bool isRegDefinedAfter(MachineInstr *MI, MCRegister Reg) const; + bool isRegDefinedAfter(MachineInstr *MI, Register Reg) const; /// Provides the clearance - the number of instructions since the closest /// reaching def instruction of Reg that reaches MI. - int getClearance(MachineInstr *MI, MCRegister Reg) const; + int getClearance(MachineInstr *MI, Register Reg) const; /// Provides the uses, in the same block as MI, of register that MI defines. /// This does not consider live-outs. - void getReachingLocalUses(MachineInstr *MI, MCRegister Reg, + void getReachingLocalUses(MachineInstr *MI, Register Reg, InstSet &Uses) const; /// Search MBB for a definition of Reg and insert it into Defs. If no /// definition is found, recursively search the predecessor blocks for them. - void getLiveOuts(MachineBasicBlock *MBB, MCRegister Reg, InstSet &Defs, + void getLiveOuts(MachineBasicBlock *MBB, Register Reg, InstSet &Defs, BlockSet &VisitedBBs) const; - void getLiveOuts(MachineBasicBlock *MBB, MCRegister Reg, InstSet &Defs) const; + void getLiveOuts(MachineBasicBlock *MBB, Register Reg, InstSet &Defs) const; /// For the given block, collect the instructions that use the live-in /// value of the provided register. Return whether the value is still /// live on exit. - bool getLiveInUses(MachineBasicBlock *MBB, MCRegister Reg, - InstSet &Uses) const; + bool getLiveInUses(MachineBasicBlock *MBB, Register Reg, InstSet &Uses) const; /// Collect the users of the value stored in Reg, which is defined /// by MI. - void getGlobalUses(MachineInstr *MI, MCRegister Reg, InstSet &Uses) const; + void getGlobalUses(MachineInstr *MI, Register Reg, InstSet &Uses) const; /// Collect all possible definitions of the value stored in Reg, which is /// used by MI. - void getGlobalReachingDefs(MachineInstr *MI, MCRegister Reg, + void getGlobalReachingDefs(MachineInstr *MI, Register Reg, InstSet &Defs) const; /// Return whether From can be moved forwards to just before To. @@ -267,13 +265,12 @@ public: /// Return whether a MachineInstr could be inserted at MI and safely define /// the given register without affecting the program. - bool isSafeToDefRegAt(MachineInstr *MI, MCRegister Reg) const; + bool isSafeToDefRegAt(MachineInstr *MI, Register Reg) const; /// Return whether a MachineInstr could be inserted at MI and safely define /// the given register without affecting the program, ignoring any effects /// on the provided instructions.
- bool isSafeToDefRegAt(MachineInstr *MI, MCRegister Reg, - InstSet &Ignore) const; + bool isSafeToDefRegAt(MachineInstr *MI, Register Reg, InstSet &Ignore) const; private: /// Set up LiveRegs by merging predecessor live-out values. @@ -308,7 +305,7 @@ private: /// Provides the instruction of the closest reaching def instruction of /// Reg that reaches MI, relative to the beginning of MI's basic block. - MachineInstr *getReachingLocalMIDef(MachineInstr *MI, MCRegister Reg) const; + MachineInstr *getReachingLocalMIDef(MachineInstr *MI, Register Reg) const; }; } // namespace llvm diff --git llvm/include/llvm/CodeGen/Register.h llvm/include/llvm/CodeGen/Register.h index 4a61ea8af3b4..fac5f00110ef 100644 --- llvm/include/llvm/CodeGen/Register.h +++ llvm/include/llvm/CodeGen/Register.h @@ -21,7 +21,7 @@ class Register { public: constexpr Register(unsigned Val = 0) : Reg(Val) {} - constexpr Register(MCRegister Val) : Reg(Val) {} + constexpr Register(MCRegister Val) : Reg(Val.id()) {} // Register numbers can represent physical registers, virtual registers, and // sometimes stack slots. The unsigned values are divided into these ranges: @@ -108,8 +108,7 @@ public: /// expected to have already validated that this Register is, indeed, /// physical. MCRegister asMCReg() const { - assert(Reg == MCRegister::NoRegister || - MCRegister::isPhysicalRegister(Reg)); + assert(!isValid() || isPhysical()); return MCRegister(Reg); } diff --git llvm/include/llvm/CodeGen/SelectionDAGNodes.h llvm/include/llvm/CodeGen/SelectionDAGNodes.h index 03899493847b..49467ce0a54c 100644 --- llvm/include/llvm/CodeGen/SelectionDAGNodes.h +++ llvm/include/llvm/CodeGen/SelectionDAGNodes.h @@ -916,10 +916,10 @@ public: SmallVectorImpl<const SDNode *> &Worklist, unsigned int MaxSteps = 0, bool TopologicalPrune = false) { - SmallVector<const SDNode *, 8> DeferredNodes; if (Visited.count(N)) return true; + SmallVector<const SDNode *, 8> DeferredNodes; // Node Id's are assigned in three places: As a topological // ordering (> 0), during legalization (results in values set to // 0), new nodes (set to -1). If N has a topological id then we diff --git llvm/include/llvm/CodeGen/TargetLowering.h llvm/include/llvm/CodeGen/TargetLowering.h index ce58777655e0..38ac90f0c081 100644 --- llvm/include/llvm/CodeGen/TargetLowering.h +++ llvm/include/llvm/CodeGen/TargetLowering.h @@ -4781,7 +4781,7 @@ public: virtual bool CanLowerReturn(CallingConv::ID /*CallConv*/, MachineFunction &/*MF*/, bool /*isVarArg*/, const SmallVectorImpl<ISD::OutputArg> &/*Outs*/, - LLVMContext &/*Context*/) const + LLVMContext &/*Context*/, const Type *RetTy) const { // Return true by default to get preexisting behavior. return true; @@ -5368,6 +5368,11 @@ public: /// \returns The expansion result or SDValue() if it fails. SDValue expandVPCTTZElements(SDNode *N, SelectionDAG &DAG) const; + /// Expand VECTOR_FIND_LAST_ACTIVE nodes /// \param N Node to expand + /// \returns The expansion result or SDValue() if it fails. + SDValue expandVectorFindLastActive(SDNode *N, SelectionDAG &DAG) const; + /// Expand ABS nodes. Expands vector/scalar ABS nodes, /// vector nodes can only succeed if all operations are legal/custom.
/// (ABS x) -> (XOR (ADD x, (SRA x, type_size)), (SRA x, type_size)) diff --git llvm/include/llvm/CodeGen/TargetSubtargetInfo.h llvm/include/llvm/CodeGen/TargetSubtargetInfo.h index a94ebf55f6c1..76c94981e1af 100644 --- llvm/include/llvm/CodeGen/TargetSubtargetInfo.h +++ llvm/include/llvm/CodeGen/TargetSubtargetInfo.h @@ -324,7 +324,7 @@ public: /// written in the tablegen descriptions, false if it should allocate /// the specified physical register later if is it callee-saved. virtual bool ignoreCSRForAllocationOrder(const MachineFunction &MF, - unsigned PhysReg) const { + MCRegister PhysReg) const { return false; } diff --git llvm/include/llvm/CodeGen/VirtRegMap.h llvm/include/llvm/CodeGen/VirtRegMap.h index 45750f34fa20..c9e405e1981d 100644 --- llvm/include/llvm/CodeGen/VirtRegMap.h +++ llvm/include/llvm/CodeGen/VirtRegMap.h @@ -94,7 +94,7 @@ public: /// creates a mapping for the specified virtual register to /// the specified physical register - void assignVirt2Phys(Register virtReg, MCPhysReg physReg); + void assignVirt2Phys(Register virtReg, MCRegister physReg); bool isShapeMapEmpty() const { return Virt2ShapeMap.empty(); } diff --git llvm/include/llvm/DebugInfo/CodeView/CodeViewRegisters.def llvm/include/llvm/DebugInfo/CodeView/CodeViewRegisters.def index 5d537755b2d6..7c38b536c277 100644 --- llvm/include/llvm/DebugInfo/CodeView/CodeViewRegisters.def +++ llvm/include/llvm/DebugInfo/CodeView/CodeViewRegisters.def @@ -16,7 +16,8 @@ #if !defined(CV_REGISTERS_ALL) && !defined(CV_REGISTERS_X86) && \ !defined(CV_REGISTERS_ARM) && \ - !defined(CV_REGISTERS_ARM64) + !defined(CV_REGISTERS_ARM64) && \ + !defined(CV_REGISTERS_MIPS) #error Need include at least one register set. #endif @@ -793,3 +794,88 @@ CV_REGISTER(ARM64_H31, 301) #pragma pop_macro("ARM64_FPCR") #endif // defined(CV_REGISTERS_ALL) || defined(CV_REGISTERS_ARM64) + +#if defined(CV_REGISTERS_ALL) || defined(CV_REGISTERS_MIPS) + +// MIPS registers +CV_REGISTER(MIPS_NOREG, 0) + +// General purpose integer registers + +CV_REGISTER(MIPS_ZERO, 10) +CV_REGISTER(MIPS_AT, 11) +CV_REGISTER(MIPS_V0, 12) +CV_REGISTER(MIPS_V1, 13) +CV_REGISTER(MIPS_A0, 14) +CV_REGISTER(MIPS_A1, 15) +CV_REGISTER(MIPS_A2, 16) +CV_REGISTER(MIPS_A3, 17) +CV_REGISTER(MIPS_T0, 18) +CV_REGISTER(MIPS_T1, 19) +CV_REGISTER(MIPS_T2, 20) +CV_REGISTER(MIPS_T3, 21) +CV_REGISTER(MIPS_T4, 22) +CV_REGISTER(MIPS_T5, 23) +CV_REGISTER(MIPS_T6, 24) +CV_REGISTER(MIPS_T7, 25) +CV_REGISTER(MIPS_S0, 26) +CV_REGISTER(MIPS_S1, 27) +CV_REGISTER(MIPS_S2, 28) +CV_REGISTER(MIPS_S3, 29) +CV_REGISTER(MIPS_S4, 30) +CV_REGISTER(MIPS_S5, 31) +CV_REGISTER(MIPS_S6, 32) +CV_REGISTER(MIPS_S7, 33) +CV_REGISTER(MIPS_T8, 34) +CV_REGISTER(MIPS_T9, 35) +CV_REGISTER(MIPS_K0, 36) +CV_REGISTER(MIPS_K1, 37) +CV_REGISTER(MIPS_GP, 38) +CV_REGISTER(MIPS_SP, 39) +CV_REGISTER(MIPS_S8, 40) +CV_REGISTER(MIPS_RA, 41) +CV_REGISTER(MIPS_LO, 42) +CV_REGISTER(MIPS_HI, 43) + +// Status registers + +CV_REGISTER(MIPS_Fir, 50) +CV_REGISTER(MIPS_Psr, 51) + +// Floating-point registers + +CV_REGISTER(MIPS_F0, 60) +CV_REGISTER(MIPS_F1, 61) +CV_REGISTER(MIPS_F2, 62) +CV_REGISTER(MIPS_F3, 63) +CV_REGISTER(MIPS_F4, 64) +CV_REGISTER(MIPS_F5, 65) +CV_REGISTER(MIPS_F6, 66) +CV_REGISTER(MIPS_F7, 67) +CV_REGISTER(MIPS_F8, 68) +CV_REGISTER(MIPS_F9, 69) +CV_REGISTER(MIPS_F10, 70) +CV_REGISTER(MIPS_F11, 71) +CV_REGISTER(MIPS_F12, 72) +CV_REGISTER(MIPS_F13, 73) +CV_REGISTER(MIPS_F14, 74) +CV_REGISTER(MIPS_F15, 75) +CV_REGISTER(MIPS_F16, 76) +CV_REGISTER(MIPS_F17, 77) +CV_REGISTER(MIPS_F18, 78) +CV_REGISTER(MIPS_F19, 79) 
+CV_REGISTER(MIPS_F20, 80) +CV_REGISTER(MIPS_F21, 81) +CV_REGISTER(MIPS_F22, 82) +CV_REGISTER(MIPS_F23, 83) +CV_REGISTER(MIPS_F24, 84) +CV_REGISTER(MIPS_F25, 85) +CV_REGISTER(MIPS_F26, 86) +CV_REGISTER(MIPS_F27, 87) +CV_REGISTER(MIPS_F28, 88) +CV_REGISTER(MIPS_F29, 89) +CV_REGISTER(MIPS_F30, 90) +CV_REGISTER(MIPS_F31, 91) +CV_REGISTER(MIPS_Fsr, 92) + +#endif // defined(CV_REGISTERS_ALL) || defined(CV_REGISTERS_MIPS) diff --git llvm/include/llvm/ExecutionEngine/Orc/EHFrameRegistrationPlugin.h llvm/include/llvm/ExecutionEngine/Orc/EHFrameRegistrationPlugin.h new file mode 100644 index 000000000000..6cbbc0c94a37 --- /dev/null +++ llvm/include/llvm/ExecutionEngine/Orc/EHFrameRegistrationPlugin.h @@ -0,0 +1,55 @@ +//===----- EHFrameRegistrationPlugin.h - Register eh-frames -----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Register eh-frame sections with a registrar. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_EXECUTIONENGINE_ORC_EHFRAMEREGISTRATIONPLUGIN_H +#define LLVM_EXECUTIONENGINE_ORC_EHFRAMEREGISTRATIONPLUGIN_H + +#include "llvm/ExecutionEngine/Orc/LinkGraphLinkingLayer.h" + +#include <memory> +#include <mutex> +#include <vector> + +namespace llvm { + +namespace jitlink { +class EHFrameRegistrar; +} // namespace jitlink + +namespace orc { + +class EHFrameRegistrationPlugin : public LinkGraphLinkingLayer::Plugin { +public: + EHFrameRegistrationPlugin( + ExecutionSession &ES, + std::unique_ptr<jitlink::EHFrameRegistrar> Registrar); + void modifyPassConfig(MaterializationResponsibility &MR, + jitlink::LinkGraph &G, + jitlink::PassConfiguration &PassConfig) override; + Error notifyEmitted(MaterializationResponsibility &MR) override; + Error notifyFailed(MaterializationResponsibility &MR) override; + Error notifyRemovingResources(JITDylib &JD, ResourceKey K) override; + void notifyTransferringResources(JITDylib &JD, ResourceKey DstKey, + ResourceKey SrcKey) override; + +private: + std::mutex EHFramePluginMutex; + ExecutionSession &ES; + std::unique_ptr<jitlink::EHFrameRegistrar> Registrar; + DenseMap<MaterializationResponsibility *, ExecutorAddrRange> InProcessLinks; + DenseMap<ResourceKey, std::vector<ExecutorAddrRange>> EHFrameRanges; +}; + +} // end namespace orc +} // end namespace llvm + +#endif // LLVM_EXECUTIONENGINE_ORC_EHFRAMEREGISTRATIONPLUGIN_H diff --git llvm/include/llvm/ExecutionEngine/Orc/LinkGraphLinkingLayer.h llvm/include/llvm/ExecutionEngine/Orc/LinkGraphLinkingLayer.h index 3375bd9e4e2e..060fda57bd4f 100644 --- llvm/include/llvm/ExecutionEngine/Orc/LinkGraphLinkingLayer.h +++ llvm/include/llvm/ExecutionEngine/Orc/LinkGraphLinkingLayer.h @@ -173,28 +173,6 @@ private: std::vector<std::shared_ptr<Plugin>> Plugins; }; -class EHFrameRegistrationPlugin : public LinkGraphLinkingLayer::Plugin { -public: - EHFrameRegistrationPlugin( - ExecutionSession &ES, - std::unique_ptr<jitlink::EHFrameRegistrar> Registrar); - void modifyPassConfig(MaterializationResponsibility &MR, - jitlink::LinkGraph &G, - jitlink::PassConfiguration &PassConfig) override; - Error notifyEmitted(MaterializationResponsibility &MR) override; - Error notifyFailed(MaterializationResponsibility &MR) override; - Error notifyRemovingResources(JITDylib &JD, ResourceKey K) override; - void 
notifyTransferringResources(JITDylib &JD, ResourceKey DstKey, - ResourceKey SrcKey) override; - -private: - std::mutex EHFramePluginMutex; - ExecutionSession &ES; - std::unique_ptr<jitlink::EHFrameRegistrar> Registrar; - DenseMap<MaterializationResponsibility *, ExecutorAddrRange> InProcessLinks; - DenseMap<ResourceKey, std::vector<ExecutorAddrRange>> EHFrameRanges; -}; - } // end namespace orc } // end namespace llvm diff --git llvm/include/llvm/IR/DIBuilder.h llvm/include/llvm/IR/DIBuilder.h index cb1150c269a1..6c479415b9ed 100644 --- llvm/include/llvm/IR/DIBuilder.h +++ llvm/include/llvm/IR/DIBuilder.h @@ -662,9 +662,9 @@ namespace llvm { /// Create a uniqued clone of \p Ty with FlagArtificial set. static DIType *createArtificialType(DIType *Ty); - /// Create a uniqued clone of \p Ty with FlagObjectPointer and - /// FlagArtificial set. - static DIType *createObjectPointerType(DIType *Ty); + /// Create a uniqued clone of \p Ty with FlagObjectPointer set. + /// If \p Implicit is true, also set FlagArtificial. + static DIType *createObjectPointerType(DIType *Ty, bool Implicit); /// Create a permanent forward-declared type. DICompositeType *createForwardDecl(unsigned Tag, StringRef Name, diff --git llvm/include/llvm/IR/GlobalValue.h llvm/include/llvm/IR/GlobalValue.h index d9104d7af5f9..2176e2c2cfbf 100644 --- llvm/include/llvm/IR/GlobalValue.h +++ llvm/include/llvm/IR/GlobalValue.h @@ -79,7 +79,8 @@ public: protected: GlobalValue(Type *Ty, ValueTy VTy, AllocInfo AllocInfo, LinkageTypes Linkage, const Twine &Name, unsigned AddressSpace) - : Constant(PointerType::get(Ty, AddressSpace), VTy, AllocInfo), + : Constant(PointerType::get(Ty->getContext(), AddressSpace), VTy, + AllocInfo), ValueType(Ty), Visibility(DefaultVisibility), UnnamedAddrVal(unsigned(UnnamedAddr::None)), DllStorageClass(DefaultStorageClass), ThreadLocal(NotThreadLocal), diff --git llvm/include/llvm/IR/IntrinsicsAArch64.td llvm/include/llvm/IR/IntrinsicsAArch64.td index cc7a81e15f66..b31a65d9bcc0 100644 --- llvm/include/llvm/IR/IntrinsicsAArch64.td +++ llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -538,17 +538,6 @@ let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in { def int_aarch64_neon_bfmlalb : AdvSIMD_BF16FML_Intrinsic; def int_aarch64_neon_bfmlalt : AdvSIMD_BF16FML_Intrinsic; - - // v8.6-A Bfloat Intrinsics - def int_aarch64_neon_bfcvt - : DefaultAttrsIntrinsic<[llvm_bfloat_ty], [llvm_float_ty], [IntrNoMem]>; - def int_aarch64_neon_bfcvtn - : DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v4f32_ty], [IntrNoMem]>; - def int_aarch64_neon_bfcvtn2 - : DefaultAttrsIntrinsic<[llvm_v8bf16_ty], - [llvm_v8bf16_ty, llvm_v4f32_ty], - [IntrNoMem]>; - // v8.2-A FP16 Fused Multiply-Add Long def int_aarch64_neon_fmlal : AdvSIMD_FP16FML_Intrinsic; def int_aarch64_neon_fmlsl : AdvSIMD_FP16FML_Intrinsic; diff --git llvm/include/llvm/IR/IntrinsicsSystemZ.td llvm/include/llvm/IR/IntrinsicsSystemZ.td index 4f925979cf85..38b7463c7b07 100644 --- llvm/include/llvm/IR/IntrinsicsSystemZ.td +++ llvm/include/llvm/IR/IntrinsicsSystemZ.td @@ -445,6 +445,39 @@ let TargetPrefix = "s390" in { Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>; + + // Instructions from the Vector Enhancements Facility 3 + def int_s390_vgemb : SystemZUnaryConv<"vgemb", llvm_v16i8_ty, llvm_v8i16_ty>; + def int_s390_vgemh : SystemZUnaryConv<"vgemh", llvm_v8i16_ty, llvm_v16i8_ty>; + def int_s390_vgemf : SystemZUnaryConv<"vgemf", llvm_v4i32_ty, llvm_v16i8_ty>; + def int_s390_vgemg : SystemZUnaryConv<"vgemg", llvm_v2i64_ty, 
llvm_v16i8_ty>; + def int_s390_vgemq : SystemZUnaryConv<"vgemq", llvm_i128_ty, llvm_v16i8_ty>; + def int_s390_vuplg : SystemZUnaryConv<"vuplg", llvm_i128_ty, llvm_v2i64_ty>; + def int_s390_vupllg : SystemZUnaryConv<"vupllg", llvm_i128_ty, llvm_v2i64_ty>; + def int_s390_vuphg : SystemZUnaryConv<"vuphg", llvm_i128_ty, llvm_v2i64_ty>; + def int_s390_vuplhg : SystemZUnaryConv<"vuplhg", llvm_i128_ty, llvm_v2i64_ty>; + def int_s390_vavgq : SystemZBinary<"vavgq", llvm_i128_ty>; + def int_s390_vavglq : SystemZBinary<"vavglq", llvm_i128_ty>; + def int_s390_veval : SystemZQuaternaryInt<"veval", llvm_v16i8_ty>; + def int_s390_vmahg : SystemZTernary<"vmahg", llvm_v2i64_ty>; + def int_s390_vmahq : SystemZTernary<"vmahq", llvm_i128_ty>; + def int_s390_vmalhg : SystemZTernary<"vmalhg", llvm_v2i64_ty>; + def int_s390_vmalhq : SystemZTernary<"vmalhq", llvm_i128_ty>; + def int_s390_vmaeg : SystemZTernaryConv<"vmaeg", llvm_i128_ty, llvm_v2i64_ty>; + def int_s390_vmaleg : SystemZTernaryConv<"vmaleg", llvm_i128_ty, llvm_v2i64_ty>; + def int_s390_vmaog : SystemZTernaryConv<"vmaog", llvm_i128_ty, llvm_v2i64_ty>; + def int_s390_vmalog : SystemZTernaryConv<"vmalog", llvm_i128_ty, llvm_v2i64_ty>; + def int_s390_vmhg : SystemZBinary<"vmhg", llvm_v2i64_ty>; + def int_s390_vmhq : SystemZBinary<"vmhq", llvm_i128_ty>; + def int_s390_vmlhg : SystemZBinary<"vmlhg", llvm_v2i64_ty>; + def int_s390_vmlhq : SystemZBinary<"vmlhq", llvm_i128_ty>; + def int_s390_vmeg : SystemZBinaryConv<"vmeg", llvm_i128_ty, llvm_v2i64_ty>; + def int_s390_vmleg : SystemZBinaryConv<"vmleg", llvm_i128_ty, llvm_v2i64_ty>; + def int_s390_vmog : SystemZBinaryConv<"vmog", llvm_i128_ty, llvm_v2i64_ty>; + def int_s390_vmlog : SystemZBinaryConv<"vmlog", llvm_i128_ty, llvm_v2i64_ty>; + def int_s390_vceqqs : SystemZBinaryCC<llvm_i128_ty>; + def int_s390_vchqs : SystemZBinaryCC<llvm_i128_ty>; + def int_s390_vchlqs : SystemZBinaryCC<llvm_i128_ty>; } //===----------------------------------------------------------------------===// @@ -461,4 +494,12 @@ let TargetPrefix = "s390" in { def int_s390_tdc : Intrinsic<[llvm_i32_ty], [llvm_anyfloat_ty, llvm_i64_ty], [IntrNoMem]>; + + // Instructions from the Miscellaneous Instruction Extensions Facility 4 + def int_s390_bdepg : ClangBuiltin<"__builtin_s390_bdepg">, + Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], + [IntrNoMem]>; + def int_s390_bextg : ClangBuiltin<"__builtin_s390_bextg">, + Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], + [IntrNoMem]>; } diff --git llvm/include/llvm/IR/IntrinsicsX86.td llvm/include/llvm/IR/IntrinsicsX86.td index fb12949e10c7..4bac5cd61084 100644 --- llvm/include/llvm/IR/IntrinsicsX86.td +++ llvm/include/llvm/IR/IntrinsicsX86.td @@ -7279,13 +7279,13 @@ let TargetPrefix = "x86" in { } let TargetPrefix = "x86" in { -def int_x86_avx10_vminmaxnepbf16128 : ClangBuiltin<"__builtin_ia32_vminmaxnepbf16128">, +def int_x86_avx10_vminmaxbf16128 : ClangBuiltin<"__builtin_ia32_vminmaxbf16128">, DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v8bf16_ty, llvm_v8bf16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>; -def int_x86_avx10_vminmaxnepbf16256 : ClangBuiltin<"__builtin_ia32_vminmaxnepbf16256">, +def int_x86_avx10_vminmaxbf16256 : ClangBuiltin<"__builtin_ia32_vminmaxbf16256">, DefaultAttrsIntrinsic<[llvm_v16bf16_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>; -def int_x86_avx10_vminmaxnepbf16512 : ClangBuiltin<"__builtin_ia32_vminmaxnepbf16512">, +def int_x86_avx10_vminmaxbf16512 : ClangBuiltin<"__builtin_ia32_vminmaxbf16512">, 
DefaultAttrsIntrinsic<[llvm_v32bf16_ty], [llvm_v32bf16_ty, llvm_v32bf16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>; def int_x86_avx10_vminmaxpd128 : ClangBuiltin<"__builtin_ia32_vminmaxpd128">, diff --git llvm/include/llvm/IR/PatternMatch.h llvm/include/llvm/IR/PatternMatch.h index cd9a36029e6d..b3eeb1d7ba88 100644 --- llvm/include/llvm/IR/PatternMatch.h +++ llvm/include/llvm/IR/PatternMatch.h @@ -1430,6 +1430,34 @@ m_NUWAddLike(const LHS &L, const RHS &R) { return m_CombineOr(m_NUWAdd(L, R), m_DisjointOr(L, R)); } +template <typename LHS, typename RHS> +struct XorLike_match { + LHS L; + RHS R; + + XorLike_match(const LHS &L, const RHS &R) : L(L), R(R) {} + + template <typename OpTy> bool match(OpTy *V) { + if (auto *Op = dyn_cast<BinaryOperator>(V)) { + if (Op->getOpcode() == Instruction::Sub && Op->hasNoUnsignedWrap() && + PatternMatch::match(Op->getOperand(0), m_LowBitMask())) + ; // Pass + else if (Op->getOpcode() != Instruction::Xor) + return false; + return (L.match(Op->getOperand(0)) && R.match(Op->getOperand(1))) || + (L.match(Op->getOperand(1)) && R.match(Op->getOperand(0))); + } + return false; + } +}; + +/// Match either `(xor L, R)`, `(xor R, L)`, or `(sub nuw R, L)` iff `R.isMask()`. +/// Only offered as a commutative matcher, since the `sub` form needs to swap +/// the L and R. +template <typename LHS, typename RHS> +inline auto m_c_XorLike(const LHS &L, const RHS &R) { + return XorLike_match<LHS, RHS>(L, R); +} + //===----------------------------------------------------------------------===// // Class that matches a group of binary opcodes. // diff --git llvm/include/llvm/IR/PrintPasses.h llvm/include/llvm/IR/PrintPasses.h index 95b97e76c867..0aa1b379c35c 100644 --- llvm/include/llvm/IR/PrintPasses.h +++ llvm/include/llvm/IR/PrintPasses.h @@ -51,6 +51,9 @@ std::vector<std::string> printAfterPasses(); // Returns true if we should always print the entire module. bool forcePrintModuleIR(); +// Returns true if we should print the entire function for loop passes. +bool forcePrintFuncIR(); + // Return true if -filter-passes is empty or contains the pass name. bool isPassInPrintList(StringRef PassName); bool isFilterPassesEmpty(); diff --git llvm/include/llvm/MC/MCAsmBackend.h llvm/include/llvm/MC/MCAsmBackend.h index b105a294d875..505bd1f59dd4 100644 --- llvm/include/llvm/MC/MCAsmBackend.h +++ llvm/include/llvm/MC/MCAsmBackend.h @@ -96,6 +96,7 @@ public: virtual bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target, + const uint64_t Value, const MCSubtargetInfo *STI) { return false; } diff --git llvm/include/llvm/MC/MCRegister.h llvm/include/llvm/MC/MCRegister.h index 2d21e0acca35..53005bb03c2e 100644 --- llvm/include/llvm/MC/MCRegister.h +++ llvm/include/llvm/MC/MCRegister.h @@ -68,6 +68,10 @@ public: return FirstPhysicalReg <= Reg && Reg < FirstStackSlot; } + /// Return true if the specified register number is in the physical register + /// namespace. + constexpr bool isPhysical() const { return isPhysicalRegister(Reg); } + constexpr operator unsigned() const { return Reg; } /// Check the provided unsigned value is a valid MCRegister.
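// A sketch of how the new `m_c_XorLike` matcher above might be used (an
// illustrative aside, not part of this patch; `V` is assumed to be a
// `Value *` under inspection). The `sub nuw` form qualifies because for a
// low-bit mask C, `sub nuw C, X` implies X's set bits are a subset of C's,
// so the subtraction cannot borrow and `C - X == C ^ X`:
//
//   Value *X;
//   const APInt *C;
//   if (match(V, m_c_XorLike(m_Value(X), m_APInt(C))))
//     ; // V computes X ^ C, whether spelled as `xor` or as `sub nuw C, X`.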
diff --git llvm/include/llvm/MC/MCRegisterInfo.h llvm/include/llvm/MC/MCRegisterInfo.h index 164ef1ef44bb..1579ecb035c2 100644 --- llvm/include/llvm/MC/MCRegisterInfo.h +++ llvm/include/llvm/MC/MCRegisterInfo.h @@ -530,7 +530,7 @@ public: MCSubRegIterator(MCRegister Reg, const MCRegisterInfo *MCRI, bool IncludeSelf = false) { - assert(MCRegister::isPhysicalRegister(Reg.id())); + assert(Reg.isPhysical()); I.init(Reg.id(), MCRI->DiffLists + MCRI->get(Reg).SubRegs); // Initially, the iterator points to Reg itself. Val = MCPhysReg(*I); @@ -600,7 +600,7 @@ public: MCSuperRegIterator(MCRegister Reg, const MCRegisterInfo *MCRI, bool IncludeSelf = false) { - assert(MCRegister::isPhysicalRegister(Reg.id())); + assert(Reg.isPhysical()); I.init(Reg.id(), MCRI->DiffLists + MCRI->get(Reg).SuperRegs); // Initially, the iterator points to Reg itself. Val = MCPhysReg(*I); @@ -646,8 +646,7 @@ public: MCRegUnitIterator() = default; MCRegUnitIterator(MCRegister Reg, const MCRegisterInfo *MCRI) { - assert(Reg && "Null register has no regunits"); - assert(MCRegister::isPhysicalRegister(Reg.id())); + assert(Reg.isPhysical()); // Decode the RegUnits MCRegisterDesc field. unsigned RU = MCRI->get(Reg).RegUnits; unsigned FirstRU = RU & ((1u << RegUnitBits) - 1); diff --git llvm/include/llvm/SandboxIR/Instruction.h llvm/include/llvm/SandboxIR/Instruction.h index 34a7feb63bec..49ea6707ecd8 100644 --- llvm/include/llvm/SandboxIR/Instruction.h +++ llvm/include/llvm/SandboxIR/Instruction.h @@ -2478,13 +2478,12 @@ protected: public: using Predicate = llvm::CmpInst::Predicate; - static CmpInst *create(Predicate Pred, Value *S1, Value *S2, - InsertPosition Pos, Context &Ctx, - const Twine &Name = ""); - static CmpInst *createWithCopiedFlags(Predicate Pred, Value *S1, Value *S2, - const Instruction *FlagsSource, - InsertPosition Pos, Context &Ctx, - const Twine &Name = ""); + static Value *create(Predicate Pred, Value *S1, Value *S2, InsertPosition Pos, + Context &Ctx, const Twine &Name = ""); + static Value *createWithCopiedFlags(Predicate Pred, Value *S1, Value *S2, + const Instruction *FlagsSource, + InsertPosition Pos, Context &Ctx, + const Twine &Name = ""); void setPredicate(Predicate P); void swapOperands(); diff --git llvm/include/llvm/SandboxIR/Type.h llvm/include/llvm/SandboxIR/Type.h index 3218b991b31a..c7a8943632ba 100644 --- llvm/include/llvm/SandboxIR/Type.h +++ llvm/include/llvm/SandboxIR/Type.h @@ -291,6 +291,8 @@ public: class PointerType : public Type { public: // TODO: add missing functions + + // TODO: Remove non-opaque variant of sandboxir::PointerType::get static PointerType *get(Type *ElementType, unsigned AddressSpace); static PointerType *get(Context &Ctx, unsigned AddressSpace); diff --git llvm/include/llvm/Target/GlobalISel/Combine.td llvm/include/llvm/Target/GlobalISel/Combine.td index 8641eabbdd84..3590ab221ad4 100644 --- llvm/include/llvm/Target/GlobalISel/Combine.td +++ llvm/include/llvm/Target/GlobalISel/Combine.td @@ -306,11 +306,23 @@ def ptr_add_immed_chain : GICombineRule< [{ return Helper.matchPtrAddImmedChain(*${d}, ${matchinfo}); }]), (apply [{ Helper.applyPtrAddImmedChain(*${d}, ${matchinfo}); }])>; +def shift_const_op : GICombinePatFrag< + (outs root:$dst), (ins), + !foreach(op, + [G_SHL, G_ASHR, G_LSHR], + (pattern (op $dst, $shifted, $amt)))>; +def shift_result_matchdata : GIDefMatchData<"std::optional<int64_t>">; def shifts_too_big : GICombineRule< - (defs root:$root), - (match (wip_match_opcode G_SHL, G_ASHR, G_LSHR):$root, - [{ return Helper.matchShiftsTooBig(*${root}); }]), - 
(apply [{ Helper.replaceInstWithUndef(*${root}); }])>; + (defs root:$root, shift_result_matchdata:$matchinfo), + (match (shift_const_op $root):$mi, + [{ return Helper.matchShiftsTooBig(*${mi}, ${matchinfo}); }]), + (apply [{ + if (${matchinfo}) { + Helper.replaceInstWithConstant(*${mi}, *${matchinfo}); + } else { + Helper.replaceInstWithUndef(*${mi}); + } + }])>; // Fold shift (shift base x), y -> shift base, (x+y), if shifts are same def shift_immed_matchdata : GIDefMatchData<"RegisterImmPair">; diff --git llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/InstrMaps.h llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/InstrMaps.h index 586de53f3a72..c931319d3b00 100644 --- llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/InstrMaps.h +++ llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/InstrMaps.h @@ -13,9 +13,12 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/SandboxIR/Context.h" +#include "llvm/SandboxIR/Instruction.h" #include "llvm/SandboxIR/Value.h" #include "llvm/Support/Casting.h" #include "llvm/Support/raw_ostream.h" +#include <algorithm> namespace llvm::sandboxir { @@ -30,8 +33,37 @@ class InstrMaps { /// with the same lane, as they may be coming from vectorizing different /// original values. DenseMap<Value *, DenseMap<Value *, unsigned>> VectorToOrigLaneMap; + Context &Ctx; + std::optional<Context::CallbackID> EraseInstrCB; + +private: + void notifyEraseInstr(Value *V) { + // We don't know if V is an original or a vector value. + auto It = OrigToVectorMap.find(V); + if (It != OrigToVectorMap.end()) { + // V is an original value. + // Remove it from VectorToOrigLaneMap. + Value *Vec = It->second; + VectorToOrigLaneMap[Vec].erase(V); + // Now erase V from OrigToVectorMap. + OrigToVectorMap.erase(It); + } else { + // V is a vector value. + // Go over the original values it came from and remove them from + // OrigToVectorMap. + for (auto [Orig, Lane] : VectorToOrigLaneMap[V]) + OrigToVectorMap.erase(Orig); + // Now erase V from VectorToOrigLaneMap. + VectorToOrigLaneMap.erase(V); + } + } public: + InstrMaps(Context &Ctx) : Ctx(Ctx) { + EraseInstrCB = Ctx.registerEraseInstrCallback( + [this](Instruction *I) { notifyEraseInstr(I); }); + } + ~InstrMaps() { Ctx.unregisterEraseInstrCallback(*EraseInstrCB); } /// \Returns the vector value that we got from vectorizing \p Orig, or /// nullptr if not found. Value *getVectorForOrig(Value *Orig) const { diff --git llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h index c03e7a10397a..4858ebaf0770 100644 --- llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h +++ llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h @@ -25,10 +25,62 @@ class LegalityAnalysis; class Value; class InstrMaps; +class ShuffleMask { +public: + using IndicesVecT = SmallVector<int, 8>; + +private: + IndicesVecT Indices; + +public: + ShuffleMask(SmallVectorImpl<int> &&Indices) : Indices(std::move(Indices)) {} + ShuffleMask(std::initializer_list<int> Indices) : Indices(Indices) {} + explicit ShuffleMask(ArrayRef<int> Indices) : Indices(Indices) {} + operator ArrayRef<int>() const { return Indices; } + /// Creates and returns an identity shuffle mask of size \p Sz. + /// For example if Sz == 4 the returned mask is {0, 1, 2, 3}. 
+ static ShuffleMask getIdentity(unsigned Sz) { + IndicesVecT Indices; + Indices.reserve(Sz); + for (auto Idx : seq<int>(0, (int)Sz)) + Indices.push_back(Idx); + return ShuffleMask(std::move(Indices)); + } + /// \Returns true if the mask is a perfect identity mask with consecutive + /// indices, i.e., performs no lane shuffling, like 0,1,2,3... + bool isIdentity() const { + for (auto [Idx, Elm] : enumerate(Indices)) { + if ((int)Idx != Elm) + return false; + } + return true; + } + bool operator==(const ShuffleMask &Other) const { + return Indices == Other.Indices; + } + bool operator!=(const ShuffleMask &Other) const { return !(*this == Other); } + size_t size() const { return Indices.size(); } + int operator[](int Idx) const { return Indices[Idx]; } + using const_iterator = IndicesVecT::const_iterator; + const_iterator begin() const { return Indices.begin(); } + const_iterator end() const { return Indices.end(); } +#ifndef NDEBUG + friend raw_ostream &operator<<(raw_ostream &OS, const ShuffleMask &Mask) { + Mask.print(OS); + return OS; + } + void print(raw_ostream &OS) const { + interleave(Indices, OS, [&OS](auto Elm) { OS << Elm; }, ","); + } + LLVM_DUMP_METHOD void dump() const; +#endif +}; + enum class LegalityResultID { - Pack, ///> Collect scalar values. - Widen, ///> Vectorize by combining scalars to a vector. - DiamondReuse, ///> Don't generate new code, reuse existing vector. + Pack, ///> Collect scalar values. + Widen, ///> Vectorize by combining scalars to a vector. + DiamondReuse, ///> Don't generate new code, reuse existing vector. + DiamondReuseWithShuffle, ///> Reuse the existing vector but add a shuffle. }; /// The reason for vectorizing or not vectorizing. @@ -54,6 +106,8 @@ struct ToStr { return "Widen"; case LegalityResultID::DiamondReuse: return "DiamondReuse"; + case LegalityResultID::DiamondReuseWithShuffle: + return "DiamondReuseWithShuffle"; } llvm_unreachable("Unknown LegalityResultID enum"); } @@ -154,6 +208,22 @@ public: Value *getVector() const { return Vec; } }; +class DiamondReuseWithShuffle final : public LegalityResult { + friend class LegalityAnalysis; + Value *Vec; + ShuffleMask Mask; + DiamondReuseWithShuffle(Value *Vec, const ShuffleMask &Mask) + : LegalityResult(LegalityResultID::DiamondReuseWithShuffle), Vec(Vec), + Mask(Mask) {} + +public: + static bool classof(const LegalityResult *From) { + return From->getSubclassID() == LegalityResultID::DiamondReuseWithShuffle; + } + Value *getVector() const { return Vec; } + const ShuffleMask &getMask() const { return Mask; } +}; + class Pack final : public LegalityResultWithReason { Pack(ResultReason Reason) : LegalityResultWithReason(LegalityResultID::Pack, Reason) {} @@ -192,23 +262,22 @@ public: CollectDescr(SmallVectorImpl<ExtractElementDescr> &&Descrs) : Descrs(std::move(Descrs)) {} /// If all elements come from a single vector input, then return that vector - /// and whether we need a shuffle to get them in order. - std::optional<std::pair<Value *, bool>> getSingleInput() const { + /// and also the shuffle mask required to get them in order. 
+ std::optional<std::pair<Value *, ShuffleMask>> getSingleInput() const { const auto &Descr0 = *Descrs.begin(); Value *V0 = Descr0.getValue(); if (!Descr0.needsExtract()) return std::nullopt; - bool NeedsShuffle = Descr0.getExtractIdx() != 0; - int Lane = 1; + ShuffleMask::IndicesVecT MaskIndices; + MaskIndices.push_back(Descr0.getExtractIdx()); for (const auto &Descr : drop_begin(Descrs)) { if (!Descr.needsExtract()) return std::nullopt; if (Descr.getValue() != V0) return std::nullopt; - if (Descr.getExtractIdx() != Lane++) - NeedsShuffle = true; + MaskIndices.push_back(Descr.getExtractIdx()); } - return std::make_pair(V0, NeedsShuffle); + return std::make_pair(V0, ShuffleMask(std::move(MaskIndices))); } bool hasVectorInputs() const { return any_of(Descrs, [](const auto &D) { return D.needsExtract(); }); diff --git llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h index 69cea3c4c7b5..b463b8acf4c8 100644 --- llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h +++ llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h @@ -26,9 +26,10 @@ namespace llvm::sandboxir { class BottomUpVec final : public FunctionPass { bool Change = false; std::unique_ptr<LegalityAnalysis> Legality; + /// The original instructions that are potentially dead after vectorization. DenseSet<Instruction *> DeadInstrCandidates; /// Maps scalars to vectors. - InstrMaps IMaps; + std::unique_ptr<InstrMaps> IMaps; /// Creates and returns a vector instruction that replaces the instructions in /// \p Bndl. \p Operands are the already vectorized operands. @@ -36,8 +37,14 @@ class BottomUpVec final : public FunctionPass { /// Erases all dead instructions from the dead instruction candidates /// collected during vectorization. void tryEraseDeadInstrs(); + /// Creates a shuffle instruction that shuffles \p VecOp according to \p Mask. + Value *createShuffle(Value *VecOp, const ShuffleMask &Mask); /// Packs all elements of \p ToPack into a vector and returns that vector. Value *createPack(ArrayRef<Value *> ToPack); + /// After we create vectors for groups of instructions, the original + /// instructions are potentially dead and may need to be removed. This + /// function helps collect these instructions (along with the pointer operands + /// for loads/stores) so that they can be cleaned up later. void collectPotentiallyDeadInstrs(ArrayRef<Value *> Bndl); /// Recursively try to vectorize \p Bndl and its operands. Value *vectorizeRec(ArrayRef<Value *> Bndl, unsigned Depth); diff --git llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/VecUtils.h llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/VecUtils.h index 28fa33656dd5..6cbbb396ea82 100644 --- llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/VecUtils.h +++ llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/VecUtils.h @@ -134,15 +134,13 @@ public: return ScalarTy; } /// \Returns the first integer power of 2 that is <= Num. - static unsigned getFloorPowerOf2(unsigned Num) { - if (Num == 0) - return Num; - unsigned Mask = Num; - Mask >>= 1; - for (unsigned ShiftBy = 1; ShiftBy < sizeof(Num) * 8; ShiftBy <<= 1) - Mask |= Mask >> ShiftBy; - return Num & ~Mask; - } + static unsigned getFloorPowerOf2(unsigned Num); + +#ifndef NDEBUG + /// Helper dump function for debugging. 
+ LLVM_DUMP_METHOD static void dump(ArrayRef<Value *> Bndl); + LLVM_DUMP_METHOD static void dump(ArrayRef<Instruction *> Bndl); +#endif // NDEBUG }; } // namespace llvm::sandboxir diff --git llvm/lib/Analysis/ConstantFolding.cpp llvm/lib/Analysis/ConstantFolding.cpp index 3e87ea0e90fd..80c1277e6316 100644 --- llvm/lib/Analysis/ConstantFolding.cpp +++ llvm/lib/Analysis/ConstantFolding.cpp @@ -3940,6 +3940,9 @@ bool llvm::isMathLibCallNoop(const CallBase *Call, case LibFunc_log10f: return Op.isNaN() || (!Op.isZero() && !Op.isNegative()); + case LibFunc_ilogb: + return !Op.isNaN() && !Op.isZero() && !Op.isInfinity(); + case LibFunc_expl: case LibFunc_exp: case LibFunc_expf: diff --git llvm/lib/Analysis/InlineCost.cpp llvm/lib/Analysis/InlineCost.cpp index 85287a39f2ca..8fa150f7d690 100644 --- llvm/lib/Analysis/InlineCost.cpp +++ llvm/lib/Analysis/InlineCost.cpp @@ -2698,8 +2698,10 @@ void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) { auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) { // A CFG edge is dead if the predecessor is dead or the predecessor has a // known successor which is not the one under exam. - return (DeadBlocks.count(Pred) || - (KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ)); + if (DeadBlocks.count(Pred)) + return true; + BasicBlock *KnownSucc = KnownSuccessors[Pred]; + return KnownSucc && KnownSucc != Succ; }; auto IsNewlyDead = [&](BasicBlock *BB) { diff --git llvm/lib/Analysis/LoopInfo.cpp llvm/lib/Analysis/LoopInfo.cpp index 6bb5f001e9bd..7bd5e1e0cfac 100644 --- llvm/lib/Analysis/LoopInfo.cpp +++ llvm/lib/Analysis/LoopInfo.cpp @@ -999,6 +999,18 @@ void llvm::printLoop(Loop &L, raw_ostream &OS, const std::string &Banner) { return; } + if (forcePrintFuncIR()) { + // handling -print-loop-func-scope. + // -print-module-scope overrides this. + OS << Banner << " (loop: "; + L.getHeader()->printAsOperand(OS, false); + OS << ")\n"; + + // printing whole function. + OS << *L.getHeader()->getParent(); + return; + } + OS << Banner; auto *PreHeader = L.getLoopPreheader(); diff --git llvm/lib/AsmParser/LLParser.cpp llvm/lib/AsmParser/LLParser.cpp index be6166f0c416..fa0079bac435 100644 --- llvm/lib/AsmParser/LLParser.cpp +++ llvm/lib/AsmParser/LLParser.cpp @@ -2975,7 +2975,7 @@ bool LLParser::parseType(Type *&Result, const Twine &Msg, bool AllowVoid) { return tokError("pointers to void are invalid - use i8* instead"); if (!PointerType::isValidElementType(Result)) return tokError("pointer to this type is invalid"); - Result = PointerType::getUnqual(Result); + Result = PointerType::getUnqual(Context); Lex.Lex(); break; @@ -2992,7 +2992,7 @@ bool LLParser::parseType(Type *&Result, const Twine &Msg, bool AllowVoid) { parseToken(lltok::star, "expected '*' in address space")) return true; - Result = PointerType::get(Result, AddrSpace); + Result = PointerType::get(Context, AddrSpace); break; } @@ -6515,7 +6515,7 @@ bool LLParser::parseFunctionHeader(Function *&Fn, bool IsDefine, return error(RetTypeLoc, "functions with 'sret' argument must return void"); FunctionType *FT = FunctionType::get(RetType, ParamTypeList, IsVarArg); - PointerType *PFT = PointerType::get(FT, AddrSpace); + PointerType *PFT = PointerType::get(Context, AddrSpace); Fn = nullptr; GlobalValue *FwdFn = nullptr; @@ -7410,7 +7410,7 @@ bool LLParser::parseInvoke(Instruction *&Inst, PerFunctionState &PFS) { // Look up the callee. 
Value *Callee; - if (convertValIDToValue(PointerType::get(Ty, InvokeAddrSpace), CalleeID, + if (convertValIDToValue(PointerType::get(Context, InvokeAddrSpace), CalleeID, Callee, &PFS)) return true; @@ -7724,7 +7724,8 @@ bool LLParser::parseCallBr(Instruction *&Inst, PerFunctionState &PFS) { // Look up the callee. Value *Callee; - if (convertValIDToValue(PointerType::getUnqual(Ty), CalleeID, Callee, &PFS)) + if (convertValIDToValue(PointerType::getUnqual(Context), CalleeID, Callee, + &PFS)) return true; // Set up the Attribute for the function. @@ -8115,8 +8116,8 @@ bool LLParser::parseCall(Instruction *&Inst, PerFunctionState &PFS, // Look up the callee. Value *Callee; - if (convertValIDToValue(PointerType::get(Ty, CallAddrSpace), CalleeID, Callee, - &PFS)) + if (convertValIDToValue(PointerType::get(Context, CallAddrSpace), CalleeID, + Callee, &PFS)) return true; // Set up the Attribute for the function. diff --git llvm/lib/Bitcode/Reader/BitcodeReader.cpp llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 56f5ff4b20e5..551dfd4af88b 100644 --- llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -2598,7 +2598,7 @@ Error BitcodeReader::parseTypeTableBody() { !PointerType::isValidElementType(ResultTy)) return error("Invalid type"); ContainedIDs.push_back(Record[0]); - ResultTy = PointerType::get(ResultTy, AddressSpace); + ResultTy = PointerType::get(ResultTy->getContext(), AddressSpace); break; } case bitc::TYPE_CODE_OPAQUE_POINTER: { // OPAQUE_POINTER: [addrspace] diff --git llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp index e40248197c7c..2f123c22b330 100644 --- llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp +++ llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp @@ -420,7 +420,7 @@ void AggressiveAntiDepBreaker::PrescanInstruction( if (TRI->isSuperRegister(Reg, *AI) && State->IsLive(*AI)) continue; - DefIndices[*AI] = Count; + DefIndices[(*AI).id()] = Count; } } } diff --git llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp index 05bceb87403b..bda0e266d01d 100644 --- llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp +++ llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp @@ -123,6 +123,8 @@ static CPUType mapArchToCVCPUType(Triple::ArchType Type) { return CPUType::ARMNT; case Triple::ArchType::aarch64: return CPUType::ARM64; + case Triple::ArchType::mipsel: + return CPUType::MIPS; default: report_fatal_error("target architecture doesn't map to a CodeView CPUType"); } diff --git llvm/lib/CodeGen/BranchFolding.cpp llvm/lib/CodeGen/BranchFolding.cpp index bc1a65064a8c..65476fa05a20 100644 --- llvm/lib/CodeGen/BranchFolding.cpp +++ llvm/lib/CodeGen/BranchFolding.cpp @@ -381,7 +381,7 @@ void BranchFolder::replaceTailWithBranchTo(MachineBasicBlock::iterator OldInst, // full registers: assert(P.LaneMask == LaneBitmask::getAll() && "Can only handle full register."); - MCPhysReg Reg = P.PhysReg; + MCRegister Reg = P.PhysReg; if (!LiveRegs.available(*MRI, Reg)) continue; DebugLoc DL; diff --git llvm/lib/CodeGen/CallingConvLower.cpp llvm/lib/CodeGen/CallingConvLower.cpp index b7152587a9fa..cebc9f5c4639 100644 --- llvm/lib/CodeGen/CallingConvLower.cpp +++ llvm/lib/CodeGen/CallingConvLower.cpp @@ -61,12 +61,12 @@ void CCState::HandleByVal(unsigned ValNo, MVT ValVT, MVT LocVT, /// Mark a register and all of its aliases as allocated. 
void CCState::MarkAllocated(MCPhysReg Reg) { for (MCRegAliasIterator AI(Reg, &TRI, true); AI.isValid(); ++AI) - UsedRegs[*AI / 32] |= 1 << (*AI & 31); + UsedRegs[(*AI).id() / 32] |= 1 << ((*AI).id() & 31); } void CCState::MarkUnallocated(MCPhysReg Reg) { for (MCRegAliasIterator AI(Reg, &TRI, true); AI.isValid(); ++AI) - UsedRegs[*AI / 32] &= ~(1 << (*AI & 31)); + UsedRegs[(*AI).id() / 32] &= ~(1 << ((*AI).id() & 31)); } bool CCState::IsShadowAllocatedReg(MCRegister Reg) const { diff --git llvm/lib/CodeGen/CodeGen.cpp llvm/lib/CodeGen/CodeGen.cpp index 8efe54077091..925d9af7d0e0 100644 --- llvm/lib/CodeGen/CodeGen.cpp +++ llvm/lib/CodeGen/CodeGen.cpp @@ -82,6 +82,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeMachineCycleInfoWrapperPassPass(Registry); initializeMachineDominatorTreeWrapperPassPass(Registry); initializeMachineFunctionPrinterPassPass(Registry); + initializeMachineFunctionSplitterPass(Registry); initializeMachineLateInstrsCleanupPass(Registry); initializeMachineLICMPass(Registry); initializeMachineLoopInfoWrapperPassPass(Registry); diff --git llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp index 106db7c51f27..e4bf77b6563a 100644 --- llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp +++ llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp @@ -67,7 +67,7 @@ void CriticalAntiDepBreaker::StartBlock(MachineBasicBlock *BB) { for (const MachineBasicBlock *Succ : BB->successors()) for (const auto &LI : Succ->liveins()) { for (MCRegAliasIterator AI(LI.PhysReg, TRI, true); AI.isValid(); ++AI) { - unsigned Reg = *AI; + unsigned Reg = (*AI).id(); Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1); KillIndices[Reg] = BBSize; DefIndices[Reg] = ~0u; @@ -85,7 +85,7 @@ void CriticalAntiDepBreaker::StartBlock(MachineBasicBlock *BB) { if (!IsReturnBlock && !Pristine.test(Reg)) continue; for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI) { - unsigned Reg = *AI; + unsigned Reg = (*AI).id(); Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1); KillIndices[Reg] = BBSize; DefIndices[Reg] = ~0u; @@ -200,7 +200,7 @@ void CriticalAntiDepBreaker::PrescanInstruction(MachineInstr &MI) { // If an alias of the reg is used during the live range, give up. // Note that this allows us to skip checking if AntiDepReg // overlaps with any of the aliases, among other things. - unsigned AliasReg = *AI; + unsigned AliasReg = (*AI).id(); if (Classes[AliasReg]) { Classes[AliasReg] = reinterpret_cast<TargetRegisterClass *>(-1); Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1); @@ -327,7 +327,7 @@ void CriticalAntiDepBreaker::ScanInstruction(MachineInstr &MI, unsigned Count) { // It wasn't previously live but now it is, this is a kill. // Repeat for all aliases. 
for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) { - unsigned AliasReg = *AI; + unsigned AliasReg = (*AI).id(); if (KillIndices[AliasReg] == ~0u) { KillIndices[AliasReg] = Count; DefIndices[AliasReg] = ~0u; diff --git llvm/lib/CodeGen/EarlyIfConversion.cpp llvm/lib/CodeGen/EarlyIfConversion.cpp index b95516f616e0..caec0524e7ab 100644 --- llvm/lib/CodeGen/EarlyIfConversion.cpp +++ llvm/lib/CodeGen/EarlyIfConversion.cpp @@ -895,7 +895,7 @@ bool EarlyIfConverter::shouldConvertIf() { if (!MO.isReg() || !MO.isUse()) return false; Register Reg = MO.getReg(); - if (Register::isPhysicalRegister(Reg)) + if (Reg.isPhysical()) return false; MachineInstr *Def = MRI->getVRegDef(Reg); @@ -906,7 +906,7 @@ bool EarlyIfConverter::shouldConvertIf() { if (!MO.isReg() || !MO.isUse()) return false; Register Reg = MO.getReg(); - if (Register::isPhysicalRegister(Reg)) + if (Reg.isPhysical()) return false; MachineInstr *Def = MRI->getVRegDef(Reg); diff --git llvm/lib/CodeGen/ExecutionDomainFix.cpp llvm/lib/CodeGen/ExecutionDomainFix.cpp index 21a7d02a320c..8bb5ac5a6de7 100644 --- llvm/lib/CodeGen/ExecutionDomainFix.cpp +++ llvm/lib/CodeGen/ExecutionDomainFix.cpp @@ -445,7 +445,7 @@ bool ExecutionDomainFix::runOnMachineFunction(MachineFunction &mf) { for (unsigned i = 0, e = RC->getNumRegs(); i != e; ++i) for (MCRegAliasIterator AI(RC->getRegister(i), TRI, true); AI.isValid(); ++AI) - AliasMap[*AI].push_back(i); + AliasMap[(*AI).id()].push_back(i); } // Initialize the MBBOutRegsInfos diff --git llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp index 0ebe845e473f..edb85d212a4d 100644 --- llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp +++ llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp @@ -190,7 +190,8 @@ public: // Does basic block MBB contains reload of Reg from FI? bool hasReload(Register Reg, int FI, const MachineBasicBlock *MBB) { RegSlotPair RSP(Reg, FI); - return Reloads.count(MBB) && Reloads[MBB].count(RSP); + auto It = Reloads.find(MBB); + return It != Reloads.end() && It->second.count(RSP); } }; @@ -242,9 +243,10 @@ public: It.second.Index = 0; ReservedSlots.clear(); - if (EHPad && GlobalIndices.count(EHPad)) - for (auto &RSP : GlobalIndices[EHPad]) - ReservedSlots.insert(RSP.second); + if (EHPad) + if (auto It = GlobalIndices.find(EHPad); It != GlobalIndices.end()) + for (auto &RSP : It->second) + ReservedSlots.insert(RSP.second); } // Get frame index to spill the register. 
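The hasReload and GlobalIndices rewrites just above are the same micro-refactor applied twice: `Map.count(K) && Map[K].count(V)` looks K up once for count() and again for operator[], while a single find() does one lookup and reuses the iterator. The idiom in standalone form (std::map standing in for llvm::DenseMap; all names here are illustrative):

#include <map>
#include <set>

std::map<const void *, std::set<int>> Reloads;

// Before: two lookups of MBB (count, then operator[]).
bool hasReloadOld(const void *MBB, int RSP) {
  return Reloads.count(MBB) && Reloads[MBB].count(RSP);
}

// After: one lookup, mirroring the refactored hasReload above.
bool hasReloadNew(const void *MBB, int RSP) {
  auto It = Reloads.find(MBB);
  return It != Reloads.end() && It->second.count(RSP);
}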
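A few hunks back, VecUtils.h turns getFloorPowerOf2 from an inline definition into an out-of-line declaration; the new definition lives in a source file outside this chunk. Since callers are easier to read with the algorithm in mind, here is the removed inline body as a self-contained program, with the edge case spelled out (the test harness around it is illustrative):

#include <cassert>

// Smear every bit below the top set bit into Mask, then clear those
// bits from Num, leaving the largest power of two <= Num. Num == 0
// returns 0, which is not itself a power of two.
static unsigned getFloorPowerOf2(unsigned Num) {
  if (Num == 0)
    return Num;
  unsigned Mask = Num;
  Mask >>= 1;
  for (unsigned ShiftBy = 1; ShiftBy < sizeof(Num) * 8; ShiftBy <<= 1)
    Mask |= Mask >> ShiftBy;
  return Num & ~Mask;
}

int main() {
  assert(getFloorPowerOf2(0) == 0);
  assert(getFloorPowerOf2(1) == 1);
  assert(getFloorPowerOf2(5) == 4);
  assert(getFloorPowerOf2(64) == 64);
  assert(getFloorPowerOf2(1000) == 512);
  return 0;
}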
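The ConstantFolding.cpp hunk in this stretch lets isMathLibCallNoop treat ilogb as removable when its result is unused. The three excluded inputs are exactly the ones where the C standard permits the call to do more than compute a value: ilogb(0), ilogb(NaN) and ilogb(inf) return the sentinels FP_ILOGB0, FP_ILOGBNAN and INT_MAX and may raise FE_INVALID, so only ordinary nonzero finite arguments are side-effect free. For illustration:

#include <cmath>
#include <cstdio>

int main() {
  // Pure value computations; safe to drop when the result is unused.
  std::printf("%d\n", std::ilogb(8.0));      // 3
  std::printf("%d\n", std::ilogb(0.75));     // -1, since 0.75 = 1.5 * 2^-1
  // Domain-error cases; may raise FE_INVALID, hence not removable.
  std::printf("%d\n", std::ilogb(0.0));      // FP_ILOGB0
  std::printf("%d\n", std::ilogb(NAN));      // FP_ILOGBNAN
  std::printf("%d\n", std::ilogb(INFINITY)); // INT_MAX per the C standard
  return 0;
}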
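Also in this stretch, the LLParser.cpp and BitcodeReader.cpp hunks replace PointerType::get(ElementTy, AS) with PointerType::get(Context, AS). With opaque pointers the pointee type no longer participates in a pointer type's identity, so a context plus an address space fully determines the type, and the element-typed overload is on its way out (compare the TODO added to SandboxIR/Type.h earlier in this chunk). A short sketch of the equivalence, assuming the usual LLVM headers and current opaque-pointer behavior:

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>

int main() {
  llvm::LLVMContext Ctx;
  // Pointer types are uniqued per (context, address space), so both
  // calls below return the same object.
  llvm::PointerType *P0 = llvm::PointerType::get(Ctx, /*AddressSpace=*/0);
  llvm::PointerType *P1 = llvm::PointerType::getUnqual(Ctx);
  assert(P0 == P1);
  return 0;
}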
diff --git llvm/lib/CodeGen/GlobalISel/CallLowering.cpp llvm/lib/CodeGen/GlobalISel/CallLowering.cpp index d17b20d977ce..437dc4f42bae 100644 --- llvm/lib/CodeGen/GlobalISel/CallLowering.cpp +++ llvm/lib/CodeGen/GlobalISel/CallLowering.cpp @@ -1054,7 +1054,7 @@ void CallLowering::insertSRetIncomingArgument( DemoteReg = MRI.createGenericVirtualRegister( LLT::pointer(AS, DL.getPointerSizeInBits(AS))); - Type *PtrTy = PointerType::get(F.getReturnType(), AS); + Type *PtrTy = PointerType::get(F.getContext(), AS); SmallVector<EVT, 1> ValueVTs; ComputeValueVTs(*TLI, DL, PtrTy, ValueVTs); @@ -1081,7 +1081,7 @@ void CallLowering::insertSRetOutgoingArgument(MachineIRBuilder &MIRBuilder, DL.getTypeAllocSize(RetTy), DL.getPrefTypeAlign(RetTy), false); Register DemoteReg = MIRBuilder.buildFrameIndex(FramePtrTy, FI).getReg(0); - ArgInfo DemoteArg(DemoteReg, PointerType::get(RetTy, AS), + ArgInfo DemoteArg(DemoteReg, PointerType::get(RetTy->getContext(), AS), ArgInfo::NoArgIndex); setArgFlags(DemoteArg, AttributeList::ReturnIndex, DL, CB); DemoteArg.Flags[0].setSRet(); diff --git llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 4e3aaf5da719..b193d8bb0aa1 100644 --- llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -6590,12 +6590,57 @@ bool CombinerHelper::matchRedundantBinOpInEquality(MachineInstr &MI, return CmpInst::isEquality(Pred) && Y.isValid(); } -bool CombinerHelper::matchShiftsTooBig(MachineInstr &MI) const { +/// Return the minimum useless shift amount that results in complete loss of the +/// source value. Return std::nullopt when it cannot determine a value. +static std::optional<unsigned> +getMinUselessShift(KnownBits ValueKB, unsigned Opcode, + std::optional<int64_t> &Result) { + assert(Opcode == TargetOpcode::G_SHL || Opcode == TargetOpcode::G_LSHR || + Opcode == TargetOpcode::G_ASHR && "Expect G_SHL, G_LSHR or G_ASHR."); + auto SignificantBits = 0; + switch (Opcode) { + case TargetOpcode::G_SHL: + SignificantBits = ValueKB.countMinTrailingZeros(); + Result = 0; + break; + case TargetOpcode::G_LSHR: + Result = 0; + SignificantBits = ValueKB.countMinLeadingZeros(); + break; + case TargetOpcode::G_ASHR: + if (ValueKB.isNonNegative()) { + SignificantBits = ValueKB.countMinLeadingZeros(); + Result = 0; + } else if (ValueKB.isNegative()) { + SignificantBits = ValueKB.countMinLeadingOnes(); + Result = -1; + } else { + // Cannot determine shift result. 
+ Result = std::nullopt; + } + break; + default: + break; + } + return ValueKB.getBitWidth() - SignificantBits; +} + +bool CombinerHelper::matchShiftsTooBig( + MachineInstr &MI, std::optional<int64_t> &MatchInfo) const { + Register ShiftVal = MI.getOperand(1).getReg(); Register ShiftReg = MI.getOperand(2).getReg(); LLT ResTy = MRI.getType(MI.getOperand(0).getReg()); auto IsShiftTooBig = [&](const Constant *C) { auto *CI = dyn_cast<ConstantInt>(C); - return CI && CI->uge(ResTy.getScalarSizeInBits()); + if (!CI) + return false; + if (CI->uge(ResTy.getScalarSizeInBits())) { + MatchInfo = std::nullopt; + return true; + } + auto OptMaxUsefulShift = getMinUselessShift(KB->getKnownBits(ShiftVal), + MI.getOpcode(), MatchInfo); + return OptMaxUsefulShift && CI->uge(*OptMaxUsefulShift); }; return matchUnaryPredicate(MRI, ShiftReg, IsShiftTooBig); } diff --git llvm/lib/CodeGen/InterferenceCache.cpp llvm/lib/CodeGen/InterferenceCache.cpp index 73cde07cfd51..ebdf0506bb22 100644 --- llvm/lib/CodeGen/InterferenceCache.cpp +++ llvm/lib/CodeGen/InterferenceCache.cpp @@ -78,7 +78,7 @@ InterferenceCache::Entry *InterferenceCache::get(MCRegister PhysReg) { continue; } Entries[E].reset(PhysReg, LIUArray, TRI, MF); - PhysRegEntries[PhysReg] = E; + PhysRegEntries[PhysReg.id()] = E; return &Entries[E]; } llvm_unreachable("Ran out of interference cache entries."); diff --git llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp index ade67bb545d1..012bc37dd767 100644 --- llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp +++ llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp @@ -283,7 +283,7 @@ public: if (Reg >= MTracker->NumRegs) return false; for (MCRegAliasIterator RAI(Reg, &TRI, true); RAI.isValid(); ++RAI) - if (CalleeSavedRegs.test(*RAI)) + if (CalleeSavedRegs.test((*RAI).id())) return true; return false; }; @@ -1345,7 +1345,7 @@ bool InstrRefBasedLDV::isCalleeSaved(LocIdx L) const { } bool InstrRefBasedLDV::isCalleeSavedReg(Register R) const { for (MCRegAliasIterator RAI(R, TRI, true); RAI.isValid(); ++RAI) - if (CalleeSavedRegs.test(*RAI)) + if (CalleeSavedRegs.test((*RAI).id())) return true; return false; } @@ -1880,7 +1880,7 @@ void InstrRefBasedLDV::transferRegisterDef(MachineInstr &MI) { // Remove ranges of all aliased registers. for (MCRegAliasIterator RAI(MO.getReg(), TRI, true); RAI.isValid(); ++RAI) // FIXME: Can we break out of this loop early if no insertion occurs? - DeadRegs.insert(*RAI); + DeadRegs.insert((*RAI).id()); } else if (MO.isRegMask()) { RegMasks.push_back(MO.getRegMask()); RegMaskPtrs.push_back(&MO); diff --git llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp index a5e6bebcd29c..94e7f1b734fd 100644 --- llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp +++ llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp @@ -1606,7 +1606,7 @@ void VarLocBasedLDV::transferRegisterDef(MachineInstr &MI, // Remove ranges of all aliased registers. for (MCRegAliasIterator RAI(MO.getReg(), TRI, true); RAI.isValid(); ++RAI) // FIXME: Can we break out of this loop early if no insertion occurs? 
- DeadRegs.insert(*RAI); + DeadRegs.insert((*RAI).id()); RegSetInstrs.erase(MO.getReg()); RegSetInstrs.insert({MO.getReg(), &MI}); } else if (MO.isRegMask()) { @@ -1866,7 +1866,7 @@ void VarLocBasedLDV::transferRegisterCopy(MachineInstr &MI, auto isCalleeSavedReg = [&](Register Reg) { for (MCRegAliasIterator RAI(Reg, TRI, true); RAI.isValid(); ++RAI) - if (CalleeSavedRegs.test(*RAI)) + if (CalleeSavedRegs.test((*RAI).id())) return true; return false; }; diff --git llvm/lib/CodeGen/LivePhysRegs.cpp llvm/lib/CodeGen/LivePhysRegs.cpp index 96380d408482..2ba17e46be5a 100644 --- llvm/lib/CodeGen/LivePhysRegs.cpp +++ llvm/lib/CodeGen/LivePhysRegs.cpp @@ -154,7 +154,7 @@ bool LivePhysRegs::available(const MachineRegisterInfo &MRI, /// Add live-in registers of basic block \p MBB to \p LiveRegs. void LivePhysRegs::addBlockLiveIns(const MachineBasicBlock &MBB) { for (const auto &LI : MBB.liveins()) { - MCPhysReg Reg = LI.PhysReg; + MCRegister Reg = LI.PhysReg; LaneBitmask Mask = LI.LaneMask; MCSubRegIndexIterator S(Reg, TRI); assert(Mask.any() && "Invalid livein mask"); diff --git llvm/lib/CodeGen/LiveRegMatrix.cpp llvm/lib/CodeGen/LiveRegMatrix.cpp index 3367171a1566..cfda262aac82 100644 --- llvm/lib/CodeGen/LiveRegMatrix.cpp +++ llvm/lib/CodeGen/LiveRegMatrix.cpp @@ -165,7 +165,8 @@ bool LiveRegMatrix::checkRegMaskInterference(const LiveInterval &VirtReg, // The BitVector is indexed by PhysReg, not register unit. // Regmask interference is more fine grained than regunits. // For example, a Win64 call can clobber %ymm8 yet preserve %xmm8. - return !RegMaskUsable.empty() && (!PhysReg || !RegMaskUsable.test(PhysReg)); + return !RegMaskUsable.empty() && + (!PhysReg || !RegMaskUsable.test(PhysReg.id())); } bool LiveRegMatrix::checkRegUnitInterference(const LiveInterval &VirtReg, @@ -183,7 +184,7 @@ bool LiveRegMatrix::checkRegUnitInterference(const LiveInterval &VirtReg, } LiveIntervalUnion::Query &LiveRegMatrix::query(const LiveRange &LR, - MCRegister RegUnit) { + MCRegUnit RegUnit) { LiveIntervalUnion::Query &Q = Queries[RegUnit]; Q.init(UserTag, LR, Matrix[RegUnit]); return Q; @@ -205,7 +206,7 @@ LiveRegMatrix::checkInterference(const LiveInterval &VirtReg, // Check the matrix for virtual register interference. bool Interference = foreachUnit(TRI, VirtReg, PhysReg, - [&](MCRegister Unit, const LiveRange &LR) { + [&](MCRegUnit Unit, const LiveRange &LR) { return query(LR, Unit).checkInterference(); }); if (Interference) diff --git llvm/lib/CodeGen/LiveVariables.cpp llvm/lib/CodeGen/LiveVariables.cpp index 55428ab7832d..00dae84b5840 100644 --- llvm/lib/CodeGen/LiveVariables.cpp +++ llvm/lib/CodeGen/LiveVariables.cpp @@ -576,7 +576,7 @@ void LiveVariables::runOnBlock(MachineBasicBlock *MBB, unsigned NumRegs) { // Mark live-in registers as live-in. 
SmallVector<Register, 4> Defs; for (const auto &LI : MBB->liveins()) { - assert(Register::isPhysicalRegister(LI.PhysReg) && + assert(LI.PhysReg.isPhysical() && "Cannot have a live-in virtual register!"); HandlePhysRegDef(LI.PhysReg, nullptr, Defs); } diff --git llvm/lib/CodeGen/MIRParser/MIParser.cpp llvm/lib/CodeGen/MIRParser/MIParser.cpp index f77c4613ad80..19c73374c370 100644 --- llvm/lib/CodeGen/MIRParser/MIParser.cpp +++ llvm/lib/CodeGen/MIRParser/MIParser.cpp @@ -1316,9 +1316,10 @@ bool MIParser::parseMachineMetadata() { assert(PFS.MachineMetadataNodes[ID] == MD && "Tracking VH didn't work"); } else { - if (PFS.MachineMetadataNodes.count(ID)) + auto [It, Inserted] = PFS.MachineMetadataNodes.try_emplace(ID); + if (!Inserted) return error("Metadata id is already used"); - PFS.MachineMetadataNodes[ID].reset(MD); + It->second.reset(MD); } return false; diff --git llvm/lib/CodeGen/MachineBasicBlock.cpp llvm/lib/CodeGen/MachineBasicBlock.cpp index 5ac6472a01e9..9bc8989cbfa1 100644 --- llvm/lib/CodeGen/MachineBasicBlock.cpp +++ llvm/lib/CodeGen/MachineBasicBlock.cpp @@ -644,7 +644,7 @@ void MachineBasicBlock::sortUniqueLiveIns() { Register MachineBasicBlock::addLiveIn(MCRegister PhysReg, const TargetRegisterClass *RC) { assert(getParent() && "MBB must be inserted in function"); - assert(Register::isPhysicalRegister(PhysReg) && "Expected physreg"); + assert(PhysReg.isPhysical() && "Expected physreg"); assert(RC && "Register class is required"); assert((isEHPad() || this == &getParent()->front()) && "Only the entry block and landing pads can have physreg live ins"); diff --git llvm/lib/CodeGen/MachineCopyPropagation.cpp llvm/lib/CodeGen/MachineCopyPropagation.cpp index 0afd73d8ecdc..d44b064dcb4b 100644 --- llvm/lib/CodeGen/MachineCopyPropagation.cpp +++ llvm/lib/CodeGen/MachineCopyPropagation.cpp @@ -127,11 +127,11 @@ public: BitVector &getPreservedRegUnits(const MachineOperand &RegMaskOp, const TargetRegisterInfo &TRI) { const uint32_t *RegMask = RegMaskOp.getRegMask(); - auto Existing = RegMaskToPreservedRegUnits.find(RegMask); - if (Existing != RegMaskToPreservedRegUnits.end()) { - return Existing->second; + auto [It, Inserted] = RegMaskToPreservedRegUnits.try_emplace(RegMask); + if (!Inserted) { + return It->second; } else { - BitVector &PreservedRegUnits = RegMaskToPreservedRegUnits[RegMask]; + BitVector &PreservedRegUnits = It->second; PreservedRegUnits.resize(TRI.getNumRegUnits()); for (unsigned SafeReg = 0, E = TRI.getNumRegs(); SafeReg < E; ++SafeReg) diff --git llvm/lib/CodeGen/MachineOperand.cpp llvm/lib/CodeGen/MachineOperand.cpp index 5c9ca91e784e..d11ac614ace3 100644 --- llvm/lib/CodeGen/MachineOperand.cpp +++ llvm/lib/CodeGen/MachineOperand.cpp @@ -91,7 +91,7 @@ void MachineOperand::substVirtReg(Register Reg, unsigned SubIdx, } void MachineOperand::substPhysReg(MCRegister Reg, const TargetRegisterInfo &TRI) { - assert(Register::isPhysicalRegister(Reg)); + assert(Reg.isPhysical()); if (getSubReg()) { Reg = TRI.getSubReg(Reg, getSubReg()); // Note that getSubReg() may return 0 if the sub-register doesn't exist. 
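The MIParser.cpp and MachineCopyPropagation.cpp hunks above share one idiom: replacing a count() check followed by operator[] with DenseMap::try_emplace, which tests for the key and (possibly) inserts it in a single hash lookup, returning an iterator either way. The same pattern with the standard library (C++17; names are illustrative):

#include <map>
#include <string>

std::map<unsigned, std::string> MetadataNodes;

// Returns false on a duplicate id, as in the parser change above.
bool addNode(unsigned ID, const std::string &MD) {
  auto [It, Inserted] = MetadataNodes.try_emplace(ID);
  if (!Inserted)
    return false;   // "Metadata id is already used"
  It->second = MD;  // Fill in the slot we just inserted.
  return true;
}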
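Returning to the CombinerHelper.cpp hunk a little above: matchShiftsTooBig used to fold only shifts whose amount reached the bit width (producing undef); with getMinUselessShift it also folds in-range shifts that provably discard every interesting bit, producing a constant instead. Concretely, if known-bits analysis shows a 32-bit value has at least 24 leading zeros, any logical right shift by 8 or more yields 0, and 32 - 24 = 8 is exactly what getMinUselessShift returns; an arithmetic right shift of a known-negative value folds to -1 instead. A standalone check of the arithmetic (plain C++, assuming the usual two's complement arithmetic behavior of >> on a negative int):

#include <cassert>
#include <cstdint>

int main() {
  const unsigned BitWidth = 32;
  const unsigned MinLeadingZeros = 24;  // value known to fit in 8 bits
  const unsigned MinUselessShift = BitWidth - MinLeadingZeros;
  assert(MinUselessShift == 8);
  for (uint32_t V = 0; V <= 0xFF; ++V)
    assert((V >> MinUselessShift) == 0); // G_LSHR case: folds to 0
  int32_t Neg = -200;                    // at least 24 leading ones
  assert((Neg >> 8) == -1);              // G_ASHR case: folds to -1
  return 0;
}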
diff --git llvm/lib/CodeGen/MachinePipeliner.cpp llvm/lib/CodeGen/MachinePipeliner.cpp index acd42aa497c6..54d9c1cf08e3 100644 --- llvm/lib/CodeGen/MachinePipeliner.cpp +++ llvm/lib/CodeGen/MachinePipeliner.cpp @@ -3204,7 +3204,7 @@ bool SMSchedule::isValidSchedule(SwingSchedulerDAG *SSD) { for (auto &OE : SSD->getDDG()->getOutEdges(&SU)) { SUnit *Dst = OE.getDst(); if (OE.isAssignedRegDep() && !Dst->isBoundaryNode()) - if (Register::isPhysicalRegister(OE.getReg())) { + if (OE.getReg().isPhysical()) { if (stageScheduled(Dst) != StageDef) return false; if (InstrToCycle[Dst] <= CycleDef) diff --git llvm/lib/CodeGen/MachineRegisterInfo.cpp llvm/lib/CodeGen/MachineRegisterInfo.cpp index 394b99b85ddc..f058445cc556 100644 --- llvm/lib/CodeGen/MachineRegisterInfo.cpp +++ llvm/lib/CodeGen/MachineRegisterInfo.cpp @@ -526,7 +526,7 @@ void MachineRegisterInfo::freezeReservedRegs() { } bool MachineRegisterInfo::isConstantPhysReg(MCRegister PhysReg) const { - assert(Register::isPhysicalRegister(PhysReg)); + assert(PhysReg.isPhysical()); const TargetRegisterInfo *TRI = getTargetRegisterInfo(); if (TRI->isConstantPhysReg(PhysReg)) @@ -584,7 +584,7 @@ static bool isNoReturnDef(const MachineOperand &MO) { bool MachineRegisterInfo::isPhysRegModified(MCRegister PhysReg, bool SkipNoReturnDef) const { - if (UsedPhysRegMask.test(PhysReg)) + if (UsedPhysRegMask.test(PhysReg.id())) return true; const TargetRegisterInfo *TRI = getTargetRegisterInfo(); for (MCRegAliasIterator AI(PhysReg, TRI, true); AI.isValid(); ++AI) { @@ -599,7 +599,7 @@ bool MachineRegisterInfo::isPhysRegModified(MCRegister PhysReg, bool MachineRegisterInfo::isPhysRegUsed(MCRegister PhysReg, bool SkipRegMaskTest) const { - if (!SkipRegMaskTest && UsedPhysRegMask.test(PhysReg)) + if (!SkipRegMaskTest && UsedPhysRegMask.test(PhysReg.id())) return true; const TargetRegisterInfo *TRI = getTargetRegisterInfo(); for (MCRegAliasIterator AliasReg(PhysReg, TRI, true); AliasReg.isValid(); diff --git llvm/lib/CodeGen/MachineSink.cpp llvm/lib/CodeGen/MachineSink.cpp index 3c816f976509..03d93cecaa59 100644 --- llvm/lib/CodeGen/MachineSink.cpp +++ llvm/lib/CodeGen/MachineSink.cpp @@ -1784,11 +1784,12 @@ bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore, for (auto &MO : MI.all_defs()) { if (!MO.getReg().isVirtual()) continue; - if (!SeenDbgUsers.count(MO.getReg())) + auto It = SeenDbgUsers.find(MO.getReg()); + if (It == SeenDbgUsers.end()) continue; // Sink any users that don't pass any other DBG_VALUEs for this variable. 
- auto &Users = SeenDbgUsers[MO.getReg()]; + auto &Users = It->second; for (auto &User : Users) { MachineInstr *DbgMI = User.getPointer(); if (User.getInt()) { diff --git llvm/lib/CodeGen/MachineVerifier.cpp llvm/lib/CodeGen/MachineVerifier.cpp index 594ff5ac4c07..d41b11307e7b 100644 --- llvm/lib/CodeGen/MachineVerifier.cpp +++ llvm/lib/CodeGen/MachineVerifier.cpp @@ -894,7 +894,7 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { regsLive.clear(); if (MRI->tracksLiveness()) { for (const auto &LI : MBB->liveins()) { - if (!Register::isPhysicalRegister(LI.PhysReg)) { + if (!LI.PhysReg.isPhysical()) { report("MBB live-in list contains non-physical register", MBB); continue; } @@ -3448,7 +3448,7 @@ void MachineVerifier::visitMachineFunctionAfter() { if (MRI->tracksLiveness()) for (const auto &MBB : *MF) for (MachineBasicBlock::RegisterMaskPair P : MBB.liveins()) { - MCPhysReg LiveInReg = P.PhysReg; + MCRegister LiveInReg = P.PhysReg; bool hasAliases = MCRegAliasIterator(LiveInReg, TRI, false).isValid(); if (hasAliases || isAllocatable(LiveInReg) || isReserved(LiveInReg)) continue; diff --git llvm/lib/CodeGen/PrologEpilogInserter.cpp llvm/lib/CodeGen/PrologEpilogInserter.cpp index 34dd79c7b618..51e9a067707e 100644 --- llvm/lib/CodeGen/PrologEpilogInserter.cpp +++ llvm/lib/CodeGen/PrologEpilogInserter.cpp @@ -1240,9 +1240,9 @@ void PEI::insertZeroCallUsedRegs(MachineFunction &MF) { continue; MCRegister Reg = MO.getReg(); - if (AllocatableSet[Reg] && !MO.isImplicit() && + if (AllocatableSet[Reg.id()] && !MO.isImplicit() && (MO.isDef() || MO.isUse())) - UsedRegs.set(Reg); + UsedRegs.set(Reg.id()); } } @@ -1262,20 +1262,20 @@ void PEI::insertZeroCallUsedRegs(MachineFunction &MF) { continue; // Want only used registers. - if (OnlyUsed && !UsedRegs[Reg]) + if (OnlyUsed && !UsedRegs[Reg.id()]) continue; // Want only registers used for arguments. if (OnlyArg) { if (OnlyUsed) { - if (!LiveIns[Reg]) + if (!LiveIns[Reg.id()]) continue; } else if (!TRI.isArgumentRegister(MF, Reg)) { continue; } } - RegsToZero.set(Reg); + RegsToZero.set(Reg.id()); } // Don't clear registers that are live when leaving the function. @@ -1328,7 +1328,7 @@ void PEI::insertZeroCallUsedRegs(MachineFunction &MF) { for (const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(&MF); MCPhysReg CSReg = *CSRegs; ++CSRegs) for (MCRegister Reg : TRI.sub_and_superregs_inclusive(CSReg)) - RegsToZero.reset(Reg); + RegsToZero.reset(Reg.id()); const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering(); for (MachineBasicBlock &MBB : MF) diff --git llvm/lib/CodeGen/RDFLiveness.cpp llvm/lib/CodeGen/RDFLiveness.cpp index 11f3fedaa5f9..682d316a5bfa 100644 --- llvm/lib/CodeGen/RDFLiveness.cpp +++ llvm/lib/CodeGen/RDFLiveness.cpp @@ -895,7 +895,7 @@ void Liveness::computeLiveIns() { void Liveness::resetLiveIns() { for (auto &B : DFG.getMF()) { // Remove all live-ins. 
- std::vector<unsigned> T; + std::vector<MCRegister> T; for (const MachineBasicBlock::RegisterMaskPair &LI : B.liveins()) T.push_back(LI.PhysReg); for (auto I : T) @@ -917,7 +917,7 @@ void Liveness::resetKills(MachineBasicBlock *B) { for (auto I : B->liveins()) { MCSubRegIndexIterator S(I.PhysReg, &TRI); if (!S.isValid()) { - LV.set(I.PhysReg); + LV.set(I.PhysReg.id()); continue; } do { @@ -960,7 +960,7 @@ void Liveness::resetKills(MachineBasicBlock *B) { continue; bool IsLive = false; for (MCRegAliasIterator AR(R, &TRI, true); AR.isValid(); ++AR) { - if (!Live[*AR]) + if (!Live[(*AR).id()]) continue; IsLive = true; break; diff --git llvm/lib/CodeGen/ReachingDefAnalysis.cpp llvm/lib/CodeGen/ReachingDefAnalysis.cpp index 3ab6315f9c8e..9459904d56e4 100644 --- llvm/lib/CodeGen/ReachingDefAnalysis.cpp +++ llvm/lib/CodeGen/ReachingDefAnalysis.cpp @@ -30,7 +30,7 @@ static bool isValidRegUse(const MachineOperand &MO) { return isValidReg(MO) && MO.isUse(); } -static bool isValidRegUseOf(const MachineOperand &MO, MCRegister Reg, +static bool isValidRegUseOf(const MachineOperand &MO, Register Reg, const TargetRegisterInfo *TRI) { if (!isValidRegUse(MO)) return false; @@ -41,7 +41,7 @@ static bool isValidRegDef(const MachineOperand &MO) { return isValidReg(MO) && MO.isDef(); } -static bool isValidRegDefOf(const MachineOperand &MO, MCRegister Reg, +static bool isValidRegDefOf(const MachineOperand &MO, Register Reg, const TargetRegisterInfo *TRI) { if (!isValidRegDef(MO)) return false; @@ -260,8 +260,7 @@ void ReachingDefAnalysis::traverse() { #endif } -int ReachingDefAnalysis::getReachingDef(MachineInstr *MI, - MCRegister Reg) const { +int ReachingDefAnalysis::getReachingDef(MachineInstr *MI, Register Reg) const { assert(InstIds.count(MI) && "Unexpected machine instuction."); int InstId = InstIds.lookup(MI); int DefRes = ReachingDefDefaultVal; @@ -281,14 +280,14 @@ int ReachingDefAnalysis::getReachingDef(MachineInstr *MI, } MachineInstr *ReachingDefAnalysis::getReachingLocalMIDef(MachineInstr *MI, - MCRegister Reg) const { + Register Reg) const { return hasLocalDefBefore(MI, Reg) ? 
getInstFromId(MI->getParent(), getReachingDef(MI, Reg)) : nullptr; } bool ReachingDefAnalysis::hasSameReachingDef(MachineInstr *A, MachineInstr *B, - MCRegister Reg) const { + Register Reg) const { MachineBasicBlock *ParentA = A->getParent(); MachineBasicBlock *ParentB = B->getParent(); if (ParentA != ParentB) @@ -317,18 +316,17 @@ MachineInstr *ReachingDefAnalysis::getInstFromId(MachineBasicBlock *MBB, return nullptr; } -int ReachingDefAnalysis::getClearance(MachineInstr *MI, MCRegister Reg) const { +int ReachingDefAnalysis::getClearance(MachineInstr *MI, Register Reg) const { assert(InstIds.count(MI) && "Unexpected machine instuction."); return InstIds.lookup(MI) - getReachingDef(MI, Reg); } bool ReachingDefAnalysis::hasLocalDefBefore(MachineInstr *MI, - MCRegister Reg) const { + Register Reg) const { return getReachingDef(MI, Reg) >= 0; } -void ReachingDefAnalysis::getReachingLocalUses(MachineInstr *Def, - MCRegister Reg, +void ReachingDefAnalysis::getReachingLocalUses(MachineInstr *Def, Register Reg, InstSet &Uses) const { MachineBasicBlock *MBB = Def->getParent(); MachineBasicBlock::iterator MI = MachineBasicBlock::iterator(Def); @@ -352,7 +350,7 @@ void ReachingDefAnalysis::getReachingLocalUses(MachineInstr *Def, } } -bool ReachingDefAnalysis::getLiveInUses(MachineBasicBlock *MBB, MCRegister Reg, +bool ReachingDefAnalysis::getLiveInUses(MachineBasicBlock *MBB, Register Reg, InstSet &Uses) const { for (MachineInstr &MI : instructionsWithoutDebug(MBB->instr_begin(), MBB->instr_end())) { @@ -370,7 +368,7 @@ bool ReachingDefAnalysis::getLiveInUses(MachineBasicBlock *MBB, MCRegister Reg, return isReachingDefLiveOut(&*Last, Reg); } -void ReachingDefAnalysis::getGlobalUses(MachineInstr *MI, MCRegister Reg, +void ReachingDefAnalysis::getGlobalUses(MachineInstr *MI, Register Reg, InstSet &Uses) const { MachineBasicBlock *MBB = MI->getParent(); @@ -395,8 +393,7 @@ void ReachingDefAnalysis::getGlobalUses(MachineInstr *MI, MCRegister Reg, } } -void ReachingDefAnalysis::getGlobalReachingDefs(MachineInstr *MI, - MCRegister Reg, +void ReachingDefAnalysis::getGlobalReachingDefs(MachineInstr *MI, Register Reg, InstSet &Defs) const { if (auto *Def = getUniqueReachingMIDef(MI, Reg)) { Defs.insert(Def); @@ -407,13 +404,13 @@ void ReachingDefAnalysis::getGlobalReachingDefs(MachineInstr *MI, getLiveOuts(MBB, Reg, Defs); } -void ReachingDefAnalysis::getLiveOuts(MachineBasicBlock *MBB, MCRegister Reg, +void ReachingDefAnalysis::getLiveOuts(MachineBasicBlock *MBB, Register Reg, InstSet &Defs) const { SmallPtrSet<MachineBasicBlock*, 2> VisitedBBs; getLiveOuts(MBB, Reg, Defs, VisitedBBs); } -void ReachingDefAnalysis::getLiveOuts(MachineBasicBlock *MBB, MCRegister Reg, +void ReachingDefAnalysis::getLiveOuts(MachineBasicBlock *MBB, Register Reg, InstSet &Defs, BlockSet &VisitedBBs) const { if (VisitedBBs.count(MBB)) @@ -432,9 +429,8 @@ void ReachingDefAnalysis::getLiveOuts(MachineBasicBlock *MBB, MCRegister Reg, getLiveOuts(Pred, Reg, Defs, VisitedBBs); } -MachineInstr * -ReachingDefAnalysis::getUniqueReachingMIDef(MachineInstr *MI, - MCRegister Reg) const { +MachineInstr *ReachingDefAnalysis::getUniqueReachingMIDef(MachineInstr *MI, + Register Reg) const { // If there's a local def before MI, return it. 
MachineInstr *LocalDef = getReachingLocalMIDef(MI, Reg); if (LocalDef && InstIds.lookup(LocalDef) < InstIds.lookup(MI)) @@ -465,8 +461,7 @@ MachineInstr *ReachingDefAnalysis::getMIOperand(MachineInstr *MI, return getUniqueReachingMIDef(MI, MO.getReg()); } -bool ReachingDefAnalysis::isRegUsedAfter(MachineInstr *MI, - MCRegister Reg) const { +bool ReachingDefAnalysis::isRegUsedAfter(MachineInstr *MI, Register Reg) const { MachineBasicBlock *MBB = MI->getParent(); LiveRegUnits LiveRegs(*TRI); LiveRegs.addLiveOuts(*MBB); @@ -487,7 +482,7 @@ bool ReachingDefAnalysis::isRegUsedAfter(MachineInstr *MI, } bool ReachingDefAnalysis::isRegDefinedAfter(MachineInstr *MI, - MCRegister Reg) const { + Register Reg) const { MachineBasicBlock *MBB = MI->getParent(); auto Last = MBB->getLastNonDebugInstr(); if (Last != MBB->end() && @@ -501,7 +496,7 @@ bool ReachingDefAnalysis::isRegDefinedAfter(MachineInstr *MI, } bool ReachingDefAnalysis::isReachingDefLiveOut(MachineInstr *MI, - MCRegister Reg) const { + Register Reg) const { MachineBasicBlock *MBB = MI->getParent(); LiveRegUnits LiveRegs(*TRI); LiveRegs.addLiveOuts(*MBB); @@ -522,7 +517,7 @@ bool ReachingDefAnalysis::isReachingDefLiveOut(MachineInstr *MI, } MachineInstr *ReachingDefAnalysis::getLocalLiveOutMIDef(MachineBasicBlock *MBB, - MCRegister Reg) const { + Register Reg) const { LiveRegUnits LiveRegs(*TRI); LiveRegs.addLiveOuts(*MBB); if (LiveRegs.available(Reg)) @@ -646,7 +641,7 @@ ReachingDefAnalysis::isSafeToRemove(MachineInstr *MI, InstSet &Visited, void ReachingDefAnalysis::collectKilledOperands(MachineInstr *MI, InstSet &Dead) const { Dead.insert(MI); - auto IsDead = [this, &Dead](MachineInstr *Def, MCRegister Reg) { + auto IsDead = [this, &Dead](MachineInstr *Def, Register Reg) { if (mayHaveSideEffects(*Def)) return false; @@ -676,12 +671,12 @@ void ReachingDefAnalysis::collectKilledOperands(MachineInstr *MI, } bool ReachingDefAnalysis::isSafeToDefRegAt(MachineInstr *MI, - MCRegister Reg) const { + Register Reg) const { SmallPtrSet<MachineInstr*, 1> Ignore; return isSafeToDefRegAt(MI, Reg, Ignore); } -bool ReachingDefAnalysis::isSafeToDefRegAt(MachineInstr *MI, MCRegister Reg, +bool ReachingDefAnalysis::isSafeToDefRegAt(MachineInstr *MI, Register Reg, InstSet &Ignore) const { // Check for any uses of the register after MI. if (isRegUsedAfter(MI, Reg)) { diff --git llvm/lib/CodeGen/RegAllocFast.cpp llvm/lib/CodeGen/RegAllocFast.cpp index 3863ca80bb44..e2309b65cf9a 100644 --- llvm/lib/CodeGen/RegAllocFast.cpp +++ llvm/lib/CodeGen/RegAllocFast.cpp @@ -276,7 +276,7 @@ private: // Assign index for each instruction to quickly determine dominance. InstrPosIndexes PosIndexes; - void setPhysRegState(MCPhysReg PhysReg, unsigned NewState); + void setPhysRegState(MCRegister PhysReg, unsigned NewState); bool isPhysRegFree(MCPhysReg PhysReg) const; /// Mark a physreg as used in this instruction. @@ -449,7 +449,7 @@ bool RegAllocFastImpl::shouldAllocateRegister(const Register Reg) const { return ShouldAllocateRegisterImpl(*TRI, *MRI, Reg); } -void RegAllocFastImpl::setPhysRegState(MCPhysReg PhysReg, unsigned NewState) { +void RegAllocFastImpl::setPhysRegState(MCRegister PhysReg, unsigned NewState) { for (MCRegUnit Unit : TRI->regunits(PhysReg)) RegUnitStates[Unit] = NewState; } @@ -671,7 +671,7 @@ void RegAllocFastImpl::reloadAtBegin(MachineBasicBlock &MBB) { return; for (MachineBasicBlock::RegisterMaskPair P : MBB.liveins()) { - MCPhysReg Reg = P.PhysReg; + MCRegister Reg = P.PhysReg; // Set state to live-in. 
This possibly overrides mappings to virtual // registers but we don't care anymore at this point. setPhysRegState(Reg, regLiveIn); @@ -688,7 +688,7 @@ void RegAllocFastImpl::reloadAtBegin(MachineBasicBlock &MBB) { if (PhysReg == 0 || LR.Error) continue; - MCRegister FirstUnit = *TRI->regunits(PhysReg).begin(); + MCRegUnit FirstUnit = *TRI->regunits(PhysReg).begin(); if (RegUnitStates[FirstUnit] == regLiveIn) continue; @@ -758,7 +758,7 @@ bool RegAllocFastImpl::displacePhysReg(MachineInstr &MI, MCPhysReg PhysReg) { void RegAllocFastImpl::freePhysReg(MCPhysReg PhysReg) { LLVM_DEBUG(dbgs() << "Freeing " << printReg(PhysReg, TRI) << ':'); - MCRegister FirstUnit = *TRI->regunits(PhysReg).begin(); + MCRegUnit FirstUnit = *TRI->regunits(PhysReg).begin(); switch (unsigned VirtReg = RegUnitStates[FirstUnit]) { case regFree: LLVM_DEBUG(dbgs() << '\n'); diff --git llvm/lib/CodeGen/RegAllocGreedy.cpp llvm/lib/CodeGen/RegAllocGreedy.cpp index 66e9cf546b83..6077cfd514de 100644 --- llvm/lib/CodeGen/RegAllocGreedy.cpp +++ llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -439,7 +439,7 @@ MCRegister RAGreedy::tryAssign(const LiveInterval &VirtReg, } // Try to evict interference from a cheaper alternative. - uint8_t Cost = RegCosts[PhysReg]; + uint8_t Cost = RegCosts[PhysReg.id()]; // Most registers have 0 additional cost. if (!Cost) @@ -559,7 +559,7 @@ RegAllocEvictionAdvisor::getOrderLimit(const LiveInterval &VirtReg, bool RegAllocEvictionAdvisor::canAllocatePhysReg(unsigned CostPerUseLimit, MCRegister PhysReg) const { - if (RegCosts[PhysReg] >= CostPerUseLimit) + if (RegCosts[PhysReg.id()] >= CostPerUseLimit) return false; // The first use of a callee-saved register in a function has cost 1. // Don't start using a CSR when the CostPerUseLimit is low. diff --git llvm/lib/CodeGen/RegUsageInfoCollector.cpp llvm/lib/CodeGen/RegUsageInfoCollector.cpp index f539966fe617..07b4bc7ffd3e 100644 --- llvm/lib/CodeGen/RegUsageInfoCollector.cpp +++ llvm/lib/CodeGen/RegUsageInfoCollector.cpp @@ -162,12 +162,12 @@ bool RegUsageInfoCollector::run(MachineFunction &MF) { computeCalleeSavedRegs(SavedRegs, MF); const BitVector &UsedPhysRegsMask = MRI->getUsedPhysRegsMask(); - auto SetRegAsDefined = [&RegMask] (unsigned Reg) { - RegMask[Reg / 32] &= ~(1u << Reg % 32); + auto SetRegAsDefined = [&RegMask](MCRegister Reg) { + RegMask[Reg.id() / 32] &= ~(1u << Reg.id() % 32); }; // Don't include $noreg in any regmasks. - SetRegAsDefined(MCRegister::NoRegister); + SetRegAsDefined(MCRegister()); // Some targets can clobber registers "inside" a call, typically in // linker-generated code. @@ -186,7 +186,7 @@ bool RegUsageInfoCollector::run(MachineFunction &MF) { // with all it's unsaved aliases. 
if (!MRI->def_empty(PReg)) { for (MCRegAliasIterator AI(PReg, TRI, true); AI.isValid(); ++AI) - if (!SavedRegs.test(*AI)) + if (!SavedRegs.test((*AI).id())) SetRegAsDefined(*AI); continue; } diff --git llvm/lib/CodeGen/RegisterClassInfo.cpp llvm/lib/CodeGen/RegisterClassInfo.cpp index 9312bc03bc52..40fc35a16335 100644 --- llvm/lib/CodeGen/RegisterClassInfo.cpp +++ llvm/lib/CodeGen/RegisterClassInfo.cpp @@ -95,7 +95,8 @@ void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf) { BitVector CSRHintsForAllocOrder(TRI->getNumRegs()); for (const MCPhysReg *I = CSR; *I; ++I) for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI) - CSRHintsForAllocOrder[*AI] = STI.ignoreCSRForAllocationOrder(mf, *AI); + CSRHintsForAllocOrder[(*AI).id()] = + STI.ignoreCSRForAllocationOrder(mf, *AI); if (IgnoreCSRForAllocOrder != CSRHintsForAllocOrder) { Update = true; IgnoreCSRForAllocOrder = CSRHintsForAllocOrder; diff --git llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 49e5b7d9ef01..33f0c8b5555e 100644 --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -22807,15 +22807,15 @@ static SDValue scalarizeExtractedBinOp(SDNode *ExtElt, SelectionDAG &DAG, Vec->getNumValues() != 1) return SDValue(); + // Targets may want to avoid this to prevent an expensive register transfer. + if (!TLI.shouldScalarizeBinop(Vec)) + return SDValue(); + EVT ResVT = ExtElt->getValueType(0); if (Opc == ISD::SETCC && (ResVT != Vec.getValueType().getVectorElementType() || LegalTypes)) return SDValue(); - // Targets may want to avoid this to prevent an expensive register transfer. - if (!TLI.shouldScalarizeBinop(Vec)) - return SDValue(); - // Extracting an element of a vector constant is constant-folded, so this // transform is just replacing a vector op with a scalar op while moving the // extract. @@ -22834,8 +22834,21 @@ static SDValue scalarizeExtractedBinOp(SDNode *ExtElt, SelectionDAG &DAG, EVT OpVT = Op0.getValueType().getVectorElementType(); Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT, Op0, Index); Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT, Op1, Index); - return DAG.getSetCC(DL, ResVT, Op0, Op1, - cast<CondCodeSDNode>(Vec->getOperand(2))->get()); + SDValue NewVal = DAG.getSetCC( + DL, ResVT, Op0, Op1, cast<CondCodeSDNode>(Vec->getOperand(2))->get()); + // We may need to sign- or zero-extend the result to match the same + // behaviour as the vector version of SETCC. 
+ unsigned VecBoolContents = TLI.getBooleanContents(Vec.getValueType()); + if (ResVT != MVT::i1 && + VecBoolContents != TargetLowering::UndefinedBooleanContent && + VecBoolContents != TLI.getBooleanContents(ResVT)) { + if (VecBoolContents == TargetLowering::ZeroOrNegativeOneBooleanContent) + NewVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ResVT, NewVal, + DAG.getValueType(MVT::i1)); + else + NewVal = DAG.getZeroExtendInReg(NewVal, DL, MVT::i1); + } + return NewVal; } Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Op0, Index); Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Op1, Index); diff --git llvm/lib/CodeGen/SelectionDAG/FastISel.cpp llvm/lib/CodeGen/SelectionDAG/FastISel.cpp index ec5b058da297..5a314570c776 100644 --- llvm/lib/CodeGen/SelectionDAG/FastISel.cpp +++ llvm/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -1001,7 +1001,7 @@ bool FastISel::lowerCallTo(CallLoweringInfo &CLI) { GetReturnInfo(CLI.CallConv, CLI.RetTy, getReturnAttrs(CLI), Outs, TLI, DL); bool CanLowerReturn = TLI.CanLowerReturn( - CLI.CallConv, *FuncInfo.MF, CLI.IsVarArg, Outs, CLI.RetTy->getContext()); + CLI.CallConv, *FuncInfo.MF, CLI.IsVarArg, Outs, CLI.RetTy->getContext(), CLI.RetTy); // FIXME: sret demotion isn't supported yet - bail out. if (!CanLowerReturn) diff --git llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp index 1de336429fe1..3e89b18585f1 100644 --- llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp +++ llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp @@ -99,7 +99,7 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf, GetReturnInfo(CC, Fn->getReturnType(), Fn->getAttributes(), Outs, *TLI, mf.getDataLayout()); CanLowerReturn = - TLI->CanLowerReturn(CC, *MF, Fn->isVarArg(), Outs, Fn->getContext()); + TLI->CanLowerReturn(CC, *MF, Fn->isVarArg(), Outs, Fn->getContext(), Fn->getReturnType()); // If this personality uses funclets, we need to do a bit more work. DenseMap<const AllocaInst *, TinyPtrVector<int *>> CatchObjects; diff --git llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index be7521f34168..b0a624680231 100644 --- llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -155,6 +155,10 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { case ISD::ZERO_EXTEND_VECTOR_INREG: Res = PromoteIntRes_EXTEND_VECTOR_INREG(N); break; + case ISD::VECTOR_FIND_LAST_ACTIVE: + Res = PromoteIntRes_VECTOR_FIND_LAST_ACTIVE(N); + break; + case ISD::SIGN_EXTEND: case ISD::VP_SIGN_EXTEND: case ISD::ZERO_EXTEND: @@ -2069,6 +2073,9 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM: Res = PromoteIntOp_VECTOR_HISTOGRAM(N, OpNo); break; + case ISD::VECTOR_FIND_LAST_ACTIVE: + Res = PromoteIntOp_VECTOR_FIND_LAST_ACTIVE(N, OpNo); + break; } // If the result is null, the sub-method took care of registering results etc. 
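The tail of the scalarizeExtractedBinOp change above guards a subtlety of scalarizing a vector SETCC: boolean encodings may differ between vector and scalar results. Many targets report ZeroOrNegativeOneBooleanContent for vector compares (a true lane is all ones) but ZeroOrOneBooleanContent for scalars, so the combiner re-extends the low bit to keep the extracted value bit-identical to the original lane. The arithmetic being preserved, in miniature:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t Bit = 1;  // scalar setcc result: 0 or 1
  // ZeroOrNegativeOneBooleanContent: sign-extend the i1 in register,
  // so 0 stays 0 and 1 becomes 0xFFFFFFFF, like an all-ones lane.
  int32_t SExt = -(int32_t)Bit;
  assert((uint32_t)SExt == 0xFFFFFFFFu);
  // ZeroOrOneBooleanContent: zero-extend instead, leaving 0 or 1.
  uint32_t ZExt = Bit & 1u;
  assert(ZExt == 1u);
  return 0;
}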
@@ -2810,6 +2817,13 @@ SDValue DAGTypeLegalizer::PromoteIntOp_VECTOR_HISTOGRAM(SDNode *N, return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); } +SDValue DAGTypeLegalizer::PromoteIntOp_VECTOR_FIND_LAST_ACTIVE(SDNode *N, + unsigned OpNo) { + SmallVector<SDValue, 1> NewOps(N->ops()); + NewOps[OpNo] = GetPromotedInteger(N->getOperand(OpNo)); + return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); +} + //===----------------------------------------------------------------------===// // Integer Result Expansion //===----------------------------------------------------------------------===// @@ -5084,13 +5098,9 @@ void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N, if (LC == RTLIB::UNKNOWN_LIBCALL || !TLI.getLibcallName(LC) || TLI.getLibcallName(LC) == DAG.getMachineFunction().getName()) { // FIXME: This is not an optimal expansion, but better than crashing. - EVT WideVT = - EVT::getIntegerVT(*DAG.getContext(), VT.getScalarSizeInBits() * 2); - SDValue LHS = DAG.getNode(ISD::SIGN_EXTEND, dl, WideVT, N->getOperand(0)); - SDValue RHS = DAG.getNode(ISD::SIGN_EXTEND, dl, WideVT, N->getOperand(1)); - SDValue Mul = DAG.getNode(ISD::MUL, dl, WideVT, LHS, RHS); SDValue MulLo, MulHi; - SplitInteger(Mul, MulLo, MulHi); + TLI.forceExpandWideMUL(DAG, dl, /*Signed=*/true, N->getOperand(0), + N->getOperand(1), MulLo, MulHi); SDValue SRA = DAG.getNode(ISD::SRA, dl, VT, MulLo, DAG.getConstant(VT.getScalarSizeInBits() - 1, dl, VT)); @@ -6124,6 +6134,12 @@ SDValue DAGTypeLegalizer::PromoteIntRes_EXTEND_VECTOR_INREG(SDNode *N) { return DAG.getNode(N->getOpcode(), dl, NVT, N->getOperand(0)); } +SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_FIND_LAST_ACTIVE(SDNode *N) { + EVT VT = N->getValueType(0); + EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); + return DAG.getNode(ISD::VECTOR_FIND_LAST_ACTIVE, SDLoc(N), NVT, N->ops()); +} + SDValue DAGTypeLegalizer::PromoteIntRes_INSERT_VECTOR_ELT(SDNode *N) { EVT OutVT = N->getValueType(0); EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT); diff --git llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index caaa40a64c7e..f13f70e66cfa 100644 --- llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -378,6 +378,7 @@ private: SDValue PromoteIntRes_VPFunnelShift(SDNode *N); SDValue PromoteIntRes_IS_FPCLASS(SDNode *N); SDValue PromoteIntRes_PATCHPOINT(SDNode *N); + SDValue PromoteIntRes_VECTOR_FIND_LAST_ACTIVE(SDNode *N); // Integer Operand Promotion. 
bool PromoteIntegerOperand(SDNode *N, unsigned OpNo); @@ -428,6 +429,7 @@ private: SDValue PromoteIntOp_VP_STRIDED(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_VP_SPLICE(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_VECTOR_HISTOGRAM(SDNode *N, unsigned OpNo); + SDValue PromoteIntOp_VECTOR_FIND_LAST_ACTIVE(SDNode *N, unsigned OpNo); void SExtOrZExtPromotedOperands(SDValue &LHS, SDValue &RHS); void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code); diff --git llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index a6d1b1cb7b10..6ad08bce44b0 100644 --- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -503,6 +503,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::VECREDUCE_FMIN: case ISD::VECREDUCE_FMAXIMUM: case ISD::VECREDUCE_FMINIMUM: + case ISD::VECTOR_FIND_LAST_ACTIVE: Action = TLI.getOperationAction(Node->getOpcode(), Node->getOperand(0).getValueType()); break; @@ -1225,6 +1226,9 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) { case ISD::VECTOR_COMPRESS: Results.push_back(TLI.expandVECTOR_COMPRESS(Node, DAG)); return; + case ISD::VECTOR_FIND_LAST_ACTIVE: + Results.push_back(TLI.expandVectorFindLastActive(Node, DAG)); + return; case ISD::SCMP: case ISD::UCMP: Results.push_back(TLI.expandCMP(Node, DAG)); diff --git llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 0dfd0302ae54..743ae4895a1b 100644 --- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -954,6 +954,12 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) { ID.AddInteger(M); break; } + case ISD::ADDRSPACECAST: { + const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(N); + ID.AddInteger(ASC->getSrcAddressSpace()); + ID.AddInteger(ASC->getDestAddressSpace()); + break; + } case ISD::TargetBlockAddress: case ISD::BlockAddress: { const BlockAddressSDNode *BA = cast<BlockAddressSDNode>(N); diff --git llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index f8d7c3ef7bbe..9f1aadcb279a 100644 --- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -6427,42 +6427,25 @@ void SelectionDAGBuilder::visitVectorExtractLastActive(const CallInst &I, assert(Intrinsic == Intrinsic::experimental_vector_extract_last_active && "Tried lowering invalid vector extract last"); SDLoc sdl = getCurSDLoc(); + const DataLayout &Layout = DAG.getDataLayout(); SDValue Data = getValue(I.getOperand(0)); SDValue Mask = getValue(I.getOperand(1)); - SDValue PassThru = getValue(I.getOperand(2)); - EVT DataVT = Data.getValueType(); - EVT ScalarVT = PassThru.getValueType(); - EVT BoolVT = Mask.getValueType().getScalarType(); - - // Find a suitable type for a stepvector. - ConstantRange VScaleRange(1, /*isFullSet=*/true); // Dummy value. - if (DataVT.isScalableVector()) - VScaleRange = getVScaleRange(I.getCaller(), 64); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - unsigned EltWidth = TLI.getBitWidthForCttzElements( - I.getType(), DataVT.getVectorElementCount(), /*ZeroIsPoison=*/true, - &VScaleRange); - MVT StepVT = MVT::getIntegerVT(EltWidth); - EVT StepVecVT = DataVT.changeVectorElementType(StepVT); - - // Zero out lanes with inactive elements, then find the highest remaining - // value from the stepvector. 
- SDValue Zeroes = DAG.getConstant(0, sdl, StepVecVT); - SDValue StepVec = DAG.getStepVector(sdl, StepVecVT); - SDValue ActiveElts = DAG.getSelect(sdl, StepVecVT, Mask, StepVec, Zeroes); - SDValue HighestIdx = - DAG.getNode(ISD::VECREDUCE_UMAX, sdl, StepVT, ActiveElts); - - // Extract the corresponding lane from the data vector - EVT ExtVT = TLI.getVectorIdxTy(DAG.getDataLayout()); - SDValue Idx = DAG.getZExtOrTrunc(HighestIdx, sdl, ExtVT); - SDValue Extract = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, sdl, ScalarVT, Data, Idx); - - // If all mask lanes were inactive, choose the passthru value instead. - SDValue AnyActive = DAG.getNode(ISD::VECREDUCE_OR, sdl, BoolVT, Mask); - SDValue Result = DAG.getSelect(sdl, ScalarVT, AnyActive, Extract, PassThru); + EVT ResVT = TLI.getValueType(Layout, I.getType()); + + EVT ExtVT = TLI.getVectorIdxTy(Layout); + SDValue Idx = DAG.getNode(ISD::VECTOR_FIND_LAST_ACTIVE, sdl, ExtVT, Mask); + SDValue Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, sdl, ResVT, Data, Idx); + + Value *Default = I.getOperand(2); + if (!isa<PoisonValue>(Default) && !isa<UndefValue>(Default)) { + SDValue PassThru = getValue(Default); + EVT BoolVT = Mask.getValueType().getScalarType(); + SDValue AnyActive = DAG.getNode(ISD::VECREDUCE_OR, sdl, BoolVT, Mask); + Result = DAG.getSelect(sdl, ResVT, AnyActive, Result, PassThru); + } + setValue(&I, Result); } @@ -11008,7 +10991,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { bool CanLowerReturn = this->CanLowerReturn(CLI.CallConv, CLI.DAG.getMachineFunction(), - CLI.IsVarArg, Outs, CLI.RetTy->getContext()); + CLI.IsVarArg, Outs, CLI.RetTy->getContext(), CLI.RetTy); SDValue DemoteStackSlot; int DemoteStackIdx = -100; @@ -11021,8 +11004,8 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { MachineFunction &MF = CLI.DAG.getMachineFunction(); DemoteStackIdx = MF.getFrameInfo().CreateStackObject(TySize, Alignment, false); - Type *StackSlotPtrType = PointerType::get(CLI.RetTy, - DL.getAllocaAddrSpace()); + Type *StackSlotPtrType = + PointerType::get(CLI.RetTy->getContext(), DL.getAllocaAddrSpace()); DemoteStackSlot = CLI.DAG.getFrameIndex(DemoteStackIdx, getFrameIndexTy(DL)); ArgListEntry Entry; @@ -12769,7 +12752,7 @@ void SelectionDAGBuilder::visitCallBrLandingPad(const CallInst &I) { // the OpInfo.ConstraintVT is legal on the target or not. for (Register &Reg : OpInfo.AssignedRegs.Regs) { Register OriginalDef = FollowCopyChain(MRI, InitialDef++); - if (Register::isPhysicalRegister(OriginalDef)) + if (OriginalDef.isPhysical()) FuncInfo.MBB->addLiveIn(OriginalDef); // Update the assigned registers to use the original defs. Reg = OriginalDef; diff --git llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 580ff1906555..f63c8dd3df1c 100644 --- llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -567,6 +567,9 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM: return "histogram"; + case ISD::VECTOR_FIND_LAST_ACTIVE: + return "find_last_active"; + // Vector Predication #define BEGIN_REGISTER_VP_SDNODE(SDID, LEGALARG, NAME, ...) 
\ case ISD::SDID: \ diff --git llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp index a838003c34df..987ea826f782 100644 --- llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp +++ llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp @@ -871,10 +871,11 @@ SDValue SelectionDAGBuilder::LowerAsSTATEPOINT( for (const auto *Relocate : SI.GCRelocates) { Value *Derived = Relocate->getDerivedPtr(); SDValue SD = getValue(Derived); - if (!LowerAsVReg.count(SD)) + auto It = LowerAsVReg.find(SD); + if (It == LowerAsVReg.end()) continue; - SDValue Relocated = SDValue(StatepointMCNode, LowerAsVReg[SD]); + SDValue Relocated = SDValue(StatepointMCNode, It->second); // Handle local relocate. Note that different relocates might // map to the same SDValue. diff --git llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 56194e2614af..368800d8b46a 100644 --- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -12,6 +12,7 @@ #include "llvm/CodeGen/TargetLowering.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/CodeGenCommonISel.h" @@ -9451,6 +9452,43 @@ SDValue TargetLowering::expandVPCTTZElements(SDNode *N, return DAG.getNode(ISD::VP_REDUCE_UMIN, DL, ResVT, ExtEVL, Select, Mask, EVL); } +SDValue TargetLowering::expandVectorFindLastActive(SDNode *N, + SelectionDAG &DAG) const { + SDLoc DL(N); + SDValue Mask = N->getOperand(0); + EVT MaskVT = Mask.getValueType(); + EVT BoolVT = MaskVT.getScalarType(); + + // Find a suitable type for a stepvector. + ConstantRange VScaleRange(1, /*isFullSet=*/true); // Fixed length default. + if (MaskVT.isScalableVector()) + VScaleRange = getVScaleRange(&DAG.getMachineFunction().getFunction(), 64); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + unsigned EltWidth = TLI.getBitWidthForCttzElements( + BoolVT.getTypeForEVT(*DAG.getContext()), MaskVT.getVectorElementCount(), + /*ZeroIsPoison=*/true, &VScaleRange); + EVT StepVT = MVT::getIntegerVT(EltWidth); + EVT StepVecVT = MaskVT.changeVectorElementType(StepVT); + + // If promotion is required to make the type legal, do it here; promotion + // of integers within LegalizeVectorOps is looking for types of the same + // size but with a smaller number of larger elements, not the usual larger + // size with the same number of larger elements. + if (TLI.getTypeAction(StepVecVT.getSimpleVT()) == + TargetLowering::TypePromoteInteger) { + StepVecVT = TLI.getTypeToTransformTo(*DAG.getContext(), StepVecVT); + StepVT = StepVecVT.getVectorElementType(); + } + + // Zero out lanes with inactive elements, then find the highest remaining + // value from the stepvector. 
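+  // Note that an all-false mask yields index 0, which is indistinguishable
+  // from a mask whose only active lane is lane 0; callers that must tell
+  // the two apart (e.g. the extract.last.active lowering) guard on a
+  // VECREDUCE_OR of the mask rather than on this result.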
+ SDValue Zeroes = DAG.getConstant(0, DL, StepVecVT); + SDValue StepVec = DAG.getStepVector(DL, StepVecVT); + SDValue ActiveElts = DAG.getSelect(DL, StepVecVT, Mask, StepVec, Zeroes); + SDValue HighestIdx = DAG.getNode(ISD::VECREDUCE_UMAX, DL, StepVT, ActiveElts); + return DAG.getZExtOrTrunc(HighestIdx, DL, N->getValueType(0)); +} + SDValue TargetLowering::expandABS(SDNode *N, SelectionDAG &DAG, bool IsNegative) const { SDLoc dl(N); diff --git llvm/lib/CodeGen/TargetLoweringBase.cpp llvm/lib/CodeGen/TargetLoweringBase.cpp index 3b0e9c7526fd..73af0a9a7140 100644 --- llvm/lib/CodeGen/TargetLoweringBase.cpp +++ llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -818,6 +818,9 @@ void TargetLoweringBase::initActions() { setOperationAction(ISD::SDOPC, VT, Expand); #include "llvm/IR/VPIntrinsics.def" + // Masked vector extracts default to expand. + setOperationAction(ISD::VECTOR_FIND_LAST_ACTIVE, VT, Expand); + // FP environment operations default to expand. setOperationAction(ISD::GET_FPENV, VT, Expand); setOperationAction(ISD::SET_FPENV, VT, Expand); diff --git llvm/lib/CodeGen/TargetRegisterInfo.cpp llvm/lib/CodeGen/TargetRegisterInfo.cpp index 3be47a769d41..ba528f66980f 100644 --- llvm/lib/CodeGen/TargetRegisterInfo.cpp +++ llvm/lib/CodeGen/TargetRegisterInfo.cpp @@ -206,8 +206,7 @@ static const TargetRegisterClass * getMinimalPhysRegClass(const TargetRegisterInfo *TRI, MCRegister Reg, TypeT Ty) { static_assert(std::is_same_v<TypeT, MVT> || std::is_same_v<TypeT, LLT>); - assert(Register::isPhysicalRegister(Reg) && - "reg must be a physical register"); + assert(Reg.isPhysical() && "reg must be a physical register"); bool IsDefault = [&]() { if constexpr (std::is_same_v<TypeT, MVT>) @@ -235,8 +234,7 @@ static const TargetRegisterClass * getCommonMinimalPhysRegClass(const TargetRegisterInfo *TRI, MCRegister Reg1, MCRegister Reg2, TypeT Ty) { static_assert(std::is_same_v<TypeT, MVT> || std::is_same_v<TypeT, LLT>); - assert(Register::isPhysicalRegister(Reg1) && - Register::isPhysicalRegister(Reg2) && + assert(Reg1.isPhysical() && Reg2.isPhysical() && "Reg1/Reg2 must be a physical register"); bool IsDefault = [&]() { @@ -504,14 +502,13 @@ bool TargetRegisterInfo::getRegAllocationHints( bool TargetRegisterInfo::isCalleeSavedPhysReg( MCRegister PhysReg, const MachineFunction &MF) const { - if (PhysReg == 0) + if (!PhysReg) return false; const uint32_t *callerPreservedRegs = getCallPreservedMask(MF, MF.getFunction().getCallingConv()); if (callerPreservedRegs) { - assert(Register::isPhysicalRegister(PhysReg) && - "Expected physical register"); - return (callerPreservedRegs[PhysReg / 32] >> PhysReg % 32) & 1; + assert(PhysReg.isPhysical() && "Expected physical register"); + return (callerPreservedRegs[PhysReg.id() / 32] >> PhysReg.id() % 32) & 1; } return false; } diff --git llvm/lib/CodeGen/VirtRegMap.cpp llvm/lib/CodeGen/VirtRegMap.cpp index d6c020172b96..b3a7acc15b3d 100644 --- llvm/lib/CodeGen/VirtRegMap.cpp +++ llvm/lib/CodeGen/VirtRegMap.cpp @@ -83,8 +83,8 @@ void VirtRegMap::grow() { Virt2SplitMap.resize(NumRegs); } -void VirtRegMap::assignVirt2Phys(Register virtReg, MCPhysReg physReg) { - assert(virtReg.isVirtual() && Register::isPhysicalRegister(physReg)); +void VirtRegMap::assignVirt2Phys(Register virtReg, MCRegister physReg) { + assert(virtReg.isVirtual() && physReg.isPhysical()); assert(!Virt2PhysMap[virtReg] && "attempt to assign physical register to already mapped " "virtual register"); @@ -221,7 +221,7 @@ class VirtRegRewriter : public MachineFunctionPass { bool subRegLiveThrough(const 
MachineInstr &MI, MCRegister SuperPhysReg) const; LaneBitmask liveOutUndefPhiLanesForUndefSubregDef( const LiveInterval &LI, const MachineBasicBlock &MBB, unsigned SubReg, - MCPhysReg PhysReg, const MachineInstr &MI) const; + MCRegister PhysReg, const MachineInstr &MI) const; public: static char ID; @@ -563,7 +563,7 @@ bool VirtRegRewriter::subRegLiveThrough(const MachineInstr &MI, /// is assigned to \p LI, which is the main range. LaneBitmask VirtRegRewriter::liveOutUndefPhiLanesForUndefSubregDef( const LiveInterval &LI, const MachineBasicBlock &MBB, unsigned SubReg, - MCPhysReg PhysReg, const MachineInstr &MI) const { + MCRegister PhysReg, const MachineInstr &MI) const { LaneBitmask UndefMask = ~TRI->getSubRegIndexLaneMask(SubReg); LaneBitmask LiveOutUndefLanes; diff --git llvm/lib/ExecutionEngine/Orc/CMakeLists.txt llvm/lib/ExecutionEngine/Orc/CMakeLists.txt index 93d253ee49aa..2ab5d6dd39b6 100644 --- llvm/lib/ExecutionEngine/Orc/CMakeLists.txt +++ llvm/lib/ExecutionEngine/Orc/CMakeLists.txt @@ -15,6 +15,7 @@ add_llvm_component_library(LLVMOrcJIT Core.cpp DebugObjectManagerPlugin.cpp DebugUtils.cpp + EHFrameRegistrationPlugin.cpp EPCDynamicLibrarySearchGenerator.cpp EPCDebugObjectRegistrar.cpp EPCEHFrameRegistrar.cpp diff --git llvm/lib/ExecutionEngine/Orc/Debugging/CMakeLists.txt llvm/lib/ExecutionEngine/Orc/Debugging/CMakeLists.txt index ed52692662a8..186df5dad072 100644 --- llvm/lib/ExecutionEngine/Orc/Debugging/CMakeLists.txt +++ llvm/lib/ExecutionEngine/Orc/Debugging/CMakeLists.txt @@ -18,6 +18,7 @@ add_llvm_component_library(LLVMOrcDebugging ${rt_lib} LINK_COMPONENTS + BinaryFormat DebugInfoDWARF JITLink OrcJIT diff --git llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp index 0d9a912e2560..1bafed79d696 100644 --- llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp +++ llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp @@ -121,18 +121,14 @@ public: // Write MachO header and debug section load commands. Builder.Header.filetype = MachO::MH_OBJECT; - switch (G.getTargetTriple().getArch()) { - case Triple::x86_64: - Builder.Header.cputype = MachO::CPU_TYPE_X86_64; - Builder.Header.cpusubtype = MachO::CPU_SUBTYPE_X86_64_ALL; - break; - case Triple::aarch64: - Builder.Header.cputype = MachO::CPU_TYPE_ARM64; - Builder.Header.cpusubtype = MachO::CPU_SUBTYPE_ARM64_ALL; - break; - default: - llvm_unreachable("Unsupported architecture"); - } + if (auto CPUType = MachO::getCPUType(G.getTargetTriple())) + Builder.Header.cputype = *CPUType; + else + return CPUType.takeError(); + if (auto CPUSubType = MachO::getCPUSubType(G.getTargetTriple())) + Builder.Header.cpusubtype = *CPUSubType; + else + return CPUSubType.takeError(); Seg = &Builder.addSegment(""); diff --git llvm/lib/ExecutionEngine/Orc/EHFrameRegistrationPlugin.cpp llvm/lib/ExecutionEngine/Orc/EHFrameRegistrationPlugin.cpp new file mode 100644 index 000000000000..217c693dae9c --- /dev/null +++ llvm/lib/ExecutionEngine/Orc/EHFrameRegistrationPlugin.cpp @@ -0,0 +1,115 @@ +//===--------- EHFrameRegistrationPlugin.cpp - Register eh-frames ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/ExecutionEngine/Orc/EHFrameRegistrationPlugin.h" + +#include "llvm/ExecutionEngine/JITLink/EHFrameSupport.h" + +#define DEBUG_TYPE "orc" + +using namespace llvm::jitlink; + +namespace llvm::orc { + +EHFrameRegistrationPlugin::EHFrameRegistrationPlugin( + ExecutionSession &ES, std::unique_ptr<EHFrameRegistrar> Registrar) + : ES(ES), Registrar(std::move(Registrar)) {} + +void EHFrameRegistrationPlugin::modifyPassConfig( + MaterializationResponsibility &MR, LinkGraph &G, + PassConfiguration &PassConfig) { + + PassConfig.PostFixupPasses.push_back(createEHFrameRecorderPass( + G.getTargetTriple(), [this, &MR](ExecutorAddr Addr, size_t Size) { + if (Addr) { + std::lock_guard<std::mutex> Lock(EHFramePluginMutex); + assert(!InProcessLinks.count(&MR) && + "Link for MR already being tracked?"); + InProcessLinks[&MR] = {Addr, Size}; + } + })); +} + +Error EHFrameRegistrationPlugin::notifyEmitted( + MaterializationResponsibility &MR) { + + ExecutorAddrRange EmittedRange; + { + std::lock_guard<std::mutex> Lock(EHFramePluginMutex); + + auto EHFrameRangeItr = InProcessLinks.find(&MR); + if (EHFrameRangeItr == InProcessLinks.end()) + return Error::success(); + + EmittedRange = EHFrameRangeItr->second; + assert(EmittedRange.Start && "eh-frame addr to register can not be null"); + InProcessLinks.erase(EHFrameRangeItr); + } + + if (auto Err = MR.withResourceKeyDo( + [&](ResourceKey K) { EHFrameRanges[K].push_back(EmittedRange); })) + return Err; + + return Registrar->registerEHFrames(EmittedRange); +} + +Error EHFrameRegistrationPlugin::notifyFailed( + MaterializationResponsibility &MR) { + std::lock_guard<std::mutex> Lock(EHFramePluginMutex); + InProcessLinks.erase(&MR); + return Error::success(); +} + +Error EHFrameRegistrationPlugin::notifyRemovingResources(JITDylib &JD, + ResourceKey K) { + std::vector<ExecutorAddrRange> RangesToRemove; + + ES.runSessionLocked([&] { + auto I = EHFrameRanges.find(K); + if (I != EHFrameRanges.end()) { + RangesToRemove = std::move(I->second); + EHFrameRanges.erase(I); + } + }); + + Error Err = Error::success(); + while (!RangesToRemove.empty()) { + auto RangeToRemove = RangesToRemove.back(); + RangesToRemove.pop_back(); + assert(RangeToRemove.Start && "Untracked eh-frame range must not be null"); + Err = joinErrors(std::move(Err), + Registrar->deregisterEHFrames(RangeToRemove)); + } + + return Err; +} + +void EHFrameRegistrationPlugin::notifyTransferringResources( + JITDylib &JD, ResourceKey DstKey, ResourceKey SrcKey) { + auto SI = EHFrameRanges.find(SrcKey); + if (SI == EHFrameRanges.end()) + return; + + auto DI = EHFrameRanges.find(DstKey); + if (DI != EHFrameRanges.end()) { + auto &SrcRanges = SI->second; + auto &DstRanges = DI->second; + DstRanges.reserve(DstRanges.size() + SrcRanges.size()); + for (auto &SrcRange : SrcRanges) + DstRanges.push_back(std::move(SrcRange)); + EHFrameRanges.erase(SI); + } else { + // We need to move SrcKey's ranges over without invalidating the SI + // iterator. 
+ auto Tmp = std::move(SI->second); + EHFrameRanges.erase(SI); + EHFrameRanges[DstKey] = std::move(Tmp); + } +} + +} // namespace llvm::orc diff --git llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp index 8aa517a27d99..aae7369fc29c 100644 --- llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp +++ llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp @@ -10,6 +10,7 @@ #include "llvm/ExecutionEngine/Orc/ELFNixPlatform.h" #include "llvm/ExecutionEngine/JITLink/aarch64.h" +#include "llvm/ExecutionEngine/JITLink/loongarch.h" #include "llvm/ExecutionEngine/JITLink/ppc64.h" #include "llvm/ExecutionEngine/JITLink/x86_64.h" #include "llvm/ExecutionEngine/Orc/AbsoluteSymbols.h" @@ -150,6 +151,9 @@ public: case Triple::ppc64le: EdgeKind = jitlink::ppc64::Pointer64; break; + case Triple::loongarch64: + EdgeKind = jitlink::loongarch::Pointer64; + break; default: llvm_unreachable("Unrecognized architecture"); } @@ -363,6 +367,7 @@ bool ELFNixPlatform::supportedTarget(const Triple &TT) { // FIXME: jitlink for ppc64 hasn't been well tested, leave it unsupported // right now. case Triple::ppc64le: + case Triple::loongarch64: return true; default: return false; diff --git llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp index ee9acf0ab33a..7fb84e97fe4e 100644 --- llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp +++ llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp @@ -273,8 +273,8 @@ createLocalIndirectStubsManagerBuilder(const Triple &T) { Constant* createIRTypedAddress(FunctionType &FT, ExecutorAddr Addr) { Constant *AddrIntVal = ConstantInt::get(Type::getInt64Ty(FT.getContext()), Addr.getValue()); - Constant *AddrPtrVal = - ConstantExpr::getIntToPtr(AddrIntVal, PointerType::get(&FT, 0)); + Constant *AddrPtrVal = ConstantExpr::getIntToPtr( + AddrIntVal, PointerType::get(FT.getContext(), 0)); return AddrPtrVal; } diff --git llvm/lib/ExecutionEngine/Orc/LLJIT.cpp llvm/lib/ExecutionEngine/Orc/LLJIT.cpp index 59bd95e96167..76d5c1428ed6 100644 --- llvm/lib/ExecutionEngine/Orc/LLJIT.cpp +++ llvm/lib/ExecutionEngine/Orc/LLJIT.cpp @@ -11,6 +11,7 @@ #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Config/llvm-config.h" // for LLVM_ENABLE_THREADS #include "llvm/ExecutionEngine/Orc/COFFPlatform.h" +#include "llvm/ExecutionEngine/Orc/EHFrameRegistrationPlugin.h" #include "llvm/ExecutionEngine/Orc/ELFNixPlatform.h" #include "llvm/ExecutionEngine/Orc/EPCDynamicLibrarySearchGenerator.h" #include "llvm/ExecutionEngine/Orc/EPCEHFrameRegistrar.h" diff --git llvm/lib/ExecutionEngine/Orc/LinkGraphLinkingLayer.cpp llvm/lib/ExecutionEngine/Orc/LinkGraphLinkingLayer.cpp index fc5a11b338af..b4f78c617ae7 100644 --- llvm/lib/ExecutionEngine/Orc/LinkGraphLinkingLayer.cpp +++ llvm/lib/ExecutionEngine/Orc/LinkGraphLinkingLayer.cpp @@ -1,4 +1,4 @@ -//===----- LinkGraphLinkingLayer.cpp - JITLink backed ORC ObjectLayer -----===// +//===------ LinkGraphLinkingLayer.cpp - Link LinkGraphs with JITLink ------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -581,101 +581,5 @@ void LinkGraphLinkingLayer::handleTransferResources(JITDylib &JD, P->notifyTransferringResources(JD, DstKey, SrcKey); } -EHFrameRegistrationPlugin::EHFrameRegistrationPlugin( - ExecutionSession &ES, std::unique_ptr<EHFrameRegistrar> Registrar) - : ES(ES), Registrar(std::move(Registrar)) {} - -void EHFrameRegistrationPlugin::modifyPassConfig( - MaterializationResponsibility &MR, LinkGraph &G, - PassConfiguration &PassConfig) { - - PassConfig.PostFixupPasses.push_back(createEHFrameRecorderPass( - G.getTargetTriple(), [this, &MR](ExecutorAddr Addr, size_t Size) { - if (Addr) { - std::lock_guard<std::mutex> Lock(EHFramePluginMutex); - assert(!InProcessLinks.count(&MR) && - "Link for MR already being tracked?"); - InProcessLinks[&MR] = {Addr, Size}; - } - })); -} - -Error EHFrameRegistrationPlugin::notifyEmitted( - MaterializationResponsibility &MR) { - - ExecutorAddrRange EmittedRange; - { - std::lock_guard<std::mutex> Lock(EHFramePluginMutex); - - auto EHFrameRangeItr = InProcessLinks.find(&MR); - if (EHFrameRangeItr == InProcessLinks.end()) - return Error::success(); - - EmittedRange = EHFrameRangeItr->second; - assert(EmittedRange.Start && "eh-frame addr to register can not be null"); - InProcessLinks.erase(EHFrameRangeItr); - } - - if (auto Err = MR.withResourceKeyDo( - [&](ResourceKey K) { EHFrameRanges[K].push_back(EmittedRange); })) - return Err; - - return Registrar->registerEHFrames(EmittedRange); -} - -Error EHFrameRegistrationPlugin::notifyFailed( - MaterializationResponsibility &MR) { - std::lock_guard<std::mutex> Lock(EHFramePluginMutex); - InProcessLinks.erase(&MR); - return Error::success(); -} - -Error EHFrameRegistrationPlugin::notifyRemovingResources(JITDylib &JD, - ResourceKey K) { - std::vector<ExecutorAddrRange> RangesToRemove; - - ES.runSessionLocked([&] { - auto I = EHFrameRanges.find(K); - if (I != EHFrameRanges.end()) { - RangesToRemove = std::move(I->second); - EHFrameRanges.erase(I); - } - }); - - Error Err = Error::success(); - while (!RangesToRemove.empty()) { - auto RangeToRemove = RangesToRemove.back(); - RangesToRemove.pop_back(); - assert(RangeToRemove.Start && "Untracked eh-frame range must not be null"); - Err = joinErrors(std::move(Err), - Registrar->deregisterEHFrames(RangeToRemove)); - } - - return Err; -} - -void EHFrameRegistrationPlugin::notifyTransferringResources( - JITDylib &JD, ResourceKey DstKey, ResourceKey SrcKey) { - auto SI = EHFrameRanges.find(SrcKey); - if (SI == EHFrameRanges.end()) - return; - - auto DI = EHFrameRanges.find(DstKey); - if (DI != EHFrameRanges.end()) { - auto &SrcRanges = SI->second; - auto &DstRanges = DI->second; - DstRanges.reserve(DstRanges.size() + SrcRanges.size()); - for (auto &SrcRange : SrcRanges) - DstRanges.push_back(std::move(SrcRange)); - EHFrameRanges.erase(SI); - } else { - // We need to move SrcKey's ranges over without invalidating the SI - // iterator. - auto Tmp = std::move(SI->second); - EHFrameRanges.erase(SI); - EHFrameRanges[DstKey] = std::move(Tmp); - } -} - } // End namespace orc. } // End namespace llvm. diff --git llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp index 8e66d028f21c..f8f65ec3b4cf 100644 --- llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp +++ llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp @@ -583,6 +583,12 @@ MachOPlatform::MachOPlatform( return; // (5) Associate runtime support functions. + // TODO: Consider moving this above (4) to make runtime support functions + // available to the bootstrap completion graph. 
We'd just need to be
+  // sure that the runtime support functions are fully usable before any
+  // bootstrap completion actions use them (e.g. the ORC runtime
+  // macho_platform object would have to have been created and
+  // initialized).
   if ((Err = associateRuntimeSupportFunctions()))
     return;
 }
diff --git llvm/lib/FuzzMutate/RandomIRBuilder.cpp llvm/lib/FuzzMutate/RandomIRBuilder.cpp
index a684307586a6..8aea3d6f7e05 100644
--- llvm/lib/FuzzMutate/RandomIRBuilder.cpp
+++ llvm/lib/FuzzMutate/RandomIRBuilder.cpp
@@ -370,7 +370,7 @@ Instruction *RandomIRBuilder::newSink(BasicBlock &BB,
       Type *Ty = V->getType();
       Ptr = createStackMemory(BB.getParent(), Ty, PoisonValue::get(Ty));
     } else {
-      Ptr = PoisonValue::get(PointerType::get(V->getType(), 0));
+      Ptr = PoisonValue::get(PointerType::get(V->getContext(), 0));
     }
   }
diff --git llvm/lib/IR/Assumptions.cpp llvm/lib/IR/Assumptions.cpp
index 27977d5d56b0..d1f8bcde53b2 100644
--- llvm/lib/IR/Assumptions.cpp
+++ llvm/lib/IR/Assumptions.cpp
@@ -108,4 +108,5 @@ StringSet<> llvm::KnownAssumptionStrings({
     "omp_no_parallelism",   // OpenMP 5.1
     "ompx_spmd_amenable",   // OpenMPOpt extension
     "ompx_no_call_asm",     // OpenMPOpt extension
+    "ompx_aligned_barrier", // OpenMPOpt extension
 });
diff --git llvm/lib/IR/AutoUpgrade.cpp llvm/lib/IR/AutoUpgrade.cpp
index 06e62bf7f9f7..3725f412b893 100644
--- llvm/lib/IR/AutoUpgrade.cpp
+++ llvm/lib/IR/AutoUpgrade.cpp
@@ -45,6 +45,7 @@
 #include "llvm/Support/Regex.h"
 #include "llvm/TargetParser/Triple.h"
 #include <cstring>
+#include <numeric>

 using namespace llvm;

@@ -828,6 +829,13 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F,
         return true;
       }
     }
+
+    // Changed in 20.0: bfcvt/bfcvtn/bfcvtn2 have been replaced with fptrunc.
+    if (Name.starts_with("bfcvt")) {
+      NewFn = nullptr;
+      return true;
+    }
+
     return false; // No other 'aarch64.neon.*'.
   }
   if (Name.consume_front("sve.")) {
@@ -1875,9 +1883,6 @@ static Value *upgradeX86ConcatShift(IRBuilder<> &Builder, CallBase &CI,

 static Value *upgradeMaskedStore(IRBuilder<> &Builder, Value *Ptr, Value *Data,
                                  Value *Mask, bool Aligned) {
-  // Cast the pointer to the right type.
-  Ptr = Builder.CreateBitCast(Ptr,
-                              llvm::PointerType::getUnqual(Data->getType()));
   const Align Alignment =
       Aligned
           ? Align(Data->getType()->getPrimitiveSizeInBits().getFixedValue() / 8)
@@ -1897,8 +1902,6 @@ static Value *upgradeMaskedStore(IRBuilder<> &Builder, Value *Ptr, Value *Data,
 static Value *upgradeMaskedLoad(IRBuilder<> &Builder, Value *Ptr,
                                 Value *Passthru, Value *Mask, bool Aligned) {
   Type *ValTy = Passthru->getType();
-  // Cast the pointer to the right type.
-  Ptr = Builder.CreateBitCast(Ptr, llvm::PointerType::getUnqual(ValTy));
   const Align Alignment =
       Aligned
           ? Align(
@@ -2421,13 +2424,10 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,

     // Nontemporal (unaligned) store of the 0'th element of the float/double
     // vector.
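+    // With opaque pointers every pointer in an address space has the same
+    // type, so the bitcast through an element pointer deleted below was a
+    // no-op; the store can take the original pointer operand directly.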
- Type *SrcEltTy = cast<VectorType>(Arg1->getType())->getElementType(); - PointerType *EltPtrTy = PointerType::getUnqual(SrcEltTy); - Value *Addr = Builder.CreateBitCast(Arg0, EltPtrTy, "cast"); Value *Extract = Builder.CreateExtractElement(Arg1, (uint64_t)0, "extractelement"); - StoreInst *SI = Builder.CreateAlignedStore(Extract, Addr, Align(1)); + StoreInst *SI = Builder.CreateAlignedStore(Extract, Arg0, Align(1)); SI->setMetadata(LLVMContext::MD_nontemporal, Node); } else if (Name.starts_with("avx.movnt.") || Name.starts_with("avx512.storent.")) { @@ -2439,11 +2439,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Value *Arg0 = CI->getArgOperand(0); Value *Arg1 = CI->getArgOperand(1); - // Convert the type of the pointer to a pointer to the stored type. - Value *BC = Builder.CreateBitCast( - Arg0, PointerType::getUnqual(Arg1->getType()), "cast"); StoreInst *SI = Builder.CreateAlignedStore( - Arg1, BC, + Arg1, Arg0, Align(Arg1->getType()->getPrimitiveSizeInBits().getFixedValue() / 8)); SI->setMetadata(LLVMContext::MD_nontemporal, Node); } else if (Name == "sse2.storel.dq") { @@ -2453,17 +2450,12 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, auto *NewVecTy = FixedVectorType::get(Type::getInt64Ty(C), 2); Value *BC0 = Builder.CreateBitCast(Arg1, NewVecTy, "cast"); Value *Elt = Builder.CreateExtractElement(BC0, (uint64_t)0); - Value *BC = Builder.CreateBitCast( - Arg0, PointerType::getUnqual(Elt->getType()), "cast"); - Builder.CreateAlignedStore(Elt, BC, Align(1)); + Builder.CreateAlignedStore(Elt, Arg0, Align(1)); } else if (Name.starts_with("sse.storeu.") || Name.starts_with("sse2.storeu.") || Name.starts_with("avx.storeu.")) { Value *Arg0 = CI->getArgOperand(0); Value *Arg1 = CI->getArgOperand(1); - - Arg0 = Builder.CreateBitCast(Arg0, PointerType::getUnqual(Arg1->getType()), - "cast"); Builder.CreateAlignedStore(Arg1, Arg0, Align(1)); } else if (Name == "avx512.mask.store.ss") { Value *Mask = Builder.CreateAnd(CI->getArgOperand(2), Builder.getInt8(1)); @@ -2813,31 +2805,21 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, CI->getArgOperand(2), Aligned); } else if (Name.starts_with("avx512.mask.expand.load.")) { auto *ResultTy = cast<FixedVectorType>(CI->getType()); - Type *PtrTy = ResultTy->getElementType(); - - // Cast the pointer to element type. - Value *Ptr = Builder.CreateBitCast(CI->getOperand(0), - llvm::PointerType::getUnqual(PtrTy)); - Value *MaskVec = getX86MaskVec(Builder, CI->getArgOperand(2), ResultTy->getNumElements()); - Rep = Builder.CreateIntrinsic(Intrinsic::masked_expandload, ResultTy, - {Ptr, MaskVec, CI->getOperand(1)}); + Rep = Builder.CreateIntrinsic( + Intrinsic::masked_expandload, ResultTy, + {CI->getOperand(0), MaskVec, CI->getOperand(1)}); } else if (Name.starts_with("avx512.mask.compress.store.")) { auto *ResultTy = cast<VectorType>(CI->getArgOperand(1)->getType()); - Type *PtrTy = ResultTy->getElementType(); - - // Cast the pointer to element type. 
- Value *Ptr = Builder.CreateBitCast(CI->getOperand(0), - llvm::PointerType::getUnqual(PtrTy)); - Value *MaskVec = getX86MaskVec(Builder, CI->getArgOperand(2), cast<FixedVectorType>(ResultTy)->getNumElements()); - Rep = Builder.CreateIntrinsic(Intrinsic::masked_compressstore, ResultTy, - {CI->getArgOperand(1), Ptr, MaskVec}); + Rep = Builder.CreateIntrinsic( + Intrinsic::masked_compressstore, ResultTy, + {CI->getArgOperand(1), CI->getArgOperand(0), MaskVec}); } else if (Name.starts_with("avx512.mask.compress.") || Name.starts_with("avx512.mask.expand.")) { auto *ResultTy = cast<FixedVectorType>(CI->getType()); @@ -2963,9 +2945,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Type *EltTy = cast<VectorType>(CI->getType())->getElementType(); unsigned NumSrcElts = 128 / EltTy->getPrimitiveSizeInBits(); auto *VT = FixedVectorType::get(EltTy, NumSrcElts); - Value *Op = Builder.CreatePointerCast(CI->getArgOperand(0), - PointerType::getUnqual(VT)); - Value *Load = Builder.CreateAlignedLoad(VT, Op, Align(1)); + Value *Load = Builder.CreateAlignedLoad(VT, CI->getArgOperand(0), Align(1)); if (NumSrcElts == 2) Rep = Builder.CreateShuffleVector(Load, ArrayRef<int>{0, 1, 0, 1}); else @@ -3687,13 +3667,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, MDNode *Node = MDNode::get( C, ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(C), 1))); - Value *Ptr = CI->getArgOperand(0); - - // Convert the type of the pointer to a pointer to the stored type. - Value *BC = Builder.CreateBitCast( - Ptr, PointerType::getUnqual(CI->getType()), "cast"); LoadInst *LI = Builder.CreateAlignedLoad( - CI->getType(), BC, + CI->getType(), CI->getArgOperand(0), Align(CI->getType()->getPrimitiveSizeInBits().getFixedValue() / 8)); LI->setMetadata(LLVMContext::MD_nontemporal, Node); Rep = LI; @@ -4045,10 +4020,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, // Extract the second result and store it. Value *Data = Builder.CreateExtractValue(NewCall, 1); - // Cast the pointer to the right type. - Value *Ptr = Builder.CreateBitCast( - CI->getArgOperand(3), llvm::PointerType::getUnqual(Data->getType())); - Builder.CreateAlignedStore(Data, Ptr, Align(1)); + Builder.CreateAlignedStore(Data, CI->getArgOperand(3), Align(1)); // Replace the original call result with the first result of the new call. Value *CF = Builder.CreateExtractValue(NewCall, 0); @@ -4064,31 +4036,59 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, static Value *upgradeAArch64IntrinsicCall(StringRef Name, CallBase *CI, Function *F, IRBuilder<> &Builder) { - Intrinsic::ID NewID = - StringSwitch<Intrinsic::ID>(Name) - .Case("sve.fcvt.bf16f32", Intrinsic::aarch64_sve_fcvt_bf16f32_v2) - .Case("sve.fcvtnt.bf16f32", Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2) - .Default(Intrinsic::not_intrinsic); - if (NewID == Intrinsic::not_intrinsic) - llvm_unreachable("Unhandled Intrinsic!"); - - SmallVector<Value *, 3> Args(CI->args()); - - // The original intrinsics incorrectly used a predicate based on the smallest - // element type rather than the largest. 
-  Type *BadPredTy = ScalableVectorType::get(Builder.getInt1Ty(), 8);
-  Type *GoodPredTy = ScalableVectorType::get(Builder.getInt1Ty(), 4);
-
-  if (Args[1]->getType() != BadPredTy)
-    llvm_unreachable("Unexpected predicate type!");
-
-  Args[1] = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_to_svbool,
-                                    BadPredTy, Args[1]);
-  Args[1] = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
-                                    GoodPredTy, Args[1]);
-
-  return Builder.CreateIntrinsic(NewID, {}, Args, /*FMFSource=*/nullptr,
-                                 CI->getName());
+  if (Name.starts_with("neon.bfcvt")) {
+    if (Name.starts_with("neon.bfcvtn2")) {
+      SmallVector<int, 32> LoMask(4);
+      std::iota(LoMask.begin(), LoMask.end(), 0);
+      SmallVector<int, 32> ConcatMask(8);
+      std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
+      Value *Inactive = Builder.CreateShuffleVector(CI->getOperand(0), LoMask);
+      Value *Trunc =
+          Builder.CreateFPTrunc(CI->getOperand(1), Inactive->getType());
+      return Builder.CreateShuffleVector(Inactive, Trunc, ConcatMask);
+    } else if (Name.starts_with("neon.bfcvtn")) {
+      SmallVector<int, 32> ConcatMask(8);
+      std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
+      Type *V4BF16 =
+          FixedVectorType::get(Type::getBFloatTy(F->getContext()), 4);
+      Value *Trunc = Builder.CreateFPTrunc(CI->getOperand(0), V4BF16);
+      return Builder.CreateShuffleVector(
+          Trunc, ConstantAggregateZero::get(V4BF16), ConcatMask);
+    } else {
+      return Builder.CreateFPTrunc(CI->getOperand(0),
+                                   Type::getBFloatTy(F->getContext()));
+    }
+  } else if (Name.starts_with("sve.fcvt")) {
+    Intrinsic::ID NewID =
+        StringSwitch<Intrinsic::ID>(Name)
+            .Case("sve.fcvt.bf16f32", Intrinsic::aarch64_sve_fcvt_bf16f32_v2)
+            .Case("sve.fcvtnt.bf16f32",
+                  Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2)
+            .Default(Intrinsic::not_intrinsic);
+    if (NewID == Intrinsic::not_intrinsic)
+      llvm_unreachable("Unhandled Intrinsic!");
+
+    SmallVector<Value *, 3> Args(CI->args());
+
+    // The original intrinsics incorrectly used a predicate based on the
+    // smallest element type rather than the largest.
+    Type *BadPredTy = ScalableVectorType::get(Builder.getInt1Ty(), 8);
+    Type *GoodPredTy = ScalableVectorType::get(Builder.getInt1Ty(), 4);
+
+    if (Args[1]->getType() != BadPredTy)
+      llvm_unreachable("Unexpected predicate type!");
+
+    Args[1] = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_to_svbool,
+                                      BadPredTy, Args[1]);
+    Args[1] = Builder.CreateIntrinsic(
+        Intrinsic::aarch64_sve_convert_from_svbool, GoodPredTy, Args[1]);
+
+    return Builder.CreateIntrinsic(NewID, {}, Args, /*FMFSource=*/nullptr,
+                                   CI->getName());
+  }
+
+  llvm_unreachable("Unhandled Intrinsic!");
 }

 static Value *upgradeARMIntrinsicCall(StringRef Name, CallBase *CI, Function *F,
@@ -4756,10 +4756,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
     NewCall = Builder.CreateCall(NewFn);
     // Extract the second result and store it.
     Value *Data = Builder.CreateExtractValue(NewCall, 1);
-    // Cast the pointer to the right type.
-    Value *Ptr = Builder.CreateBitCast(CI->getArgOperand(0),
-        llvm::PointerType::getUnqual(Data->getType()));
-    Builder.CreateAlignedStore(Data, Ptr, Align(1));
+    Builder.CreateAlignedStore(Data, CI->getArgOperand(0), Align(1));
     // Replace the original call result with the first result of the new call.
Value *TSC = Builder.CreateExtractValue(NewCall, 0); diff --git llvm/lib/IR/Core.cpp llvm/lib/IR/Core.cpp index dc5ca68bd998..15ab9674f496 100644 --- llvm/lib/IR/Core.cpp +++ llvm/lib/IR/Core.cpp @@ -873,7 +873,8 @@ LLVMTypeRef LLVMArrayType2(LLVMTypeRef ElementType, uint64_t ElementCount) { } LLVMTypeRef LLVMPointerType(LLVMTypeRef ElementType, unsigned AddressSpace) { - return wrap(PointerType::get(unwrap(ElementType), AddressSpace)); + return wrap( + PointerType::get(unwrap(ElementType)->getContext(), AddressSpace)); } LLVMBool LLVMPointerTypeIsOpaque(LLVMTypeRef Ty) { diff --git llvm/lib/IR/DIBuilder.cpp llvm/lib/IR/DIBuilder.cpp index b240a2a39de3..d9bd4f11e89a 100644 --- llvm/lib/IR/DIBuilder.cpp +++ llvm/lib/IR/DIBuilder.cpp @@ -644,11 +644,15 @@ DIType *DIBuilder::createArtificialType(DIType *Ty) { return createTypeWithFlags(Ty, DINode::FlagArtificial); } -DIType *DIBuilder::createObjectPointerType(DIType *Ty) { +DIType *DIBuilder::createObjectPointerType(DIType *Ty, bool Implicit) { // FIXME: Restrict this to the nodes where it's valid. if (Ty->isObjectPointer()) return Ty; - DINode::DIFlags Flags = DINode::FlagObjectPointer | DINode::FlagArtificial; + DINode::DIFlags Flags = DINode::FlagObjectPointer; + + if (Implicit) + Flags |= DINode::FlagArtificial; + return createTypeWithFlags(Ty, Flags); } diff --git llvm/lib/IR/DebugInfo.cpp llvm/lib/IR/DebugInfo.cpp index e5b45e0082a8..4ce518009bd3 100644 --- llvm/lib/IR/DebugInfo.cpp +++ llvm/lib/IR/DebugInfo.cpp @@ -1432,10 +1432,11 @@ LLVMDIBuilderCreateObjCProperty(LLVMDIBuilderRef Builder, PropertyAttributes, unwrapDI<DIType>(Ty))); } -LLVMMetadataRef -LLVMDIBuilderCreateObjectPointerType(LLVMDIBuilderRef Builder, - LLVMMetadataRef Type) { - return wrap(unwrap(Builder)->createObjectPointerType(unwrapDI<DIType>(Type))); +LLVMMetadataRef LLVMDIBuilderCreateObjectPointerType(LLVMDIBuilderRef Builder, + LLVMMetadataRef Type, + LLVMBool Implicit) { + return wrap(unwrap(Builder)->createObjectPointerType(unwrapDI<DIType>(Type), + Implicit)); } LLVMMetadataRef diff --git llvm/lib/IR/Instructions.cpp llvm/lib/IR/Instructions.cpp index 50560e9cf218..b585d8cfbf2e 100644 --- llvm/lib/IR/Instructions.cpp +++ llvm/lib/IR/Instructions.cpp @@ -1214,7 +1214,7 @@ AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize, AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize, Align Align, const Twine &Name, InsertPosition InsertBefore) - : UnaryInstruction(PointerType::get(Ty, AddrSpace), Alloca, + : UnaryInstruction(PointerType::get(Ty->getContext(), AddrSpace), Alloca, getAISize(Ty->getContext(), ArraySize), InsertBefore), AllocatedType(Ty) { setAlignment(Align); diff --git llvm/lib/IR/PrintPasses.cpp llvm/lib/IR/PrintPasses.cpp index e2ef20bb81ba..610411a3cf97 100644 --- llvm/lib/IR/PrintPasses.cpp +++ llvm/lib/IR/PrintPasses.cpp @@ -88,6 +88,12 @@ static cl::opt<bool> "always print a module IR"), cl::init(false), cl::Hidden); +static cl::opt<bool> LoopPrintFuncScope( + "print-loop-func-scope", + cl::desc("When printing IR for print-[before|after]{-all} " + "for a loop pass, always print function IR"), + cl::init(false), cl::Hidden); + // See the description for -print-changed for an explanation of the use // of this option. 
static cl::list<std::string> FilterPasses( @@ -141,6 +147,8 @@ std::vector<std::string> llvm::printAfterPasses() { bool llvm::forcePrintModuleIR() { return PrintModuleScope; } +bool llvm::forcePrintFuncIR() { return LoopPrintFuncScope; } + bool llvm::isPassInPrintList(StringRef PassName) { static std::unordered_set<std::string> Set(FilterPasses.begin(), FilterPasses.end()); diff --git llvm/lib/IR/Type.cpp llvm/lib/IR/Type.cpp index ffa80faf6e24..277985b6b00a 100644 --- llvm/lib/IR/Type.cpp +++ llvm/lib/IR/Type.cpp @@ -857,7 +857,7 @@ PointerType::PointerType(LLVMContext &C, unsigned AddrSpace) } PointerType *Type::getPointerTo(unsigned AddrSpace) const { - return PointerType::get(const_cast<Type*>(this), AddrSpace); + return PointerType::get(getContext(), AddrSpace); } bool PointerType::isValidElementType(Type *ElemTy) { diff --git llvm/lib/Linker/IRMover.cpp llvm/lib/Linker/IRMover.cpp index be3535ae94ff..43fcfe75ba46 100644 --- llvm/lib/Linker/IRMover.cpp +++ llvm/lib/Linker/IRMover.cpp @@ -296,9 +296,6 @@ Type *TypeMapTy::get(Type *Ty, SmallPtrSet<StructType *, 8> &Visited) { case Type::FixedVectorTyID: return *Entry = VectorType::get(ElementTypes[0], cast<VectorType>(Ty)->getElementCount()); - case Type::PointerTyID: - return *Entry = PointerType::get(ElementTypes[0], - cast<PointerType>(Ty)->getAddressSpace()); case Type::FunctionTyID: return *Entry = FunctionType::get(ElementTypes[0], ArrayRef(ElementTypes).slice(1), diff --git llvm/lib/MC/ELFObjectWriter.cpp llvm/lib/MC/ELFObjectWriter.cpp index bf911e29a19c..5f586fe19a5b 100644 --- llvm/lib/MC/ELFObjectWriter.cpp +++ llvm/lib/MC/ELFObjectWriter.cpp @@ -1219,7 +1219,8 @@ void ELFObjectWriter::executePostLayoutBinding(MCAssembler &Asm) { continue; } - if (Renames.count(&Symbol) && Renames[&Symbol] != Alias) { + if (auto It = Renames.find(&Symbol); + It != Renames.end() && It->second != Alias) { Asm.getContext().reportError(S.Loc, Twine("multiple versions for ") + Symbol.getName()); continue; diff --git llvm/lib/MC/MCAssembler.cpp llvm/lib/MC/MCAssembler.cpp index 3c18d0832efc..3e5e0151d265 100644 --- llvm/lib/MC/MCAssembler.cpp +++ llvm/lib/MC/MCAssembler.cpp @@ -222,7 +222,7 @@ bool MCAssembler::evaluateFixup(const MCFixup &Fixup, const MCFragment *DF, // Let the backend force a relocation if needed. 
if (IsResolved && - getBackend().shouldForceRelocation(*this, Fixup, Target, STI)) { + getBackend().shouldForceRelocation(*this, Fixup, Target, Value, STI)) { IsResolved = false; WasForced = true; } diff --git llvm/lib/MC/WasmObjectWriter.cpp llvm/lib/MC/WasmObjectWriter.cpp index 29a8c53d350a..c5a95cb3da54 100644 --- llvm/lib/MC/WasmObjectWriter.cpp +++ llvm/lib/MC/WasmObjectWriter.cpp @@ -746,10 +746,11 @@ static void addData(SmallVectorImpl<char> &DataBytes, uint32_t WasmObjectWriter::getRelocationIndexValue(const WasmRelocationEntry &RelEntry) { if (RelEntry.Type == wasm::R_WASM_TYPE_INDEX_LEB) { - if (!TypeIndices.count(RelEntry.Symbol)) + auto It = TypeIndices.find(RelEntry.Symbol); + if (It == TypeIndices.end()) report_fatal_error("symbol not found in type index space: " + RelEntry.Symbol->getName()); - return TypeIndices[RelEntry.Symbol]; + return It->second; } return RelEntry.Symbol->getIndex(); @@ -1019,7 +1020,7 @@ void WasmObjectWriter::writeElemSection( encodeSLEB128(InitialTableOffset, W->OS); W->OS << char(wasm::WASM_OPCODE_END); - if (Flags & wasm::WASM_ELEM_SEGMENT_MASK_HAS_ELEM_KIND) { + if (Flags & wasm::WASM_ELEM_SEGMENT_MASK_HAS_ELEM_DESC) { // We only write active function table initializers, for which the elem kind // is specified to be written as 0x00 and interpreted to mean "funcref". const uint8_t ElemKind = 0; diff --git llvm/lib/MC/WinCOFFObjectWriter.cpp llvm/lib/MC/WinCOFFObjectWriter.cpp index da0c0661117b..f79c374640c2 100644 --- llvm/lib/MC/WinCOFFObjectWriter.cpp +++ llvm/lib/MC/WinCOFFObjectWriter.cpp @@ -773,7 +773,10 @@ void WinCOFFWriter::assignFileOffsets(MCAssembler &Asm) { for (auto &Relocation : Sec->Relocations) { assert(Relocation.Symb->getIndex() != -1); - Relocation.Data.SymbolTableIndex = Relocation.Symb->getIndex(); + if (Header.Machine != COFF::IMAGE_FILE_MACHINE_R4000 || + Relocation.Data.Type != COFF::IMAGE_REL_MIPS_PAIR) { + Relocation.Data.SymbolTableIndex = Relocation.Symb->getIndex(); + } } } @@ -976,8 +979,18 @@ void WinCOFFWriter::recordRelocation(MCAssembler &Asm, if (Fixup.getKind() == FK_SecRel_2) FixedValue = 0; - if (OWriter.TargetObjectWriter->recordRelocation(Fixup)) + if (OWriter.TargetObjectWriter->recordRelocation(Fixup)) { Sec->Relocations.push_back(Reloc); + if (Header.Machine == COFF::IMAGE_FILE_MACHINE_R4000 && + (Reloc.Data.Type == COFF::IMAGE_REL_MIPS_REFHI || + Reloc.Data.Type == COFF::IMAGE_REL_MIPS_SECRELHI)) { + // IMAGE_REL_MIPS_REFHI and IMAGE_REL_MIPS_SECRELHI *must* + // be followed by IMAGE_REL_MIPS_PAIR + auto RelocPair = Reloc; + RelocPair.Data.Type = COFF::IMAGE_REL_MIPS_PAIR; + Sec->Relocations.push_back(RelocPair); + } + } } static std::time_t getTime() { diff --git llvm/lib/Object/GOFFObjectFile.cpp llvm/lib/Object/GOFFObjectFile.cpp index e3c4383d27aa..db1e7e704f62 100644 --- llvm/lib/Object/GOFFObjectFile.cpp +++ llvm/lib/Object/GOFFObjectFile.cpp @@ -190,8 +190,8 @@ const uint8_t *GOFFObjectFile::getSymbolEsdRecord(DataRefImpl Symb) const { } Expected<StringRef> GOFFObjectFile::getSymbolName(DataRefImpl Symb) const { - if (EsdNamesCache.count(Symb.d.a)) { - auto &StrPtr = EsdNamesCache[Symb.d.a]; + if (auto It = EsdNamesCache.find(Symb.d.a); It != EsdNamesCache.end()) { + auto &StrPtr = It->second; return StringRef(StrPtr.second.get(), StrPtr.first); } @@ -459,8 +459,8 @@ uint64_t GOFFObjectFile::getSectionSize(DataRefImpl Sec) const { // a contiguous sequence of bytes. 
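The GOFFObjectFile hunk that continues below applies the same map-lookup cleanup this patch makes in ELFObjectWriter, WasmObjectWriter, and VirtualFileSystem: a single find() replaces a count() check followed by a second lookup through operator[]. A distilled sketch of the idiom (the wrapper function is hypothetical):

    #include "llvm/ADT/DenseMap.h"

    // One hash lookup instead of two: if (M.count(Key)) use M[Key];
    int lookupOr(llvm::DenseMap<int, int> &M, int Key, int Default) {
      if (auto It = M.find(Key); It != M.end())
        return It->second;
      return Default;
    }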
Expected<ArrayRef<uint8_t>> GOFFObjectFile::getSectionContents(DataRefImpl Sec) const { - if (SectionDataCache.count(Sec.d.a)) { - auto &Buf = SectionDataCache[Sec.d.a]; + if (auto It = SectionDataCache.find(Sec.d.a); It != SectionDataCache.end()) { + auto &Buf = It->second; return ArrayRef<uint8_t>(Buf); } uint64_t SectionSize = getSectionSize(Sec); diff --git llvm/lib/Object/WasmObjectFile.cpp llvm/lib/Object/WasmObjectFile.cpp index 2c9b878a4cde..0f6fd5612f9d 100644 --- llvm/lib/Object/WasmObjectFile.cpp +++ llvm/lib/Object/WasmObjectFile.cpp @@ -1440,15 +1440,20 @@ Error WasmObjectFile::parseExportSection(ReadContext &Ctx) { Info.Flags = 0; switch (Ex.Kind) { case wasm::WASM_EXTERNAL_FUNCTION: { - if (!isDefinedFunctionIndex(Ex.Index)) + if (!isValidFunctionIndex(Ex.Index)) return make_error<GenericBinaryError>("invalid function export", object_error::parse_failed); - getDefinedFunction(Ex.Index).ExportName = Ex.Name; Info.Kind = wasm::WASM_SYMBOL_TYPE_FUNCTION; Info.ElementIndex = Ex.Index; - unsigned FuncIndex = Info.ElementIndex - NumImportedFunctions; - wasm::WasmFunction &Function = Functions[FuncIndex]; - Signature = &Signatures[Function.SigIndex]; + if (isDefinedFunctionIndex(Ex.Index)) { + getDefinedFunction(Ex.Index).ExportName = Ex.Name; + unsigned FuncIndex = Info.ElementIndex - NumImportedFunctions; + wasm::WasmFunction &Function = Functions[FuncIndex]; + Signature = &Signatures[Function.SigIndex]; + } + // Else the function is imported. LLVM object files don't use this + // pattern and we still treat this as an undefined symbol, but we want to + // parse it without crashing. break; } case wasm::WASM_EXTERNAL_GLOBAL: { @@ -1645,17 +1650,25 @@ Error WasmObjectFile::parseElemSection(ReadContext &Ctx) { return make_error<GenericBinaryError>( "Unsupported flags for element segment", object_error::parse_failed); - bool IsPassive = (Segment.Flags & wasm::WASM_ELEM_SEGMENT_IS_PASSIVE) != 0; - bool IsDeclarative = - IsPassive && (Segment.Flags & wasm::WASM_ELEM_SEGMENT_IS_DECLARATIVE); + wasm::ElemSegmentMode Mode; + if ((Segment.Flags & wasm::WASM_ELEM_SEGMENT_IS_PASSIVE) == 0) { + Mode = wasm::ElemSegmentMode::Active; + } else if (Segment.Flags & wasm::WASM_ELEM_SEGMENT_IS_DECLARATIVE) { + Mode = wasm::ElemSegmentMode::Declarative; + } else { + Mode = wasm::ElemSegmentMode::Passive; + } bool HasTableNumber = - !IsPassive && + Mode == wasm::ElemSegmentMode::Active && (Segment.Flags & wasm::WASM_ELEM_SEGMENT_HAS_TABLE_NUMBER); + bool HasElemKind = + (Segment.Flags & wasm::WASM_ELEM_SEGMENT_MASK_HAS_ELEM_DESC) && + !(Segment.Flags & wasm::WASM_ELEM_SEGMENT_HAS_INIT_EXPRS); + bool HasElemType = + (Segment.Flags & wasm::WASM_ELEM_SEGMENT_MASK_HAS_ELEM_DESC) && + (Segment.Flags & wasm::WASM_ELEM_SEGMENT_HAS_INIT_EXPRS); bool HasInitExprs = (Segment.Flags & wasm::WASM_ELEM_SEGMENT_HAS_INIT_EXPRS); - bool HasElemKind = - (Segment.Flags & wasm::WASM_ELEM_SEGMENT_MASK_HAS_ELEM_KIND) && - !HasInitExprs; if (HasTableNumber) Segment.TableNumber = readVaruint32(Ctx); @@ -1666,7 +1679,7 @@ Error WasmObjectFile::parseElemSection(ReadContext &Ctx) { return make_error<GenericBinaryError>("invalid TableNumber", object_error::parse_failed); - if (IsPassive || IsDeclarative) { + if (Mode != wasm::ElemSegmentMode::Active) { Segment.Offset.Extended = false; Segment.Offset.Inst.Opcode = wasm::WASM_OPCODE_I32_CONST; Segment.Offset.Inst.Value.Int32 = 0; @@ -1692,7 +1705,7 @@ Error WasmObjectFile::parseElemSection(ReadContext &Ctx) { object_error::parse_failed); Segment.ElemKind = wasm::ValType::FUNCREF; } 
- } else if (HasInitExprs) { + } else if (HasElemType) { auto ElemType = parseValType(Ctx, readVaruint32(Ctx)); Segment.ElemKind = ElemType; } else { diff --git llvm/lib/ObjectYAML/WasmEmitter.cpp llvm/lib/ObjectYAML/WasmEmitter.cpp index 817d364694b4..bd016764f586 100644 --- llvm/lib/ObjectYAML/WasmEmitter.cpp +++ llvm/lib/ObjectYAML/WasmEmitter.cpp @@ -497,7 +497,7 @@ void WasmWriter::writeSectionContent(raw_ostream &OS, writeInitExpr(OS, Segment.Offset); - if (Segment.Flags & wasm::WASM_ELEM_SEGMENT_MASK_HAS_ELEM_KIND) { + if (Segment.Flags & wasm::WASM_ELEM_SEGMENT_MASK_HAS_ELEM_DESC) { // We only support active function table initializers, for which the elem // kind is specified to be written as 0x00 and interpreted to mean // "funcref". diff --git llvm/lib/ObjectYAML/WasmYAML.cpp llvm/lib/ObjectYAML/WasmYAML.cpp index 0636e19e0535..6af66ba62be1 100644 --- llvm/lib/ObjectYAML/WasmYAML.cpp +++ llvm/lib/ObjectYAML/WasmYAML.cpp @@ -381,7 +381,7 @@ void MappingTraits<WasmYAML::ElemSegment>::mapping( Segment.Flags & wasm::WASM_ELEM_SEGMENT_HAS_TABLE_NUMBER) IO.mapOptional("TableNumber", Segment.TableNumber); if (!IO.outputting() || - Segment.Flags & wasm::WASM_ELEM_SEGMENT_MASK_HAS_ELEM_KIND) + Segment.Flags & wasm::WASM_ELEM_SEGMENT_MASK_HAS_ELEM_DESC) IO.mapOptional("ElemKind", Segment.ElemKind); // TODO: Omit "offset" for passive segments? It's neither meaningful nor // encoded. diff --git llvm/lib/SandboxIR/Instruction.cpp llvm/lib/SandboxIR/Instruction.cpp index 0a7cd95124bb..cc961418600e 100644 --- llvm/lib/SandboxIR/Instruction.cpp +++ llvm/lib/SandboxIR/Instruction.cpp @@ -926,21 +926,26 @@ void PHINode::removeIncomingValueIf(function_ref<bool(unsigned)> Predicate) { } } -CmpInst *CmpInst::create(Predicate P, Value *S1, Value *S2, InsertPosition Pos, - Context &Ctx, const Twine &Name) { +Value *CmpInst::create(Predicate P, Value *S1, Value *S2, InsertPosition Pos, + Context &Ctx, const Twine &Name) { auto &Builder = setInsertPos(Pos); - auto *LLVMI = Builder.CreateCmp(P, S1->Val, S2->Val, Name); - if (dyn_cast<llvm::ICmpInst>(LLVMI)) - return Ctx.createICmpInst(cast<llvm::ICmpInst>(LLVMI)); - return Ctx.createFCmpInst(cast<llvm::FCmpInst>(LLVMI)); -} -CmpInst *CmpInst::createWithCopiedFlags(Predicate P, Value *S1, Value *S2, - const Instruction *F, - InsertPosition Pos, Context &Ctx, - const Twine &Name) { - CmpInst *Inst = create(P, S1, S2, Pos, Ctx, Name); - cast<llvm::CmpInst>(Inst->Val)->copyIRFlags(F->Val); - return Inst; + auto *LLVMV = Builder.CreateCmp(P, S1->Val, S2->Val, Name); + // It may have been folded into a constant. 
+  if (auto *LLVMC = dyn_cast<llvm::Constant>(LLVMV))
+    return Ctx.getOrCreateConstant(LLVMC);
+  if (isa<llvm::ICmpInst>(LLVMV))
+    return Ctx.createICmpInst(cast<llvm::ICmpInst>(LLVMV));
+  return Ctx.createFCmpInst(cast<llvm::FCmpInst>(LLVMV));
+}
+
+Value *CmpInst::createWithCopiedFlags(Predicate P, Value *S1, Value *S2,
+                                      const Instruction *F, InsertPosition Pos,
+                                      Context &Ctx, const Twine &Name) {
+  Value *V = create(P, S1, S2, Pos, Ctx, Name);
+  if (auto *C = dyn_cast<Constant>(V))
+    return C;
+  cast<llvm::CmpInst>(V->Val)->copyIRFlags(F->Val);
+  return V;
 }

 Type *CmpInst::makeCmpResultType(Type *OpndType) {
diff --git llvm/lib/SandboxIR/Type.cpp llvm/lib/SandboxIR/Type.cpp
index 9ecff5f0165a..4734d51be282 100644
--- llvm/lib/SandboxIR/Type.cpp
+++ llvm/lib/SandboxIR/Type.cpp
@@ -46,8 +46,7 @@ void Type::dump() {
 #endif

 PointerType *PointerType::get(Type *ElementType, unsigned AddressSpace) {
-  return cast<PointerType>(ElementType->getContext().getType(
-      llvm::PointerType::get(ElementType->LLVMTy, AddressSpace)));
+  return get(ElementType->getContext(), AddressSpace);
 }

 PointerType *PointerType::get(Context &Ctx, unsigned AddressSpace) {
diff --git llvm/lib/Support/VirtualFileSystem.cpp llvm/lib/Support/VirtualFileSystem.cpp
index 5febdf992fbf..e489282281d2 100644
--- llvm/lib/Support/VirtualFileSystem.cpp
+++ llvm/lib/Support/VirtualFileSystem.cpp
@@ -1708,11 +1708,12 @@ class llvm::vfs::RedirectingFileSystemParser {
   // false on error
   bool checkDuplicateOrUnknownKey(yaml::Node *KeyNode, StringRef Key,
                                   DenseMap<StringRef, KeyStatus> &Keys) {
-    if (!Keys.count(Key)) {
+    auto It = Keys.find(Key);
+    if (It == Keys.end()) {
       error(KeyNode, "unknown key");
       return false;
     }
-    KeyStatus &S = Keys[Key];
+    KeyStatus &S = It->second;
     if (S.Seen) {
       error(KeyNode, Twine("duplicate key '") + Key + "'");
       return false;
diff --git llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
index 415edb189e60..abd2df301880 100644
--- llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
@@ -793,9 +793,9 @@ bool AArch64Arm64ECCallLowering::runOnModule(Module &Mod) {
   VoidTy = Type::getVoidTy(M->getContext());

   GuardFnType = FunctionType::get(PtrTy, {PtrTy, PtrTy}, false);
-  GuardFnPtrType = PointerType::get(GuardFnType, 0);
+  GuardFnPtrType = PointerType::get(M->getContext(), 0);
   DispatchFnType = FunctionType::get(PtrTy, {PtrTy, PtrTy, PtrTy}, false);
-  DispatchFnPtrType = PointerType::get(DispatchFnType, 0);
+  DispatchFnPtrType = PointerType::get(M->getContext(), 0);
   GuardFnCFGlobal =
       M->getOrInsertGlobal("__os_arm64x_check_icall_cfg", GuardFnPtrType);
   GuardFnGlobal =
diff --git llvm/lib/Target/AArch64/AArch64CollectLOH.cpp llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
index e8a4d73c671c..4d0d99bce258 100644
--- llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
+++ llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
@@ -251,7 +251,7 @@ static bool supportLoadFromLiteral(const MachineInstr &MI) {

 /// Number of GPR registers tracked by mapRegToGPRIndex()
 static const unsigned N_GPR_REGS = 31;
 /// Map register number to index from 0-30.
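The AArch64CollectLOH hunk below continues two mechanical migrations running through this patch: MCPhysReg parameters become MCRegister, and the static Register::isPhysicalRegister() helper gives way to the member predicate. Both spellings perform the same check (sketch; the function names are hypothetical):

    #include "llvm/CodeGen/Register.h"

    bool isPhysOld(llvm::Register R) {
      return llvm::Register::isPhysicalRegister(R); // static helper
    }
    bool isPhysNew(llvm::Register R) {
      return R.isPhysical();                        // member predicate
    }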
-static int mapRegToGPRIndex(MCPhysReg Reg) { +static int mapRegToGPRIndex(MCRegister Reg) { static_assert(AArch64::X28 - AArch64::X0 + 3 == N_GPR_REGS, "Number of GPRs"); static_assert(AArch64::W30 - AArch64::W0 + 1 == N_GPR_REGS, "Number of GPRs"); if (AArch64::X0 <= Reg && Reg <= AArch64::X28) diff --git llvm/lib/Target/AArch64/AArch64Features.td llvm/lib/Target/AArch64/AArch64Features.td index ffc2d27a57c9..0a91edb4c166 100644 --- llvm/lib/Target/AArch64/AArch64Features.td +++ llvm/lib/Target/AArch64/AArch64Features.td @@ -859,8 +859,8 @@ def HasV8_6aOps : Architecture64<8, 6, "a", "v8.6a", FeatureEnhancedCounterVirtualization, FeatureMatMulInt8], !listconcat(HasV8_5aOps.DefaultExts, [FeatureBF16, FeatureMatMulInt8])>; def HasV8_7aOps : Architecture64<8, 7, "a", "v8.7a", - [HasV8_6aOps, FeatureXS, FeatureWFxT, FeatureHCX, FeatureSPE_EEF], - !listconcat(HasV8_6aOps.DefaultExts, [FeatureWFxT])>; + [HasV8_6aOps, FeatureXS, FeatureWFxT, FeatureHCX], + !listconcat(HasV8_6aOps.DefaultExts, [FeatureWFxT, FeatureSPE_EEF])>; def HasV8_8aOps : Architecture64<8, 8, "a", "v8.8a", [HasV8_7aOps, FeatureHBC, FeatureMOPS, FeatureNMI], !listconcat(HasV8_7aOps.DefaultExts, [FeatureMOPS, FeatureHBC])>; @@ -875,17 +875,19 @@ def HasV9_0aOps : Architecture64<9, 0, "a", "v9a", FeatureSVE2])>; def HasV9_1aOps : Architecture64<9, 1, "a", "v9.1a", [HasV8_6aOps, HasV9_0aOps], - !listconcat(HasV9_0aOps.DefaultExts, [FeatureBF16, FeatureMatMulInt8, FeatureRME])>; + !listconcat(HasV9_0aOps.DefaultExts, HasV8_6aOps.DefaultExts, + [FeatureRME])>; def HasV9_2aOps : Architecture64<9, 2, "a", "v9.2a", [HasV8_7aOps, HasV9_1aOps], - !listconcat(HasV9_1aOps.DefaultExts, [FeatureMEC, FeatureWFxT])>; + !listconcat(HasV9_1aOps.DefaultExts, HasV8_7aOps.DefaultExts, + [FeatureMEC])>; def HasV9_3aOps : Architecture64<9, 3, "a", "v9.3a", [HasV8_8aOps, HasV9_2aOps], - !listconcat(HasV9_2aOps.DefaultExts, [FeatureMOPS, FeatureHBC])>; + !listconcat(HasV9_2aOps.DefaultExts, HasV8_8aOps.DefaultExts, [])>; def HasV9_4aOps : Architecture64<9, 4, "a", "v9.4a", [HasV8_9aOps, HasV9_3aOps], - !listconcat(HasV9_3aOps.DefaultExts, [FeatureSPECRES2, FeatureCSSC, - FeatureRASv2, FeatureSVE2p1])>; + !listconcat(HasV9_3aOps.DefaultExts, HasV8_9aOps.DefaultExts, + [FeatureSVE2p1])>; def HasV9_5aOps : Architecture64<9, 5, "a", "v9.5a", [HasV9_4aOps, FeatureCPA], !listconcat(HasV9_4aOps.DefaultExts, [FeatureCPA, FeatureLUT, FeatureFAMINMAX])>; diff --git llvm/lib/Target/AArch64/AArch64FrameLowering.cpp llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 1582d1999ca1..eabe64361938 100644 --- llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -1195,10 +1195,9 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump( } bool AArch64FrameLowering::shouldCombineCSRLocalStackBumpInEpilogue( - MachineBasicBlock &MBB, unsigned StackBumpBytes) const { + MachineBasicBlock &MBB, uint64_t StackBumpBytes) const { if (!shouldCombineCSRLocalStackBump(*MBB.getParent(), StackBumpBytes)) return false; - if (MBB.empty()) return true; @@ -2363,7 +2362,6 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, } bool CombineSPBump = shouldCombineCSRLocalStackBumpInEpilogue(MBB, NumBytes); // Assume we can't combine the last pop with the sp restore. 
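The widening of StackBumpBytes from unsigned to uint64_t above closes a silent-truncation hazard; a tiny standalone illustration (not LLVM code) of the failure mode it rules out:

#include <cassert>
#include <cstdint>

int main() {
  uint64_t StackBumpBytes = (1ULL << 32) + 16; // a frame just over 4 GiB
  // Passing this through a typical 32-bit 'unsigned' parameter truncates it...
  unsigned Truncated = static_cast<unsigned>(StackBumpBytes);
  assert(Truncated == 16); // ...making a huge bump look trivially combinable
  return 0;
}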
- bool CombineAfterCSRBump = false; if (!CombineSPBump && PrologueSaveSize != 0) { MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator()); diff --git llvm/lib/Target/AArch64/AArch64FrameLowering.h llvm/lib/Target/AArch64/AArch64FrameLowering.h index 20445e63bcb1..8f84702f4d2b 100644 --- llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -146,7 +146,7 @@ private: int &MinCSFrameIndex, int &MaxCSFrameIndex) const; bool shouldCombineCSRLocalStackBumpInEpilogue(MachineBasicBlock &MBB, - unsigned StackBumpBytes) const; + uint64_t StackBumpBytes) const; void emitCalleeSavedGPRLocations(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const; void emitCalleeSavedSVELocations(MachineBasicBlock &MBB, diff --git llvm/lib/Target/AArch64/AArch64ISelLowering.cpp llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 0c096711bf3b..9a0bb7308798 100644 --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -9702,7 +9702,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, bool AArch64TargetLowering::CanLowerReturn( CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, - const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { + const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context, + const Type *RetTy) const { CCAssignFn *RetCC = CCAssignFnForReturn(CallConv); SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); @@ -24898,16 +24899,31 @@ static SDValue reassociateCSELOperandsForCSE(SDNode *N, SelectionDAG &DAG) { SDValue SubsNode = N->getOperand(3); if (SubsNode.getOpcode() != AArch64ISD::SUBS || !SubsNode.hasOneUse()) return SDValue(); - auto *CmpOpConst = dyn_cast<ConstantSDNode>(SubsNode.getOperand(1)); - if (!CmpOpConst) - return SDValue(); + SDValue CmpOpToMatch = SubsNode.getOperand(1); SDValue CmpOpOther = SubsNode.getOperand(0); EVT VT = N->getValueType(0); + unsigned ExpectedOpcode; + SDValue ExpectedOp; + SDValue SubsOp; + auto *CmpOpConst = dyn_cast<ConstantSDNode>(CmpOpToMatch); + if (CmpOpConst) { + ExpectedOpcode = ISD::ADD; + ExpectedOp = + DAG.getConstant(-CmpOpConst->getAPIntValue(), SDLoc(CmpOpConst), + CmpOpConst->getValueType(0)); + SubsOp = DAG.getConstant(CmpOpConst->getAPIntValue(), SDLoc(CmpOpConst), + CmpOpConst->getValueType(0)); + } else { + ExpectedOpcode = ISD::SUB; + ExpectedOp = CmpOpToMatch; + SubsOp = CmpOpToMatch; + } + // Get the operand that can be reassociated with the SUBS instruction. - auto GetReassociationOp = [&](SDValue Op, APInt ExpectedConst) { - if (Op.getOpcode() != ISD::ADD) + auto GetReassociationOp = [&](SDValue Op, SDValue ExpectedOp) { + if (Op.getOpcode() != ExpectedOpcode) return SDValue(); if (Op.getOperand(0).getOpcode() != ISD::ADD || !Op.getOperand(0).hasOneUse()) @@ -24918,24 +24934,21 @@ static SDValue reassociateCSELOperandsForCSE(SDNode *N, SelectionDAG &DAG) { std::swap(X, Y); if (X != CmpOpOther) return SDValue(); - auto *AddOpConst = dyn_cast<ConstantSDNode>(Op.getOperand(1)); - if (!AddOpConst || AddOpConst->getAPIntValue() != ExpectedConst) + if (ExpectedOp != Op.getOperand(1)) return SDValue(); return Y; }; // Try the reassociation using the given constant and condition code. 
- auto Fold = [&](APInt NewCmpConst, AArch64CC::CondCode NewCC) { - APInt ExpectedConst = -NewCmpConst; - SDValue TReassocOp = GetReassociationOp(N->getOperand(0), ExpectedConst); - SDValue FReassocOp = GetReassociationOp(N->getOperand(1), ExpectedConst); + auto Fold = [&](AArch64CC::CondCode NewCC, SDValue ExpectedOp, + SDValue SubsOp) { + SDValue TReassocOp = GetReassociationOp(N->getOperand(0), ExpectedOp); + SDValue FReassocOp = GetReassociationOp(N->getOperand(1), ExpectedOp); if (!TReassocOp && !FReassocOp) return SDValue(); SDValue NewCmp = DAG.getNode(AArch64ISD::SUBS, SDLoc(SubsNode), - DAG.getVTList(VT, MVT_CC), CmpOpOther, - DAG.getConstant(NewCmpConst, SDLoc(CmpOpConst), - CmpOpConst->getValueType(0))); + DAG.getVTList(VT, MVT_CC), CmpOpOther, SubsOp); auto Reassociate = [&](SDValue ReassocOp, unsigned OpNum) { if (!ReassocOp) @@ -24957,9 +24970,19 @@ static SDValue reassociateCSELOperandsForCSE(SDNode *N, SelectionDAG &DAG) { // First, try to eliminate the compare instruction by searching for a // subtraction with the same constant. - if (SDValue R = Fold(CmpOpConst->getAPIntValue(), CC)) + if (SDValue R = Fold(CC, ExpectedOp, SubsOp)) return R; + if (!CmpOpConst) { + // Try again with the operands of the SUBS instruction and the condition + // swapped. Due to canonicalization, this only helps for non-constant + // operands of the SUBS instruction. + std::swap(CmpOpToMatch, CmpOpOther); + if (SDValue R = Fold(getSwappedCondition(CC), CmpOpToMatch, CmpOpToMatch)) + return R; + return SDValue(); + } + if ((CC == AArch64CC::EQ || CC == AArch64CC::NE) && !CmpOpConst->isZero()) return SDValue(); @@ -24971,7 +24994,11 @@ static SDValue reassociateCSELOperandsForCSE(SDNode *N, SelectionDAG &DAG) { // them here but check for them nevertheless to be on the safe side. auto CheckedFold = [&](bool Check, APInt NewCmpConst, AArch64CC::CondCode NewCC) { - return Check ? Fold(NewCmpConst, NewCC) : SDValue(); + auto ExpectedOp = DAG.getConstant(-NewCmpConst, SDLoc(CmpOpConst), + CmpOpConst->getValueType(0)); + auto SubsOp = DAG.getConstant(NewCmpConst, SDLoc(CmpOpConst), + CmpOpConst->getValueType(0)); + return Check ? Fold(NewCC, ExpectedOp, SubsOp) : SDValue(); }; switch (CC) { case AArch64CC::EQ: @@ -25026,6 +25053,30 @@ static SDValue performCSELCombine(SDNode *N, if (SDValue Folded = foldCSELofCTTZ(N, DAG)) return Folded; + // CSEL a, b, cc, SUBS(x, y) -> CSEL a, b, swapped(cc), SUBS(y, x) + // if SUB(y, x) already exists and we can produce a swapped predicate for cc. 
+ SDValue Cond = N->getOperand(3); + if (DCI.isAfterLegalizeDAG() && Cond.getOpcode() == AArch64ISD::SUBS && + Cond.hasOneUse() && Cond->hasNUsesOfValue(0, 0) && + DAG.doesNodeExist(ISD::SUB, N->getVTList(), + {Cond.getOperand(1), Cond.getOperand(0)}) && + !DAG.doesNodeExist(ISD::SUB, N->getVTList(), + {Cond.getOperand(0), Cond.getOperand(1)}) && + !isNullConstant(Cond.getOperand(1))) { + AArch64CC::CondCode OldCond = + static_cast<AArch64CC::CondCode>(N->getConstantOperandVal(2)); + AArch64CC::CondCode NewCond = getSwappedCondition(OldCond); + if (NewCond != AArch64CC::AL) { + SDLoc DL(N); + SDValue Sub = DAG.getNode(AArch64ISD::SUBS, DL, Cond->getVTList(), + Cond.getOperand(1), Cond.getOperand(0)); + return DAG.getNode(AArch64ISD::CSEL, DL, N->getVTList(), N->getOperand(0), + N->getOperand(1), + DAG.getConstant(NewCond, DL, MVT::i32), + Sub.getValue(1)); + } + } + return performCONDCombine(N, DCI, DAG, 2, 3); } diff --git llvm/lib/Target/AArch64/AArch64ISelLowering.h llvm/lib/Target/AArch64/AArch64ISelLowering.h index 85b62be5dd30..61579de50db1 100644 --- llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -1103,7 +1103,7 @@ private: bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, - LLVMContext &Context) const override; + LLVMContext &Context, const Type *RetTy) const override; SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, diff --git llvm/lib/Target/AArch64/AArch64InstrFormats.td llvm/lib/Target/AArch64/AArch64InstrFormats.td index 1ff8b77f88e2..6a3a9492e031 100644 --- llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -9053,22 +9053,19 @@ class SIMDThreeSameVectorBF16MatrixMul<string asm> let mayRaiseFPException = 1, Uses = [FPCR] in class SIMD_BFCVTN - : BaseSIMDMixedTwoVector<0, 0, 0b10, 0b10110, V128, V128, + : BaseSIMDMixedTwoVector<0, 0, 0b10, 0b10110, V128, V64, "bfcvtn", ".4h", ".4s", - [(set (v8bf16 V128:$Rd), - (int_aarch64_neon_bfcvtn (v4f32 V128:$Rn)))]>; + [(set (v4bf16 V64:$Rd), (any_fpround (v4f32 V128:$Rn)))]>; let mayRaiseFPException = 1, Uses = [FPCR] in class SIMD_BFCVTN2 : BaseSIMDMixedTwoVectorTied<1, 0, 0b10, 0b10110, V128, V128, - "bfcvtn2", ".8h", ".4s", - [(set (v8bf16 V128:$dst), - (int_aarch64_neon_bfcvtn2 (v8bf16 V128:$Rd), (v4f32 V128:$Rn)))]>; + "bfcvtn2", ".8h", ".4s", []>; let mayRaiseFPException = 1, Uses = [FPCR] in class BF16ToSinglePrecision<string asm> : I<(outs FPR16:$Rd), (ins FPR32:$Rn), asm, "\t$Rd, $Rn", "", - [(set (bf16 FPR16:$Rd), (int_aarch64_neon_bfcvt (f32 FPR32:$Rn)))]>, + [(set (bf16 FPR16:$Rd), (any_fpround (f32 FPR32:$Rn)))]>, Sched<[WriteFCvt]> { bits<5> Rd; bits<5> Rn; diff --git llvm/lib/Target/AArch64/AArch64InstrInfo.cpp llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index a2fd4963db10..6b8a7e9559e0 100644 --- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -4830,7 +4830,7 @@ static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB, if (!SubIdx) return MIB.addReg(Reg, State); - if (Register::isPhysicalRegister(Reg)) + if (Reg.isPhysical()) return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State); return MIB.addReg(Reg, State, SubIdx); } diff --git llvm/lib/Target/AArch64/AArch64InstrInfo.td llvm/lib/Target/AArch64/AArch64InstrInfo.td index 8215f3a4fdae..8e575abf83d4 100644 --- llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ 
llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -1454,8 +1454,8 @@ def BFMLALTIdx : SIMDBF16MLALIndex<1, "bfmlalt", int_aarch64_neon_bfmlalt>; def BFCVTN : SIMD_BFCVTN; def BFCVTN2 : SIMD_BFCVTN2; -def : Pat<(v4bf16 (any_fpround (v4f32 V128:$Rn))), - (EXTRACT_SUBREG (BFCVTN V128:$Rn), dsub)>; +def : Pat<(concat_vectors (v4bf16 V64:$Rd), (any_fpround (v4f32 V128:$Rn))), + (BFCVTN2 (v8bf16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub)), V128:$Rn)>; // Vector-scalar BFDOT: // The second source operand of the 64-bit variant of BF16DOTlane is a 128-bit @@ -1477,8 +1477,6 @@ def : Pat<(v2f32 (int_aarch64_neon_bfdot let Predicates = [HasNEONandIsStreamingSafe, HasBF16] in { def BFCVT : BF16ToSinglePrecision<"bfcvt">; -// Round FP32 to BF16. -def : Pat<(bf16 (any_fpround (f32 FPR32:$Rn))), (BFCVT $Rn)>; } // ARMv8.6A AArch64 matrix multiplication @@ -2030,6 +2028,8 @@ let Predicates = [HasPAuthLR] in { // opcode2, opcode, asm def AUTIASPPCr : SignAuthOneReg<0b00001, 0b100100, "autiasppcr">; def AUTIBSPPCr : SignAuthOneReg<0b00001, 0b100101, "autibsppcr">; + } + let Defs = [X17], Uses = [X15, X16, X17] in { // opcode2, opcode, asm def PACIA171615 : SignAuthFixedRegs<0b00001, 0b100010, "pacia171615">; def PACIB171615 : SignAuthFixedRegs<0b00001, 0b100011, "pacib171615">; @@ -10410,9 +10410,11 @@ multiclass PromoteUnaryv8f16Tov4f32<SDPatternOperator InOp, Instruction OutInst> let Predicates = [HasBF16] in def : Pat<(InOp (v8bf16 V128:$Rn)), (v8bf16 (BFCVTN2 - (v8bf16 (BFCVTN - (v4f32 (OutInst - (v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))))))), + (INSERT_SUBREG (IMPLICIT_DEF), + (v4bf16 (BFCVTN + (v4f32 (OutInst + (v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))))))), + dsub), (v4f32 (OutInst (v4f32 (SHLLv8i16 V128:$Rn))))))>; let Predicates = [HasNoBF16] in @@ -10447,10 +10449,12 @@ multiclass PromoteBinaryv8f16Tov4f32<SDPatternOperator InOp, Instruction OutInst let Predicates = [HasBF16] in def : Pat<(InOp (v8bf16 V128:$Rn), (v8bf16 V128:$Rm)), (v8bf16 (BFCVTN2 - (v8bf16 (BFCVTN - (v4f32 (OutInst - (v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))), - (v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rm, dsub)))))))), + (INSERT_SUBREG (IMPLICIT_DEF), + (v4bf16 (BFCVTN + (v4f32 (OutInst + (v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))), + (v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rm, dsub)))))))), + dsub), (v4f32 (OutInst (v4f32 (SHLLv8i16 V128:$Rn)), (v4f32 (SHLLv8i16 V128:$Rm))))))>; diff --git llvm/lib/Target/AArch64/AArch64Processors.td llvm/lib/Target/AArch64/AArch64Processors.td index 364ab0d82bf8..2de8d4637d37 100644 --- llvm/lib/Target/AArch64/AArch64Processors.td +++ llvm/lib/Target/AArch64/AArch64Processors.td @@ -229,6 +229,7 @@ def TuneX2 : SubtargetFeature<"cortex-x2", "ARMProcFamily", "CortexX2", FeatureALULSLFast, FeaturePostRAScheduler, FeatureEnableSelectOptimize, + FeatureUseFixedOverScalableIfEqualCost, FeaturePredictableSelectIsExpensive]>; def TuneX3 : SubtargetFeature<"cortex-x3", "ARMProcFamily", "CortexX3", @@ -238,6 +239,7 @@ def TuneX3 : SubtargetFeature<"cortex-x3", "ARMProcFamily", "CortexX3", FeatureFuseAES, FeaturePostRAScheduler, FeatureEnableSelectOptimize, + FeatureUseFixedOverScalableIfEqualCost, FeaturePredictableSelectIsExpensive]>; def TuneX4 : SubtargetFeature<"cortex-x4", "ARMProcFamily", "CortexX4", @@ -247,6 +249,7 @@ def TuneX4 : SubtargetFeature<"cortex-x4", "ARMProcFamily", "CortexX4", FeatureFuseAES, FeaturePostRAScheduler, FeatureEnableSelectOptimize, + FeatureUseFixedOverScalableIfEqualCost, 
FeaturePredictableSelectIsExpensive]>; def TuneX925 : SubtargetFeature<"cortex-x925", "ARMProcFamily", @@ -256,6 +259,7 @@ def TuneX925 : SubtargetFeature<"cortex-x925", "ARMProcFamily", FeatureFuseAES, FeaturePostRAScheduler, FeatureEnableSelectOptimize, + FeatureUseFixedOverScalableIfEqualCost, FeaturePredictableSelectIsExpensive]>; def TuneA64FX : SubtargetFeature<"a64fx", "ARMProcFamily", "A64FX", @@ -363,6 +367,7 @@ def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15", FeatureArithmeticCbzFusion, FeatureDisableLatencySchedHeuristic, FeatureFuseAddress, + FeatureFuseAdrpAdd, FeatureFuseAES, FeatureFuseArithmeticLogic, FeatureFuseCCSelect, @@ -413,6 +418,7 @@ def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4", FeatureArithmeticCbzFusion, FeatureDisableLatencySchedHeuristic, FeatureFuseAddress, + FeatureFuseAdrpAdd, FeatureFuseAES, FeatureFuseArithmeticLogic, FeatureFuseCCSelect, @@ -923,7 +929,8 @@ def ProcessorFeatures { FeatureComplxNum, FeatureCRC, FeatureJS, FeatureLSE, FeaturePAuth, FeatureFPAC, FeatureRAS, FeatureRCPC, FeatureRDM, - FeatureDotProd, FeatureMatMulInt8]; + FeatureDotProd, FeatureMatMulInt8, + FeatureSPE_EEF]; list<SubtargetFeature> ExynosM3 = [HasV8_0aOps, FeatureCRC, FeatureSHA2, FeatureAES, FeaturePerfMon, FeatureNEON, FeatureFPARMv8]; list<SubtargetFeature> ExynosM4 = [HasV8_2aOps, FeatureSHA2, FeatureAES, FeatureDotProd, diff --git llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 27c88a55919e..6d5e2697160a 100644 --- llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -4277,8 +4277,8 @@ let Predicates = [HasSVE2p2_or_SME2p2] in { defm FCVTZS_ZPzZ : sve_fp_z2op_p_zd_d<0b0, "fcvtzs", "int_aarch64_sve_fcvtzs", AArch64fcvtzs_mt>; defm FCVTZU_ZPzZ : sve_fp_z2op_p_zd_d<0b1, "fcvtzu", "int_aarch64_sve_fcvtzu", AArch64fcvtzu_mt>; // Integer convert to floating-point, zeroing predicate - defm SCVTF_ZPzZ : sve_fp_z2op_p_zd_c<0b0, "scvtf">; - defm UCVTF_ZPzZ : sve_fp_z2op_p_zd_c<0b1, "ucvtf">; + defm SCVTF_ZPzZ : sve_fp_z2op_p_zd_c<0b0, "scvtf", "int_aarch64_sve_scvtf", AArch64scvtf_mt>; + defm UCVTF_ZPzZ : sve_fp_z2op_p_zd_c<0b1, "ucvtf", "int_aarch64_sve_ucvtf", AArch64ucvtf_mt>; // Signed integer base 2 logarithm of fp value, zeroing predicate defm FLOGB_ZPzZ : sve_fp_z2op_p_zd_d_flogb<"flogb">; @@ -4314,11 +4314,11 @@ let Predicates = [HasSVE2p2_or_SME2p2] in { defm FSQRT_ZPZz : sve_fp_z2op_p_zd_hsd<0b01101, "fsqrt">; // SVE2p2 integer unary arithmetic (bitwise), zeroing predicate - defm CLS_ZPzZ : sve_int_un_pred_arit_bitwise_z<0b000, "cls">; - defm CLZ_ZPzZ : sve_int_un_pred_arit_bitwise_z<0b001, "clz">; - defm CNT_ZPzZ : sve_int_un_pred_arit_bitwise_z<0b010, "cnt">; - defm CNOT_ZPzZ : sve_int_un_pred_arit_bitwise_z<0b011, "cnot">; - defm NOT_ZPzZ : sve_int_un_pred_arit_bitwise_z<0b110, "not">; + defm CLS_ZPzZ : sve_int_un_pred_arit_bitwise_z<0b000, "cls", AArch64cls_mt>; + defm CLZ_ZPzZ : sve_int_un_pred_arit_bitwise_z<0b001, "clz", AArch64clz_mt>; + defm CNT_ZPzZ : sve_int_un_pred_arit_bitwise_z<0b010, "cnt", AArch64cnt_mt>; + defm CNOT_ZPzZ : sve_int_un_pred_arit_bitwise_z<0b011, "cnot", AArch64cnot_mt>; + defm NOT_ZPzZ : sve_int_un_pred_arit_bitwise_z<0b110, "not", AArch64not_mt>; // floating point defm FABS_ZPzZ : sve_int_un_pred_arit_bitwise_fp_z<0b100, "fabs", AArch64fabs_mt>; diff --git llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 
7f10bfed739b..cd093317275e 100644 --- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -3623,7 +3623,13 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( // so the cost can be cheaper (smull or umull). if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args)) return LT.first; - return LT.first * 14; + return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() * + (getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind) + + getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind, -1, + nullptr, nullptr) * + 2 + + getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1, + nullptr, nullptr)); case ISD::ADD: case ISD::XOR: case ISD::OR: @@ -4664,6 +4670,66 @@ InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) { return LegalizationCost * LT.first; } +InstructionCost AArch64TTIImpl::getPartialReductionCost( + unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, + ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, + TTI::PartialReductionExtendKind OpBExtend, + std::optional<unsigned> BinOp) const { + InstructionCost Invalid = InstructionCost::getInvalid(); + InstructionCost Cost(TTI::TCC_Basic); + + if (Opcode != Instruction::Add) + return Invalid; + + if (InputTypeA != InputTypeB) + return Invalid; + + EVT InputEVT = EVT::getEVT(InputTypeA); + EVT AccumEVT = EVT::getEVT(AccumType); + + if (VF.isScalable() && !ST->isSVEorStreamingSVEAvailable()) + return Invalid; + if (VF.isFixed() && (!ST->isNeonAvailable() || !ST->hasDotProd())) + return Invalid; + + if (InputEVT == MVT::i8) { + switch (VF.getKnownMinValue()) { + default: + return Invalid; + case 8: + if (AccumEVT == MVT::i32) + Cost *= 2; + else if (AccumEVT != MVT::i64) + return Invalid; + break; + case 16: + if (AccumEVT == MVT::i64) + Cost *= 2; + else if (AccumEVT != MVT::i32) + return Invalid; + break; + } + } else if (InputEVT == MVT::i16) { + // FIXME: Allow i32 accumulator but increase cost, as we would extend + // it to i64. + if (VF.getKnownMinValue() != 8 || AccumEVT != MVT::i64) + return Invalid; + } else + return Invalid; + + // AArch64 supports lowering mixed extensions to a usdot but only if the + // i8mm or sve/streaming features are available. 
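+  // For example (sketch): a partial reduction of mul(zext(<16 x i8>),
+  // sext(<16 x i8>)) into an i32 accumulator can lower to USDOT, but only
+  // with +i8mm on NEON or when SVE/streaming SVE is available.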
+ if (OpAExtend == TTI::PR_None || OpBExtend == TTI::PR_None || + (OpAExtend != OpBExtend && !ST->hasMatMulInt8() && + !ST->isSVEorStreamingSVEAvailable())) + return Invalid; + + if (!BinOp || *BinOp != Instruction::Mul) + return Invalid; + + return Cost; +} + InstructionCost AArch64TTIImpl::getShuffleCost( TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, diff --git llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 1eb805ae00b1..b65e3c7a1ab2 100644 --- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -367,62 +367,7 @@ public: Type *AccumType, ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, TTI::PartialReductionExtendKind OpBExtend, - std::optional<unsigned> BinOp) const { - - InstructionCost Invalid = InstructionCost::getInvalid(); - InstructionCost Cost(TTI::TCC_Basic); - - if (Opcode != Instruction::Add) - return Invalid; - - if (InputTypeA != InputTypeB) - return Invalid; - - EVT InputEVT = EVT::getEVT(InputTypeA); - EVT AccumEVT = EVT::getEVT(AccumType); - - if (VF.isScalable() && !ST->isSVEorStreamingSVEAvailable()) - return Invalid; - if (VF.isFixed() && (!ST->isNeonAvailable() || !ST->hasDotProd())) - return Invalid; - - if (InputEVT == MVT::i8) { - switch (VF.getKnownMinValue()) { - default: - return Invalid; - case 8: - if (AccumEVT == MVT::i32) - Cost *= 2; - else if (AccumEVT != MVT::i64) - return Invalid; - break; - case 16: - if (AccumEVT == MVT::i64) - Cost *= 2; - else if (AccumEVT != MVT::i32) - return Invalid; - break; - } - } else if (InputEVT == MVT::i16) { - // FIXME: Allow i32 accumulator but increase cost, as we would extend - // it to i64. - if (VF.getKnownMinValue() != 8 || AccumEVT != MVT::i64) - return Invalid; - } else - return Invalid; - - // AArch64 supports lowering mixed extensions to a usdot but only if the - // i8mm or sve/streaming features are available. 
- if (OpAExtend == TTI::PR_None || OpBExtend == TTI::PR_None || - (OpAExtend != OpBExtend && !ST->hasMatMulInt8() && - !ST->isSVEorStreamingSVEAvailable())) - return Invalid; - - if (!BinOp || *BinOp != Instruction::Mul) - return Invalid; - - return Cost; - } + std::optional<unsigned> BinOp) const; bool enableOrderedReductions() const { return true; } diff --git llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index 3ba0f2a26828..337b81d68c93 100644 --- llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -98,7 +98,7 @@ public: unsigned getFixupKindContainereSizeInBytes(unsigned Kind) const; bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, - const MCValue &Target, + const MCValue &Target, const uint64_t Value, const MCSubtargetInfo *STI) override; }; @@ -520,6 +520,7 @@ bool AArch64AsmBackend::writeNopData(raw_ostream &OS, uint64_t Count, bool AArch64AsmBackend::shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target, + const uint64_t, const MCSubtargetInfo *STI) { unsigned Kind = Fixup.getKind(); if (Kind >= FirstLiteralRelocationKind) diff --git llvm/lib/Target/AArch64/SVEInstrFormats.td llvm/lib/Target/AArch64/SVEInstrFormats.td index 873fbf7dd346..2ee9910da507 100644 --- llvm/lib/Target/AArch64/SVEInstrFormats.td +++ llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -3306,7 +3306,7 @@ multiclass sve_fp_z2op_p_zd_d<bit U, string asm, string int_op, SDPatternOperato defm : SVE_1_Op_PassthruUndefZero_Pat<nxv2i64, ir_op, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _DtoD)>; } -multiclass sve_fp_z2op_p_zd_c<bit U, string asm> { +multiclass sve_fp_z2op_p_zd_c<bit U, string asm, string int_op, SDPatternOperator ir_op> { def _HtoH : sve_fp_z2op_p_zd<{ 0b011001, U }, asm, ZPR16, ZPR16>; def _StoH : sve_fp_z2op_p_zd<{ 0b011010, U }, asm, ZPR32, ZPR16>; def _StoS : sve_fp_z2op_p_zd<{ 0b101010, U }, asm, ZPR32, ZPR32>; @@ -3314,6 +3314,15 @@ multiclass sve_fp_z2op_p_zd_c<bit U, string asm> { def _DtoS : sve_fp_z2op_p_zd<{ 0b111010, U }, asm, ZPR64, ZPR32>; def _DtoH : sve_fp_z2op_p_zd<{ 0b011011, U }, asm, ZPR64, ZPR16>; def _DtoD : sve_fp_z2op_p_zd<{ 0b111011, U }, asm, ZPR64, ZPR64>; + + defm : SVE_3_Op_UndefZero_Pat<nxv4f32, !cast<SDPatternOperator>(int_op # _f32i64), nxv4f32, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _DtoS)>; + defm : SVE_3_Op_UndefZero_Pat<nxv2f64, !cast<SDPatternOperator>(int_op # _f64i32), nxv2f64, nxv2i1, nxv4i32, !cast<Instruction>(NAME # _StoD)>; + defm : SVE_3_Op_UndefZero_Pat<nxv8f16, !cast<SDPatternOperator>(int_op # _f16i32), nxv8f16, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _StoH)>; + defm : SVE_3_Op_UndefZero_Pat<nxv8f16, !cast<SDPatternOperator>(int_op # _f16i64), nxv8f16, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _DtoH)>; + + defm : SVE_1_Op_PassthruUndefZero_Pat<nxv8f16, ir_op, nxv8i1,nxv8i16, !cast<Instruction>(NAME # _HtoH)>; + defm : SVE_1_Op_PassthruUndefZero_Pat<nxv4f32, ir_op, nxv4i1,nxv4i32, !cast<Instruction>(NAME # _StoS)>; + defm : SVE_1_Op_PassthruUndefZero_Pat<nxv2f64, ir_op, nxv2i1,nxv2i64, !cast<Instruction>(NAME # _DtoD)>; } multiclass sve_fp_z2op_p_zd_d_flogb<string asm> { @@ -4966,11 +4975,16 @@ multiclass sve_int_un_pred_arit_bitwise<bits<3> opc, string asm, defm : SVE_1_Op_PassthruUndef_Pat<nxv2i64, op, nxv2i1, nxv2i64, !cast<Pseudo>(NAME # _D_UNDEF)>; } -multiclass sve_int_un_pred_arit_bitwise_z<bits<3> opc, string asm> { +multiclass 
sve_int_un_pred_arit_bitwise_z<bits<3> opc, string asm, SDPatternOperator op> { def _B : sve_int_un_pred_arit_z<0b00, { opc, 0b1 }, asm, ZPR8>; def _H : sve_int_un_pred_arit_z<0b01, { opc, 0b1 }, asm, ZPR16>; def _S : sve_int_un_pred_arit_z<0b10, { opc, 0b1 }, asm, ZPR32>; def _D : sve_int_un_pred_arit_z<0b11, { opc, 0b1 }, asm, ZPR64>; + + defm : SVE_1_Op_PassthruUndefZero_Pat<nxv16i8, op, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>; + defm : SVE_1_Op_PassthruUndefZero_Pat<nxv8i16, op, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>; + defm : SVE_1_Op_PassthruUndefZero_Pat<nxv4i32, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>; + defm : SVE_1_Op_PassthruUndefZero_Pat<nxv2i64, op, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>; } multiclass sve_int_un_pred_arit_bitwise_fp<bits<3> opc, string asm, diff --git llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h index b8d323649fea..9671fa3b3d92 100644 --- llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h +++ llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -306,6 +306,36 @@ inline static CondCode getInvertedCondCode(CondCode Code) { return static_cast<CondCode>(static_cast<unsigned>(Code) ^ 0x1); } +/// getSwappedCondition - assume the flags are set by MI(a,b), return +/// the condition code if we modify the instructions such that flags are +/// set by MI(b,a). +inline static CondCode getSwappedCondition(CondCode CC) { + switch (CC) { + default: + return AL; + case EQ: + return EQ; + case NE: + return NE; + case HS: + return LS; + case LO: + return HI; + case HI: + return LO; + case LS: + return HS; + case GE: + return LE; + case LT: + return GT; + case GT: + return LT; + case LE: + return GE; + } +} + /// Given a condition code, return NZCV flags that would satisfy that condition. /// The flag bits are in the format expected by the ccmp instructions. 
/// Note that many different flag settings can satisfy a given condition code, diff --git llvm/lib/Target/AMDGPU/AMDGPU.h llvm/lib/Target/AMDGPU/AMDGPU.h index 89356df39724..12a8c155d3de 100644 --- llvm/lib/Target/AMDGPU/AMDGPU.h +++ llvm/lib/Target/AMDGPU/AMDGPU.h @@ -177,7 +177,7 @@ extern char &SIShrinkInstructionsLegacyID; void initializeSIFixSGPRCopiesLegacyPass(PassRegistry &); extern char &SIFixSGPRCopiesLegacyID; -void initializeSIFixVGPRCopiesPass(PassRegistry &); +void initializeSIFixVGPRCopiesLegacyPass(PassRegistry &); extern char &SIFixVGPRCopiesID; void initializeSILowerWWMCopiesPass(PassRegistry &); @@ -216,8 +216,8 @@ extern char &SIPreEmitPeepholeID; void initializeSILateBranchLoweringPass(PassRegistry &); extern char &SILateBranchLoweringPassID; -void initializeSIOptimizeExecMaskingPass(PassRegistry &); -extern char &SIOptimizeExecMaskingID; +void initializeSIOptimizeExecMaskingLegacyPass(PassRegistry &); +extern char &SIOptimizeExecMaskingLegacyID; void initializeSIPreAllocateWWMRegsLegacyPass(PassRegistry &); extern char &SIPreAllocateWWMRegsLegacyID; diff --git llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp index 8d3eac686831..5a6868f96d97 100644 --- llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp @@ -80,10 +80,14 @@ AliasResult AMDGPUAAResult::alias(const MemoryLocation &LocA, } else if (const Argument *Arg = dyn_cast<Argument>(ObjA)) { const Function *F = Arg->getParent(); switch (F->getCallingConv()) { - case CallingConv::AMDGPU_KERNEL: + case CallingConv::AMDGPU_KERNEL: { // In the kernel function, kernel arguments won't alias to (local) // variables in shared or private address space. - return AliasResult::NoAlias; + const auto *ObjB = + getUnderlyingObject(B.Ptr->stripPointerCastsForAliasAnalysis()); + return ObjA != ObjB && isIdentifiedObject(ObjB) ? 
AliasResult::NoAlias + : AliasResult::MayAlias; + } default: // TODO: In the regular function, if that local variable in the // location B is not captured, that argument pointer won't alias to it diff --git llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.cpp llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.cpp index 067fc9817403..6554863e08c9 100644 --- llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.cpp @@ -161,7 +161,7 @@ static void instrumentAddressImpl(Module &M, IRBuilder<> &IRB, size_t AccessSizeIndex = TypeStoreSizeToSizeIndex(TypeStoreSize); Type *ShadowTy = IntegerType::get(M.getContext(), std::max(8U, TypeStoreSize >> AsanScale)); - Type *ShadowPtrTy = PointerType::get(ShadowTy, 0); + Type *ShadowPtrTy = PointerType::get(M.getContext(), 0); Value *AddrLong = IRB.CreatePtrToInt(Addr, IntptrTy); Value *ShadowPtr = memToShadow(M, IRB, IntptrTy, AddrLong, AsanScale, AsanOffset); diff --git llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 737b2f740d6f..0c151d06924d 100644 --- llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -363,6 +363,7 @@ void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) { using RIK = MCResourceInfo::ResourceInfoKind; const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F); MCSymbol *FnSym = TM.getSymbol(&F); + bool IsLocal = F.hasLocalLinkage(); auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool { int64_t Val; @@ -375,8 +376,8 @@ void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) { const uint64_t MaxScratchPerWorkitem = STM.getMaxWaveScratchSize() / STM.getWavefrontSize(); - MCSymbol *ScratchSizeSymbol = - RI.getSymbol(FnSym->getName(), RIK::RIK_PrivateSegSize, OutContext); + MCSymbol *ScratchSizeSymbol = RI.getSymbol( + FnSym->getName(), RIK::RIK_PrivateSegSize, OutContext, IsLocal); uint64_t ScratchSize; if (ScratchSizeSymbol->isVariable() && TryGetMCExprValue(ScratchSizeSymbol->getVariableValue(), ScratchSize) && @@ -389,7 +390,7 @@ void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) { // Validate addressable scalar registers (i.e., prior to added implicit // SGPRs). 
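The PointerType::get rewrites in this patch (in the Arm64EC lowering and the ASan instrumentation above, and in SandboxIR) all rely on opaque pointers, under which a pointer type is identified by its address space alone. A minimal sketch of why the two overloads are interchangeable in a recent LLVM, where opaque pointers are the default:

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
#include <cassert>

int main() {
  llvm::LLVMContext Ctx;
  // With opaque pointers the element type is ignored; both calls yield the
  // same 'ptr' (address space 0) type, so the element-type overload can be
  // replaced by the context overload.
  llvm::PointerType *P1 =
      llvm::PointerType::get(llvm::Type::getInt32Ty(Ctx), 0);
  llvm::PointerType *P2 = llvm::PointerType::get(Ctx, 0);
  assert(P1 == P2);
  return 0;
}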
MCSymbol *NumSGPRSymbol = - RI.getSymbol(FnSym->getName(), RIK::RIK_NumSGPR, OutContext); + RI.getSymbol(FnSym->getName(), RIK::RIK_NumSGPR, OutContext, IsLocal); if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && !STM.hasSGPRInitBug()) { unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs(); @@ -406,9 +407,9 @@ void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) { } MCSymbol *VCCUsedSymbol = - RI.getSymbol(FnSym->getName(), RIK::RIK_UsesVCC, OutContext); - MCSymbol *FlatUsedSymbol = - RI.getSymbol(FnSym->getName(), RIK::RIK_UsesFlatScratch, OutContext); + RI.getSymbol(FnSym->getName(), RIK::RIK_UsesVCC, OutContext, IsLocal); + MCSymbol *FlatUsedSymbol = RI.getSymbol( + FnSym->getName(), RIK::RIK_UsesFlatScratch, OutContext, IsLocal); uint64_t VCCUsed, FlatUsed, NumSgpr; if (NumSGPRSymbol->isVariable() && VCCUsedSymbol->isVariable() && @@ -435,9 +436,9 @@ void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) { } MCSymbol *NumVgprSymbol = - RI.getSymbol(FnSym->getName(), RIK::RIK_NumVGPR, OutContext); + RI.getSymbol(FnSym->getName(), RIK::RIK_NumVGPR, OutContext, IsLocal); MCSymbol *NumAgprSymbol = - RI.getSymbol(FnSym->getName(), RIK::RIK_NumAGPR, OutContext); + RI.getSymbol(FnSym->getName(), RIK::RIK_NumAGPR, OutContext, IsLocal); uint64_t NumVgpr, NumAgpr; MachineModuleInfo &MMI = @@ -655,6 +656,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); MCContext &Context = getObjFileLowering().getContext(); + bool IsLocal = MF.getFunction().hasLocalLinkage(); // FIXME: This should be an explicit check for Mesa. if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) { MCSectionELF *ConfigSection = @@ -700,20 +702,24 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { { using RIK = MCResourceInfo::ResourceInfoKind; getTargetStreamer()->EmitMCResourceInfo( - RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumVGPR, OutContext), - RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumAGPR, OutContext), - RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumSGPR, OutContext), + RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumVGPR, OutContext, + IsLocal), + RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumAGPR, OutContext, + IsLocal), + RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumSGPR, OutContext, + IsLocal), RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize, - OutContext), - RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_UsesVCC, OutContext), + OutContext, IsLocal), + RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_UsesVCC, OutContext, + IsLocal), RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_UsesFlatScratch, - OutContext), + OutContext, IsLocal), RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasDynSizedStack, - OutContext), - RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasRecursion, - OutContext), + OutContext, IsLocal), + RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasRecursion, OutContext, + IsLocal), RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasIndirectCall, - OutContext)); + OutContext, IsLocal)); } if (isVerbose()) { @@ -726,19 +732,21 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { OutStreamer->emitRawComment(" Function info:", false); emitCommonFunctionComments( - RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumVGPR, OutContext) + RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumVGPR, OutContext, + IsLocal) ->getVariableValue(), - STM.hasMAIInsts() ? 
RI.getSymbol(CurrentFnSym->getName(), - RIK::RIK_NumAGPR, OutContext) - ->getVariableValue() - : nullptr, + STM.hasMAIInsts() + ? RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumAGPR, + OutContext, IsLocal) + ->getVariableValue() + : nullptr, RI.createTotalNumVGPRs(MF, Ctx), RI.createTotalNumSGPRs( MF, MF.getSubtarget<GCNSubtarget>().getTargetID().isXnackOnOrAny(), Ctx), RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize, - OutContext) + OutContext, IsLocal) ->getVariableValue(), getFunctionCodeSize(MF), MFI); return false; @@ -927,6 +935,7 @@ static const MCExpr *computeAccumOffset(const MCExpr *NumVGPR, MCContext &Ctx) { void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, const MachineFunction &MF) { const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); + bool IsLocal = MF.getFunction().hasLocalLinkage(); MCContext &Ctx = MF.getContext(); auto CreateExpr = [&Ctx](int64_t Value) { @@ -944,7 +953,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, auto GetSymRefExpr = [&](MCResourceInfo::ResourceInfoKind RIK) -> const MCExpr * { - MCSymbol *Sym = RI.getSymbol(CurrentFnSym->getName(), RIK, OutContext); + MCSymbol *Sym = + RI.getSymbol(CurrentFnSym->getName(), RIK, OutContext, IsLocal); return MCSymbolRefExpr::create(Sym, Ctx); }; diff --git llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 9fa9cccd3e3e..6d5c3b5e0742 100644 --- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1045,7 +1045,8 @@ void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) { SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64); SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1); SDValue Ops[] = {N->getOperand(0), N->getOperand(1), Zero, Clamp}; - SDNode *Mad = CurDAG->getMachineNode(Opc, SL, N->getVTList(), Ops); + SDNode *Mad = CurDAG->getMachineNode( + Opc, SL, CurDAG->getVTList(MVT::i64, MVT::i1), Ops); if (!SDValue(N, 0).use_empty()) { SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32); SDNode *Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL, diff --git llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 9836e10c36bc..e9e47eaadd55 100644 --- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -4280,7 +4280,7 @@ bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, const TargetRegisterClass *ArgRC, LLT ArgTy) const { MCRegister SrcReg = Arg->getRegister(); - assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); + assert(SrcReg.isPhysical() && "Physical register expected"); assert(DstReg.isVirtual() && "Virtual register expected"); Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, diff --git llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp index 657a406e9f70..ccb874e6a934 100644 --- llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp +++ llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp @@ -66,6 +66,28 @@ // Atomics operations on `ptr addrspace(7)` values are not suppported, as the // hardware does not include a 160-bit atomic. // +// ## Buffer contents type legalization +// +// The underlying buffer intrinsics only support types up to 128 bits long, +// and don't support complex types. 
If buffer operations were
+// standard pointer operations that could be represented as MIR-level loads,
+// this would be handled by the various legalization schemes in instruction
+// selection. However, because we have to do the conversion from `load` and
+// `store` to intrinsics at LLVM IR level, we must perform that legalization
+// ourselves.
+//
+// This involves a combination of
+// - Converting arrays to vectors where possible
+// - Otherwise, splitting loads and stores of aggregates into loads/stores of
+//   each component.
+// - Zero-extending things to fill a whole number of bytes
+// - Casting values of types that don't neatly correspond to supported machine
+//   values (for example, an i96 or i256) into ones that would work
+//   (like <3 x i32> and <8 x i32>, respectively)
+// - Splitting values that are too long (such as the aforementioned <8 x i32>)
+//   into multiple operations.
+//
 // ## Type remapping
 //
 // We use a `ValueMapper` to mangle uses of [vectors of] buffer fat pointers
@@ -86,7 +108,6 @@
 // This phase also records intrinsics so that they can be remangled or deleted
 // later.
 //
-//
 // ## Splitting pointer structs
 //
 // The meat of this pass consists of defining semantics for operations that
@@ -218,6 +239,7 @@
 #include "llvm/IR/ReplaceConstant.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
+#include "llvm/Support/Alignment.h"
 #include "llvm/Support/AtomicOrdering.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -551,7 +573,6 @@ bool StoreFatPtrsAsIntsVisitor::visitLoadInst(LoadInst &LI) {
   auto *NLI = cast<LoadInst>(LI.clone());
   NLI->mutateType(IntTy);
   NLI = IRB.Insert(NLI);
-  copyMetadataForLoad(*NLI, LI);
   NLI->takeName(&LI);

   Value *CastBack = intsToFatPtrs(NLI, IntTy, Ty, NLI->getName());
@@ -576,6 +597,542 @@ bool StoreFatPtrsAsIntsVisitor::visitStoreInst(StoreInst &SI) {
   return true;
 }
+namespace {
+/// Convert loads/stores of types that the buffer intrinsics can't handle into
+/// one or more such loads/stores that consist of legal types.
+///
+/// Do this by
+/// 1. Recursing into structs (and arrays that don't share a memory layout with
+///    vectors) since the intrinsics can't handle complex types.
+/// 2. Converting arrays of non-aggregate, byte-sized types into their
+///    corresponding vectors
+/// 3. Bitcasting unsupported types, namely overly-long scalars and byte
+///    vectors, into vectors of supported types.
+/// 4. Splitting up excessively long reads/writes into multiple operations.
+///
+/// Note that this doesn't handle complex data structures, but, in the future,
+/// the aggregate load splitter from SROA could be refactored to allow for that
+/// case.
+class LegalizeBufferContentTypesVisitor
+    : public InstVisitor<LegalizeBufferContentTypesVisitor, bool> {
+  friend class InstVisitor<LegalizeBufferContentTypesVisitor, bool>;
+
+  IRBuilder<> IRB;
+
+  const DataLayout &DL;
+
+  /// If T is [N x U], where U is a scalar type, return the vector type
+  /// <N x U>, otherwise, return T.
+  Type *scalarArrayTypeAsVector(Type *MaybeArrayType);
+  Value *arrayToVector(Value *V, Type *TargetType, const Twine &Name);
+  Value *vectorToArray(Value *V, Type *OrigType, const Twine &Name);
+
+  /// Break up the loads of a struct into the loads of its components
+
+  /// Convert a vector or scalar type that can't be operated on by buffer
+  /// intrinsics to one that would be legal through bitcasts and/or truncation.
+  /// Uses the wider of i32, i16, or i8 where possible.
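+  /// For example (an illustrative sketch, not exhaustive):
+  ///   i96      -> <3 x i32>
+  ///   i256     -> <8 x i32>  (split into two 128-bit operations later)
+  ///   <6 x i8> -> <3 x i16>
+  ///   i6       -> i8         (zero-extended to a whole byte first)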
+  Type *legalNonAggregateFor(Type *T);
+  Value *makeLegalNonAggregate(Value *V, Type *TargetType, const Twine &Name);
+  Value *makeIllegalNonAggregate(Value *V, Type *OrigType, const Twine &Name);
+
+  struct VecSlice {
+    uint64_t Index = 0;
+    uint64_t Length = 0;
+    VecSlice() = delete;
+    // Needed for some Clangs
+    VecSlice(uint64_t Index, uint64_t Length) : Index(Index), Length(Length) {}
+  };
+  /// Return the [index, length] pairs into which `T` needs to be cut to form
+  /// legal buffer load or store operations. Clears `Slices`. Creates an empty
+  /// `Slices` for non-vector inputs and creates one slice if no slicing will be
+  /// needed.
+  void getVecSlices(Type *T, SmallVectorImpl<VecSlice> &Slices);
+
+  Value *extractSlice(Value *Vec, VecSlice S, const Twine &Name);
+  Value *insertSlice(Value *Whole, Value *Part, VecSlice S, const Twine &Name);
+
+  /// In most cases, return `LegalType`. However, when given an input that would
+  /// normally be a legal type for the buffer intrinsics to return but that
+  /// isn't hooked up through SelectionDAG, return a type of the same width that
+  /// can be used with the relevant intrinsics. Specifically, handle the cases:
+  /// - <1 x T> => T for all T
+  /// - <N x i8> <=> i16, i32, 2xi32, 4xi32 (as needed)
+  /// - <N x T> where T is under 32 bits and the total size is 96 bits <=> <3 x
+  ///   i32>
+  Type *intrinsicTypeFor(Type *LegalType);
+
+  bool visitLoadImpl(LoadInst &OrigLI, Type *PartType,
+                     SmallVectorImpl<uint32_t> &AggIdxs, uint64_t AggByteOffset,
+                     Value *&Result, const Twine &Name);
+  /// Return value is (Changed, ModifiedInPlace)
+  std::pair<bool, bool> visitStoreImpl(StoreInst &OrigSI, Type *PartType,
+                                       SmallVectorImpl<uint32_t> &AggIdxs,
+                                       uint64_t AggByteOffset,
+                                       const Twine &Name);
+
+  bool visitInstruction(Instruction &I) { return false; }
+  bool visitLoadInst(LoadInst &LI);
+  bool visitStoreInst(StoreInst &SI);
+
+public:
+  LegalizeBufferContentTypesVisitor(const DataLayout &DL, LLVMContext &Ctx)
+      : IRB(Ctx), DL(DL) {}
+  bool processFunction(Function &F);
+};
+} // namespace
+
+Type *LegalizeBufferContentTypesVisitor::scalarArrayTypeAsVector(Type *T) {
+  ArrayType *AT = dyn_cast<ArrayType>(T);
+  if (!AT)
+    return T;
+  Type *ET = AT->getElementType();
+  if (!ET->isSingleValueType() || isa<VectorType>(ET))
+    report_fatal_error("loading non-scalar arrays from buffer fat pointers "
                       "should have recursed");
+  if (!DL.typeSizeEqualsStoreSize(AT))
+    report_fatal_error(
+        "loading padded arrays from buffer fat pointers should have recursed");
+  return FixedVectorType::get(ET, AT->getNumElements());
+}
+
+Value *LegalizeBufferContentTypesVisitor::arrayToVector(Value *V,
+                                                        Type *TargetType,
+                                                        const Twine &Name) {
+  Value *VectorRes = PoisonValue::get(TargetType);
+  auto *VT = cast<FixedVectorType>(TargetType);
+  unsigned EC = VT->getNumElements();
+  for (auto I : iota_range<unsigned>(0, EC, /*Inclusive=*/false)) {
+    Value *Elem = IRB.CreateExtractValue(V, I, Name + ".elem." + Twine(I));
+    VectorRes = IRB.CreateInsertElement(VectorRes, Elem, I,
+                                        Name + ".as.vec." + Twine(I));
+  }
+  return VectorRes;
+}
+
+Value *LegalizeBufferContentTypesVisitor::vectorToArray(Value *V,
+                                                        Type *OrigType,
+                                                        const Twine &Name) {
+  Value *ArrayRes = PoisonValue::get(OrigType);
+  ArrayType *AT = cast<ArrayType>(OrigType);
+  unsigned EC = AT->getNumElements();
+  for (auto I : iota_range<unsigned>(0, EC, /*Inclusive=*/false)) {
+    Value *Elem = IRB.CreateExtractElement(V, I, Name + ".elem."
+ Twine(I)); + ArrayRes = IRB.CreateInsertValue(ArrayRes, Elem, I, + Name + ".as.array." + Twine(I)); + } + return ArrayRes; +} + +Type *LegalizeBufferContentTypesVisitor::legalNonAggregateFor(Type *T) { + TypeSize Size = DL.getTypeStoreSizeInBits(T); + // Implicitly zero-extend to the next byte if needed + if (!DL.typeSizeEqualsStoreSize(T)) + T = IRB.getIntNTy(Size.getFixedValue()); + Type *ElemTy = T->getScalarType(); + if (isa<PointerType, ScalableVectorType>(ElemTy)) { + // Pointers are always big enough, and we'll let scalable vectors through to + // fail in codegen. + return T; + } + unsigned ElemSize = DL.getTypeSizeInBits(ElemTy).getFixedValue(); + if (isPowerOf2_32(ElemSize) && ElemSize >= 16 && ElemSize <= 128) { + // [vectors of] anything that's 16/32/64/128 bits can be cast and split into + // legal buffer operations. + return T; + } + Type *BestVectorElemType = nullptr; + if (Size.isKnownMultipleOf(32)) + BestVectorElemType = IRB.getInt32Ty(); + else if (Size.isKnownMultipleOf(16)) + BestVectorElemType = IRB.getInt16Ty(); + else + BestVectorElemType = IRB.getInt8Ty(); + unsigned NumCastElems = + Size.getFixedValue() / BestVectorElemType->getIntegerBitWidth(); + if (NumCastElems == 1) + return BestVectorElemType; + return FixedVectorType::get(BestVectorElemType, NumCastElems); +} + +Value *LegalizeBufferContentTypesVisitor::makeLegalNonAggregate( + Value *V, Type *TargetType, const Twine &Name) { + Type *SourceType = V->getType(); + TypeSize SourceSize = DL.getTypeSizeInBits(SourceType); + TypeSize TargetSize = DL.getTypeSizeInBits(TargetType); + if (SourceSize != TargetSize) { + Type *ShortScalarTy = IRB.getIntNTy(SourceSize.getFixedValue()); + Type *ByteScalarTy = IRB.getIntNTy(TargetSize.getFixedValue()); + Value *AsScalar = IRB.CreateBitCast(V, ShortScalarTy, Name + ".as.scalar"); + Value *Zext = IRB.CreateZExt(AsScalar, ByteScalarTy, Name + ".zext"); + V = Zext; + SourceType = ByteScalarTy; + } + return IRB.CreateBitCast(V, TargetType, Name + ".legal"); +} + +Value *LegalizeBufferContentTypesVisitor::makeIllegalNonAggregate( + Value *V, Type *OrigType, const Twine &Name) { + Type *LegalType = V->getType(); + TypeSize LegalSize = DL.getTypeSizeInBits(LegalType); + TypeSize OrigSize = DL.getTypeSizeInBits(OrigType); + if (LegalSize != OrigSize) { + Type *ShortScalarTy = IRB.getIntNTy(OrigSize.getFixedValue()); + Type *ByteScalarTy = IRB.getIntNTy(LegalSize.getFixedValue()); + Value *AsScalar = IRB.CreateBitCast(V, ByteScalarTy, Name + ".bytes.cast"); + Value *Trunc = IRB.CreateTrunc(AsScalar, ShortScalarTy, Name + ".trunc"); + return IRB.CreateBitCast(Trunc, OrigType, Name + ".orig"); + } + return IRB.CreateBitCast(V, OrigType, Name + ".real.ty"); +} + +Type *LegalizeBufferContentTypesVisitor::intrinsicTypeFor(Type *LegalType) { + auto *VT = dyn_cast<FixedVectorType>(LegalType); + if (!VT) + return LegalType; + Type *ET = VT->getElementType(); + // Explicitly return the element type of 1-element vectors because the + // underlying intrinsics don't like <1 x T> even though it's a synonym for T. 
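+  // For example (sketch): <1 x float> -> float, <4 x i8> -> i32,
+  // <8 x i8> -> <2 x i32>, <16 x i8> -> <4 x i32>, and a 96-bit <6 x i16>
+  // -> <3 x i32>.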
+  if (VT->getNumElements() == 1)
+    return ET;
+  if (DL.getTypeSizeInBits(LegalType) == 96 && DL.getTypeSizeInBits(ET) < 32)
+    return FixedVectorType::get(IRB.getInt32Ty(), 3);
+  if (ET->isIntegerTy(8)) {
+    switch (VT->getNumElements()) {
+    default:
+      return LegalType; // Let it crash later
+    case 1:
+      return IRB.getInt8Ty();
+    case 2:
+      return IRB.getInt16Ty();
+    case 4:
+      return IRB.getInt32Ty();
+    case 8:
+      return FixedVectorType::get(IRB.getInt32Ty(), 2);
+    case 16:
+      return FixedVectorType::get(IRB.getInt32Ty(), 4);
+    }
+  }
+  return LegalType;
+}
+
+void LegalizeBufferContentTypesVisitor::getVecSlices(
+    Type *T, SmallVectorImpl<VecSlice> &Slices) {
+  Slices.clear();
+  auto *VT = dyn_cast<FixedVectorType>(T);
+  if (!VT)
+    return;
+
+  uint64_t ElemBitWidth =
+      DL.getTypeSizeInBits(VT->getElementType()).getFixedValue();
+
+  uint64_t ElemsPer4Words = 128 / ElemBitWidth;
+  uint64_t ElemsPer2Words = ElemsPer4Words / 2;
+  uint64_t ElemsPerWord = ElemsPer2Words / 2;
+  uint64_t ElemsPerShort = ElemsPerWord / 2;
+  uint64_t ElemsPerByte = ElemsPerShort / 2;
+  // If the elements evenly pack into 32-bit words, we can use 3-word stores,
+  // such as for <6 x bfloat> or <3 x i32>, but we can't do this for, for
+  // example, <3 x i64>, since that's not slicing.
+  uint64_t ElemsPer3Words = ElemsPerWord * 3;
+
+  uint64_t TotalElems = VT->getNumElements();
+  uint64_t Index = 0;
+  auto TrySlice = [&](unsigned MaybeLen) {
+    if (MaybeLen > 0 && Index + MaybeLen <= TotalElems) {
+      VecSlice Slice{/*Index=*/Index, /*Length=*/MaybeLen};
+      Slices.push_back(Slice);
+      Index += MaybeLen;
+      return true;
+    }
+    return false;
+  };
+  while (Index < TotalElems) {
+    TrySlice(ElemsPer4Words) || TrySlice(ElemsPer3Words) ||
+        TrySlice(ElemsPer2Words) || TrySlice(ElemsPerWord) ||
+        TrySlice(ElemsPerShort) || TrySlice(ElemsPerByte);
+  }
+}
+
+Value *LegalizeBufferContentTypesVisitor::extractSlice(Value *Vec, VecSlice S,
+                                                       const Twine &Name) {
+  auto *VecVT = dyn_cast<FixedVectorType>(Vec->getType());
+  if (!VecVT)
+    return Vec;
+  if (S.Length == VecVT->getNumElements() && S.Index == 0)
+    return Vec;
+  if (S.Length == 1)
+    return IRB.CreateExtractElement(Vec, S.Index,
+                                    Name + ".slice." + Twine(S.Index));
+  SmallVector<int> Mask = llvm::to_vector(
+      llvm::iota_range<int>(S.Index, S.Index + S.Length, /*Inclusive=*/false));
+  return IRB.CreateShuffleVector(Vec, Mask, Name + ".slice." + Twine(S.Index));
+}
+
+Value *LegalizeBufferContentTypesVisitor::insertSlice(Value *Whole, Value *Part,
+                                                      VecSlice S,
+                                                      const Twine &Name) {
+  auto *WholeVT = dyn_cast<FixedVectorType>(Whole->getType());
+  if (!WholeVT)
+    return Part;
+  if (S.Length == WholeVT->getNumElements() && S.Index == 0)
+    return Part;
+  if (S.Length == 1) {
+    return IRB.CreateInsertElement(Whole, Part, S.Index,
+                                   Name + ".slice." + Twine(S.Index));
+  }
+  int NumElems = cast<FixedVectorType>(Whole->getType())->getNumElements();
+
+  // Extend the slice with poisons to make the main shufflevector happy.
+  SmallVector<int> ExtPartMask(NumElems, -1);
+  for (auto [I, E] : llvm::enumerate(
+           MutableArrayRef<int>(ExtPartMask).take_front(S.Length))) {
+    E = I;
+  }
+  Value *ExtPart = IRB.CreateShuffleVector(Part, ExtPartMask,
+                                           Name + ".ext." + Twine(S.Index));
+
+  SmallVector<int> Mask =
+      llvm::to_vector(llvm::iota_range<int>(0, NumElems, /*Inclusive=*/false));
+  for (auto [I, E] :
+       llvm::enumerate(MutableArrayRef<int>(Mask).slice(S.Index, S.Length)))
+    E = I + NumElems;
+  return IRB.CreateShuffleVector(Whole, ExtPart, Mask,
+                                 Name + ".parts." + Twine(S.Index));
+}
+
+bool LegalizeBufferContentTypesVisitor::visitLoadImpl(
+    LoadInst &OrigLI, Type *PartType, SmallVectorImpl<uint32_t> &AggIdxs,
+    uint64_t AggByteOff, Value *&Result, const Twine &Name) {
+  if (auto *ST = dyn_cast<StructType>(PartType)) {
+    const StructLayout *Layout = DL.getStructLayout(ST);
+    bool Changed = false;
+    for (auto [I, ElemTy, Offset] :
+         llvm::enumerate(ST->elements(), Layout->getMemberOffsets())) {
+      AggIdxs.push_back(I);
+      Changed |= visitLoadImpl(OrigLI, ElemTy, AggIdxs,
+                               AggByteOff + Offset.getFixedValue(), Result,
+                               Name + "." + Twine(I));
+      AggIdxs.pop_back();
+    }
+    return Changed;
+  }
+  if (auto *AT = dyn_cast<ArrayType>(PartType)) {
+    Type *ElemTy = AT->getElementType();
+    if (!ElemTy->isSingleValueType() || !DL.typeSizeEqualsStoreSize(ElemTy) ||
+        ElemTy->isVectorTy()) {
+      TypeSize ElemStoreSize = DL.getTypeStoreSize(ElemTy);
+      bool Changed = false;
+      for (auto I : llvm::iota_range<uint32_t>(0, AT->getNumElements(),
+                                               /*Inclusive=*/false)) {
+        AggIdxs.push_back(I);
+        Changed |= visitLoadImpl(OrigLI, ElemTy, AggIdxs,
+                                 AggByteOff + I * ElemStoreSize.getFixedValue(),
+                                 Result, Name + Twine(I));
+        AggIdxs.pop_back();
+      }
+      return Changed;
+    }
+  }
+
+  // Typical case
+
+  Type *ArrayAsVecType = scalarArrayTypeAsVector(PartType);
+  Type *LegalType = legalNonAggregateFor(ArrayAsVecType);
+
+  SmallVector<VecSlice> Slices;
+  getVecSlices(LegalType, Slices);
+  bool HasSlices = Slices.size() > 1;
+  bool IsAggPart = !AggIdxs.empty();
+  Value *LoadsRes;
+  if (!HasSlices && !IsAggPart) {
+    Type *LoadableType = intrinsicTypeFor(LegalType);
+    if (LoadableType == PartType)
+      return false;
+
+    IRB.SetInsertPoint(&OrigLI);
+    auto *NLI = cast<LoadInst>(OrigLI.clone());
+    NLI->mutateType(LoadableType);
+    NLI = IRB.Insert(NLI);
+    NLI->setName(Name + ".loadable");
+
+    LoadsRes = IRB.CreateBitCast(NLI, LegalType, Name + ".from.loadable");
+  } else {
+    IRB.SetInsertPoint(&OrigLI);
+    LoadsRes = PoisonValue::get(LegalType);
+    Value *OrigPtr = OrigLI.getPointerOperand();
+    // If we need to split something into more than one load, its legal type
+    // will be a vector (e.g., an i256 load will have LegalType = <8 x i32>).
+    // But if we're already a scalar (which can happen if we're splitting up a
+    // struct), the element type will be the legal type itself.
+    Type *ElemType = LegalType->getScalarType();
+    unsigned ElemBytes = DL.getTypeStoreSize(ElemType);
+    AAMDNodes AANodes = OrigLI.getAAMetadata();
+    if (IsAggPart && Slices.empty())
+      Slices.push_back(VecSlice{/*Index=*/0, /*Length=*/1});
+    for (VecSlice S : Slices) {
+      Type *SliceType =
+          S.Length != 1 ? FixedVectorType::get(ElemType, S.Length) : ElemType;
+      int64_t ByteOffset = AggByteOff + S.Index * ElemBytes;
+      // You can't reasonably expect loads to wrap around the edge of memory.
+      Value *NewPtr = IRB.CreateGEP(
+          IRB.getInt8Ty(), OrigLI.getPointerOperand(), IRB.getInt32(ByteOffset),
+          OrigPtr->getName() + ".off.ptr." + Twine(ByteOffset),
+          GEPNoWrapFlags::noUnsignedWrap());
+      Type *LoadableType = intrinsicTypeFor(SliceType);
+      LoadInst *NewLI = IRB.CreateAlignedLoad(
+          LoadableType, NewPtr, commonAlignment(OrigLI.getAlign(), ByteOffset),
+          Name + ".off."
+ Twine(ByteOffset)); + copyMetadataForLoad(*NewLI, OrigLI); + NewLI->setAAMetadata( + AANodes.adjustForAccess(ByteOffset, LoadableType, DL)); + NewLI->setAtomic(OrigLI.getOrdering(), OrigLI.getSyncScopeID()); + NewLI->setVolatile(OrigLI.isVolatile()); + Value *Loaded = IRB.CreateBitCast(NewLI, SliceType, + NewLI->getName() + ".from.loadable"); + LoadsRes = insertSlice(LoadsRes, Loaded, S, Name); + } + } + if (LegalType != ArrayAsVecType) + LoadsRes = makeIllegalNonAggregate(LoadsRes, ArrayAsVecType, Name); + if (ArrayAsVecType != PartType) + LoadsRes = vectorToArray(LoadsRes, PartType, Name); + + if (IsAggPart) + Result = IRB.CreateInsertValue(Result, LoadsRes, AggIdxs, Name); + else + Result = LoadsRes; + return true; +} + +bool LegalizeBufferContentTypesVisitor::visitLoadInst(LoadInst &LI) { + if (LI.getPointerAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER) + return false; + + SmallVector<uint32_t> AggIdxs; + Type *OrigType = LI.getType(); + Value *Result = PoisonValue::get(OrigType); + bool Changed = visitLoadImpl(LI, OrigType, AggIdxs, 0, Result, LI.getName()); + if (!Changed) + return false; + Result->takeName(&LI); + LI.replaceAllUsesWith(Result); + LI.eraseFromParent(); + return Changed; +} + +std::pair<bool, bool> LegalizeBufferContentTypesVisitor::visitStoreImpl( + StoreInst &OrigSI, Type *PartType, SmallVectorImpl<uint32_t> &AggIdxs, + uint64_t AggByteOff, const Twine &Name) { + if (auto *ST = dyn_cast<StructType>(PartType)) { + const StructLayout *Layout = DL.getStructLayout(ST); + bool Changed = false; + for (auto [I, ElemTy, Offset] : + llvm::enumerate(ST->elements(), Layout->getMemberOffsets())) { + AggIdxs.push_back(I); + Changed |= std::get<0>(visitStoreImpl(OrigSI, ElemTy, AggIdxs, + AggByteOff + Offset.getFixedValue(), + Name + "." 
+ Twine(I))); + AggIdxs.pop_back(); + } + return std::make_pair(Changed, /*ModifiedInPlace=*/false); + } + if (auto *AT = dyn_cast<ArrayType>(PartType)) { + Type *ElemTy = AT->getElementType(); + if (!ElemTy->isSingleValueType() || !DL.typeSizeEqualsStoreSize(ElemTy) || + ElemTy->isVectorTy()) { + TypeSize ElemStoreSize = DL.getTypeStoreSize(ElemTy); + bool Changed = false; + for (auto I : llvm::iota_range<uint32_t>(0, AT->getNumElements(), + /*Inclusive=*/false)) { + AggIdxs.push_back(I); + Changed |= std::get<0>(visitStoreImpl( + OrigSI, ElemTy, AggIdxs, + AggByteOff + I * ElemStoreSize.getFixedValue(), Name + Twine(I))); + AggIdxs.pop_back(); + } + return std::make_pair(Changed, /*ModifiedInPlace=*/false); + } + } + + Value *OrigData = OrigSI.getValueOperand(); + Value *NewData = OrigData; + + bool IsAggPart = !AggIdxs.empty(); + if (IsAggPart) + NewData = IRB.CreateExtractValue(NewData, AggIdxs, Name); + + Type *ArrayAsVecType = scalarArrayTypeAsVector(PartType); + if (ArrayAsVecType != PartType) { + NewData = arrayToVector(NewData, ArrayAsVecType, Name); + } + + Type *LegalType = legalNonAggregateFor(ArrayAsVecType); + if (LegalType != ArrayAsVecType) { + NewData = makeLegalNonAggregate(NewData, LegalType, Name); + } + + SmallVector<VecSlice> Slices; + getVecSlices(LegalType, Slices); + bool NeedToSplit = Slices.size() > 1 || IsAggPart; + if (!NeedToSplit) { + Type *StorableType = intrinsicTypeFor(LegalType); + if (StorableType == PartType) + return std::make_pair(/*Changed=*/false, /*ModifiedInPlace=*/false); + NewData = IRB.CreateBitCast(NewData, StorableType, Name + ".storable"); + OrigSI.setOperand(0, NewData); + return std::make_pair(/*Changed=*/true, /*ModifiedInPlace=*/true); + } + + Value *OrigPtr = OrigSI.getPointerOperand(); + Type *ElemType = LegalType->getScalarType(); + if (IsAggPart && Slices.empty()) + Slices.push_back(VecSlice{/*Index=*/0, /*Length=*/1}); + unsigned ElemBytes = DL.getTypeStoreSize(ElemType); + AAMDNodes AANodes = OrigSI.getAAMetadata(); + for (VecSlice S : Slices) { + Type *SliceType = + S.Length != 1 ? FixedVectorType::get(ElemType, S.Length) : ElemType; + int64_t ByteOffset = AggByteOff + S.Index * ElemBytes; + Value *NewPtr = + IRB.CreateGEP(IRB.getInt8Ty(), OrigPtr, IRB.getInt32(ByteOffset), + OrigPtr->getName() + ".part." 
+ Twine(S.Index), + GEPNoWrapFlags::noUnsignedWrap()); + Value *DataSlice = extractSlice(NewData, S, Name); + Type *StorableType = intrinsicTypeFor(SliceType); + DataSlice = IRB.CreateBitCast(DataSlice, StorableType, + DataSlice->getName() + ".storable"); + auto *NewSI = cast<StoreInst>(OrigSI.clone()); + NewSI->setAlignment(commonAlignment(OrigSI.getAlign(), ByteOffset)); + IRB.Insert(NewSI); + NewSI->setOperand(0, DataSlice); + NewSI->setOperand(1, NewPtr); + NewSI->setAAMetadata(AANodes.adjustForAccess(ByteOffset, StorableType, DL)); + } + return std::make_pair(/*Changed=*/true, /*ModifiedInPlace=*/false); +} + +bool LegalizeBufferContentTypesVisitor::visitStoreInst(StoreInst &SI) { + if (SI.getPointerAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER) + return false; + IRB.SetInsertPoint(&SI); + SmallVector<uint32_t> AggIdxs; + Value *OrigData = SI.getValueOperand(); + auto [Changed, ModifiedInPlace] = + visitStoreImpl(SI, OrigData->getType(), AggIdxs, 0, OrigData->getName()); + if (Changed && !ModifiedInPlace) + SI.eraseFromParent(); + return Changed; +} + +bool LegalizeBufferContentTypesVisitor::processFunction(Function &F) { + bool Changed = false; + for (Instruction &I : make_early_inc_range(instructions(F))) { + Changed |= visit(I); + } + return Changed; +} + /// Return the ptr addrspace(8) and i32 (resource and offset parts) in a lowered /// buffer fat pointer constant. static std::pair<Constant *, Constant *> @@ -1766,12 +2323,16 @@ bool AMDGPULowerBufferFatPointers::run(Module &M, const TargetMachine &TM) { } StoreFatPtrsAsIntsVisitor MemOpsRewrite(&IntTM, M.getContext()); + LegalizeBufferContentTypesVisitor BufferContentsTypeRewrite(DL, + M.getContext()); for (Function &F : M.functions()) { bool InterfaceChange = hasFatPointerInterface(F, &StructTM); bool BodyChanges = containsBufferFatPointers(F, &StructTM); Changed |= MemOpsRewrite.processFunction(F); - if (InterfaceChange || BodyChanges) + if (InterfaceChange || BodyChanges) { NeedsRemap.push_back(std::make_pair(&F, InterfaceChange)); + Changed |= BufferContentsTypeRewrite.processFunction(F); + } } if (NeedsRemap.empty()) return Changed; diff --git llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp index 9511b6bb7de0..47679f89f3f0 100644 --- llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp @@ -15,6 +15,7 @@ #include "AMDGPUMCResourceInfo.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/StringRef.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Target/TargetMachine.h" @@ -22,9 +23,12 @@ using namespace llvm; MCSymbol *MCResourceInfo::getSymbol(StringRef FuncName, ResourceInfoKind RIK, - MCContext &OutContext) { - auto GOCS = [FuncName, &OutContext](StringRef Suffix) { - return OutContext.getOrCreateSymbol(FuncName + Twine(Suffix)); + MCContext &OutContext, bool IsLocal) { + auto GOCS = [FuncName, &OutContext, IsLocal](StringRef Suffix) { + StringRef Prefix = + IsLocal ? 
OutContext.getAsmInfo()->getPrivateGlobalPrefix() : ""; + return OutContext.getOrCreateSymbol(Twine(Prefix) + FuncName + + Twine(Suffix)); }; switch (RIK) { case RIK_NumVGPR: @@ -51,8 +55,8 @@ MCSymbol *MCResourceInfo::getSymbol(StringRef FuncName, ResourceInfoKind RIK, const MCExpr *MCResourceInfo::getSymRefExpr(StringRef FuncName, ResourceInfoKind RIK, - MCContext &Ctx) { - return MCSymbolRefExpr::create(getSymbol(FuncName, RIK, Ctx), Ctx); + MCContext &Ctx, bool IsLocal) { + return MCSymbolRefExpr::create(getSymbol(FuncName, RIK, Ctx, IsLocal), Ctx); } void MCResourceInfo::assignMaxRegs(MCContext &OutContext) { @@ -96,11 +100,12 @@ void MCResourceInfo::assignResourceInfoExpr( const MachineFunction &MF, const SmallVectorImpl<const Function *> &Callees, MCContext &OutContext) { const TargetMachine &TM = MF.getTarget(); + bool IsLocal = MF.getFunction().hasLocalLinkage(); MCSymbol *FnSym = TM.getSymbol(&MF.getFunction()); const MCConstantExpr *LocalConstExpr = MCConstantExpr::create(LocalValue, OutContext); const MCExpr *SymVal = LocalConstExpr; - MCSymbol *Sym = getSymbol(FnSym->getName(), RIK, OutContext); + MCSymbol *Sym = getSymbol(FnSym->getName(), RIK, OutContext, IsLocal); if (!Callees.empty()) { SmallVector<const MCExpr *, 8> ArgExprs; SmallPtrSet<const Function *, 8> Seen; @@ -110,9 +115,10 @@ void MCResourceInfo::assignResourceInfoExpr( if (!Seen.insert(Callee).second) continue; + bool IsCalleeLocal = Callee->hasLocalLinkage(); MCSymbol *CalleeFnSym = TM.getSymbol(&Callee->getFunction()); MCSymbol *CalleeValSym = - getSymbol(CalleeFnSym->getName(), RIK, OutContext); + getSymbol(CalleeFnSym->getName(), RIK, OutContext, IsCalleeLocal); // Avoid constructing recursive definitions by detecting whether `Sym` is // found transitively within any of its `CalleeValSym`. 
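The IsLocal plumbing above keys resource-symbol naming off the function's linkage: functions with local linkage get their per-function resource symbols created under the assembler's private-global prefix, so they stay out of the object's symbol table and cannot collide with same-named functions from other translation units. A minimal sketch of that naming rule, with a hypothetical helper name (makeResourceSymbolName is illustrative, not part of the patch) and assuming the common ELF prefix ".L":

    #include <string>

    // Sketch: resource symbol naming for local vs. external functions.
    // A static function "foo" yields ".Lfoo.num_vgpr"; an external "foo"
    // keeps the plain "foo.num_vgpr". The actual prefix comes from
    // MCAsmInfo and is target-dependent.
    std::string makeResourceSymbolName(const std::string &FuncName,
                                       const std::string &Suffix, bool IsLocal,
                                       const std::string &PrivatePrefix = ".L") {
      return (IsLocal ? PrivatePrefix : std::string()) + FuncName + Suffix;
    }
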
@@ -155,6 +161,7 @@ void MCResourceInfo::gatherResourceInfo( MCSymbol *MaxVGPRSym = getMaxVGPRSymbol(OutContext); MCSymbol *MaxAGPRSym = getMaxAGPRSymbol(OutContext); MCSymbol *MaxSGPRSym = getMaxSGPRSymbol(OutContext); + bool IsLocal = MF.getFunction().hasLocalLinkage(); if (!AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())) { addMaxVGPRCandidate(FRI.NumVGPR); @@ -172,7 +179,8 @@ void MCResourceInfo::gatherResourceInfo( FRI.Callees, OutContext); } else { const MCExpr *SymRef = MCSymbolRefExpr::create(MaxSym, OutContext); - MCSymbol *LocalNumSym = getSymbol(FnSym->getName(), RIK, OutContext); + MCSymbol *LocalNumSym = + getSymbol(FnSym->getName(), RIK, OutContext, IsLocal); const MCExpr *MaxWithLocal = AMDGPUMCExpr::createMax( {MCConstantExpr::create(numRegs, OutContext), SymRef}, OutContext); LocalNumSym->setVariableValue(MaxWithLocal); @@ -187,7 +195,8 @@ void MCResourceInfo::gatherResourceInfo( // The expression for private segment size should be: FRI.PrivateSegmentSize // + max(FRI.Callees, FRI.CalleeSegmentSize) SmallVector<const MCExpr *, 8> ArgExprs; - MCSymbol *Sym = getSymbol(FnSym->getName(), RIK_PrivateSegSize, OutContext); + MCSymbol *Sym = + getSymbol(FnSym->getName(), RIK_PrivateSegSize, OutContext, IsLocal); if (FRI.CalleeSegmentSize) ArgExprs.push_back( MCConstantExpr::create(FRI.CalleeSegmentSize, OutContext)); @@ -198,9 +207,11 @@ void MCResourceInfo::gatherResourceInfo( if (!Seen.insert(Callee).second) continue; if (!Callee->isDeclaration()) { + bool IsCalleeLocal = Callee->hasLocalLinkage(); MCSymbol *CalleeFnSym = TM.getSymbol(&Callee->getFunction()); MCSymbol *CalleeValSym = - getSymbol(CalleeFnSym->getName(), RIK_PrivateSegSize, OutContext); + getSymbol(CalleeFnSym->getName(), RIK_PrivateSegSize, OutContext, + IsCalleeLocal); // Avoid constructing recursive definitions by detecting whether `Sym` // is found transitively within any of its `CalleeValSym`. 
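The private-segment-size expression assembled in the hunk above follows the formula stated in its comment: FRI.PrivateSegmentSize plus the maximum over FRI.CalleeSegmentSize and the callees' sizes. Because callee values may only be resolved at the end of assembly, the pass encodes this as an MCExpr over symbols rather than as a plain number; a toy scalar model of the same computation, under the assumption that all sizes are already concrete:

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Toy model: a function's own private segment plus the worst case over
    // everything it may call. The real pass defers this as a max/add MCExpr
    // because callee sizes are resolved later by the assembler.
    uint64_t privateSegSize(uint64_t OwnSize, uint64_t CalleeSegmentSize,
                            const std::vector<uint64_t> &CalleeSizes) {
      uint64_t MaxCallee = CalleeSegmentSize;
      for (uint64_t S : CalleeSizes)
        MaxCallee = std::max(MaxCallee, S);
      return OwnSize + MaxCallee;
    }
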
@@ -223,7 +234,7 @@ void MCResourceInfo::gatherResourceInfo( } auto SetToLocal = [&](int64_t LocalValue, ResourceInfoKind RIK) { - MCSymbol *Sym = getSymbol(FnSym->getName(), RIK, OutContext); + MCSymbol *Sym = getSymbol(FnSym->getName(), RIK, OutContext, IsLocal); Sym->setVariableValue(MCConstantExpr::create(LocalValue, OutContext)); }; @@ -255,9 +266,10 @@ const MCExpr *MCResourceInfo::createTotalNumVGPRs(const MachineFunction &MF, MCContext &Ctx) { const TargetMachine &TM = MF.getTarget(); MCSymbol *FnSym = TM.getSymbol(&MF.getFunction()); + bool IsLocal = MF.getFunction().hasLocalLinkage(); return AMDGPUMCExpr::createTotalNumVGPR( - getSymRefExpr(FnSym->getName(), RIK_NumAGPR, Ctx), - getSymRefExpr(FnSym->getName(), RIK_NumVGPR, Ctx), Ctx); + getSymRefExpr(FnSym->getName(), RIK_NumAGPR, Ctx, IsLocal), + getSymRefExpr(FnSym->getName(), RIK_NumVGPR, Ctx, IsLocal), Ctx); } const MCExpr *MCResourceInfo::createTotalNumSGPRs(const MachineFunction &MF, @@ -265,11 +277,12 @@ const MCExpr *MCResourceInfo::createTotalNumSGPRs(const MachineFunction &MF, MCContext &Ctx) { const TargetMachine &TM = MF.getTarget(); MCSymbol *FnSym = TM.getSymbol(&MF.getFunction()); + bool IsLocal = MF.getFunction().hasLocalLinkage(); return MCBinaryExpr::createAdd( - getSymRefExpr(FnSym->getName(), RIK_NumSGPR, Ctx), + getSymRefExpr(FnSym->getName(), RIK_NumSGPR, Ctx, IsLocal), AMDGPUMCExpr::createExtraSGPRs( - getSymRefExpr(FnSym->getName(), RIK_UsesVCC, Ctx), - getSymRefExpr(FnSym->getName(), RIK_UsesFlatScratch, Ctx), hasXnack, - Ctx), + getSymRefExpr(FnSym->getName(), RIK_UsesVCC, Ctx, IsLocal), + getSymRefExpr(FnSym->getName(), RIK_UsesFlatScratch, Ctx, IsLocal), + hasXnack, Ctx), Ctx); } diff --git llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h index 9dc34100e644..a670878948c3 100644 --- llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h +++ llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h @@ -71,9 +71,9 @@ public: } MCSymbol *getSymbol(StringRef FuncName, ResourceInfoKind RIK, - MCContext &OutContext); + MCContext &OutContext, bool IsLocal); const MCExpr *getSymRefExpr(StringRef FuncName, ResourceInfoKind RIK, - MCContext &Ctx); + MCContext &Ctx, bool IsLocal); void reset(); diff --git llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index fbcf83e2fdd6..ce3aeb93ec4c 100644 --- llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -99,12 +99,14 @@ FUNCTION_PASS_WITH_PARAMS( MACHINE_FUNCTION_PASS("amdgpu-isel", AMDGPUISelDAGToDAGPass(*this)) MACHINE_FUNCTION_PASS("si-fix-sgpr-copies", SIFixSGPRCopiesPass()) MACHINE_FUNCTION_PASS("si-i1-copies", SILowerI1CopiesPass()) +MACHINE_FUNCTION_PASS("si-fix-vgpr-copies", SIFixVGPRCopiesPass()) MACHINE_FUNCTION_PASS("si-fold-operands", SIFoldOperandsPass()); MACHINE_FUNCTION_PASS("gcn-dpp-combine", GCNDPPCombinePass()) MACHINE_FUNCTION_PASS("si-load-store-opt", SILoadStoreOptimizerPass()) MACHINE_FUNCTION_PASS("si-lower-control-flow", SILowerControlFlowPass()) MACHINE_FUNCTION_PASS("si-lower-sgpr-spills", SILowerSGPRSpillsPass()) MACHINE_FUNCTION_PASS("si-opt-vgpr-liverange", SIOptimizeVGPRLiveRangePass()) +MACHINE_FUNCTION_PASS("si-optimize-exec-masking", SIOptimizeExecMaskingPass()) MACHINE_FUNCTION_PASS("si-peephole-sdwa", SIPeepholeSDWAPass()) MACHINE_FUNCTION_PASS("si-pre-allocate-wwm-regs", SIPreAllocateWWMRegsPass()) MACHINE_FUNCTION_PASS("si-shrink-instructions", SIShrinkInstructionsPass()) diff --git 
llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp index a899805dc46b..459f85ae6169 100644 --- llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp @@ -277,7 +277,7 @@ bool AMDGPUPrintfRuntimeBindingImpl::lowerPrintfForGpu(Module &M) { Type *Tys_alloc[1] = {SizetTy}; Type *I8Ty = Type::getInt8Ty(Ctx); - Type *I8Ptr = PointerType::get(I8Ty, 1); + Type *I8Ptr = PointerType::get(Ctx, 1); FunctionType *FTy_alloc = FunctionType::get(I8Ptr, Tys_alloc, false); FunctionCallee PrintfAllocFn = M.getOrInsertFunction(StringRef("__printf_alloc"), FTy_alloc, Attr); @@ -300,7 +300,7 @@ bool AMDGPUPrintfRuntimeBindingImpl::lowerPrintfForGpu(Module &M) { // basicblock splits after buffer overflow check // ConstantPointerNull *zeroIntPtr = - ConstantPointerNull::get(PointerType::get(I8Ty, 1)); + ConstantPointerNull::get(PointerType::get(Ctx, 1)); auto *cmp = cast<ICmpInst>(Builder.CreateICmpNE(pcall, zeroIntPtr, "")); if (!CI->use_empty()) { Value *result = @@ -320,7 +320,7 @@ bool AMDGPUPrintfRuntimeBindingImpl::lowerPrintfForGpu(Module &M) { I8Ty, pcall, ConstantInt::get(Ctx, APInt(32, 0)), "PrintBuffID", BrnchPoint); - Type *idPointer = PointerType::get(I32Ty, AMDGPUAS::GLOBAL_ADDRESS); + Type *idPointer = PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS); Value *id_gep_cast = new BitCastInst(BufferIdx, idPointer, "PrintBuffIdCast", BrnchPoint); diff --git llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 98268b848f5c..3fe17457cb36 100644 --- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -36,12 +36,14 @@ #include "R600.h" #include "R600TargetMachine.h" #include "SIFixSGPRCopies.h" +#include "SIFixVGPRCopies.h" #include "SIFoldOperands.h" #include "SILoadStoreOptimizer.h" #include "SILowerControlFlow.h" #include "SILowerSGPRSpills.h" #include "SIMachineFunctionInfo.h" #include "SIMachineScheduler.h" +#include "SIOptimizeExecMasking.h" #include "SIOptimizeVGPRLiveRange.h" #include "SIPeepholeSDWA.h" #include "SIPreAllocateWWMRegs.h" @@ -485,7 +487,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPUMarkLastScratchLoadPass(*PR); initializeSILowerSGPRSpillsLegacyPass(*PR); initializeSIFixSGPRCopiesLegacyPass(*PR); - initializeSIFixVGPRCopiesPass(*PR); + initializeSIFixVGPRCopiesLegacyPass(*PR); initializeSIFoldOperandsLegacyPass(*PR); initializeSIPeepholeSDWALegacyPass(*PR); initializeSIShrinkInstructionsLegacyPass(*PR); @@ -528,7 +530,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeSIPreEmitPeepholePass(*PR); initializeSILateBranchLoweringPass(*PR); initializeSIMemoryLegalizerPass(*PR); - initializeSIOptimizeExecMaskingPass(*PR); + initializeSIOptimizeExecMaskingLegacyPass(*PR); initializeSIPreAllocateWWMRegsLegacyPass(*PR); initializeSIFormMemoryClausesPass(*PR); initializeSIPostRABundlerPass(*PR); @@ -1634,7 +1636,7 @@ bool GCNPassConfig::addRegAssignAndRewriteOptimized() { void GCNPassConfig::addPostRegAlloc() { addPass(&SIFixVGPRCopiesID); if (getOptLevel() > CodeGenOptLevel::None) - addPass(&SIOptimizeExecMaskingID); + addPass(&SIOptimizeExecMaskingLegacyID); TargetPassConfig::addPostRegAlloc(); } @@ -2105,6 +2107,13 @@ void AMDGPUCodeGenPassBuilder::addMachineSSAOptimization( addPass(SIShrinkInstructionsPass()); } +void AMDGPUCodeGenPassBuilder::addPostRegAlloc(AddMachinePass &addPass) const { + 
addPass(SIFixVGPRCopiesPass()); + if (TM.getOptLevel() > CodeGenOptLevel::None) + addPass(SIOptimizeExecMaskingPass()); + Base::addPostRegAlloc(addPass); +} + bool AMDGPUCodeGenPassBuilder::isPassEnabled(const cl::opt<bool> &Opt, CodeGenOptLevel Level) const { if (Opt.getNumOccurrences()) diff --git llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 5ba58a92621e..24b4da3a68f6 100644 --- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -176,6 +176,7 @@ public: void addAsmPrinter(AddMachinePass &, CreateMCStreamer) const; Error addInstSelector(AddMachinePass &) const; void addMachineSSAOptimization(AddMachinePass &) const; + void addPostRegAlloc(AddMachinePass &) const; /// Check if a pass is enabled given \p Opt option. The option always /// overrides defaults if explicitly used. Otherwise its default will be used diff --git llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 6baef137df5e..873d18e30a43 100644 --- llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -858,9 +858,12 @@ int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) { } if (TII->isFLAT(MI)) { - int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata); - if (AMDGPU::getRegBitWidth(Desc.operands()[DataIdx].RegClass) > 64) - return DataIdx; + // There is no hazard if the instruction does not use vector regs + if (VDataIdx == -1) + return -1; + + if (AMDGPU::getRegBitWidth(VDataRCID) > 64) + return VDataIdx; } return -1; diff --git llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp index 3172a83e5a1f..8c4314e6d6cc 100644 --- llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp +++ llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -53,7 +53,7 @@ public: std::optional<MCFixupKind> getFixupKind(StringRef Name) const override; const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override; bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, - const MCValue &Target, + const MCValue &Target, uint64_t Value, const MCSubtargetInfo *STI) override; }; @@ -196,7 +196,7 @@ const MCFixupKindInfo &AMDGPUAsmBackend::getFixupKindInfo( bool AMDGPUAsmBackend::shouldForceRelocation(const MCAssembler &, const MCFixup &Fixup, - const MCValue &, + const MCValue &, const uint64_t, const MCSubtargetInfo *STI) { return Fixup.getKind() >= FirstLiteralRelocationKind; } diff --git llvm/lib/Target/AMDGPU/R600ISelLowering.cpp llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index c2e952418f1b..157ca4b08020 100644 --- llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -762,8 +762,8 @@ SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT, const SDLoc &DL, unsigned DwordOffset) const { unsigned ByteOffset = DwordOffset * 4; - PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), - AMDGPUAS::PARAM_I_ADDRESS); + PointerType *PtrType = + PointerType::get(*DAG.getContext(), AMDGPUAS::PARAM_I_ADDRESS); // We shouldn't be using an offset wider than 16-bits for implicit parameters. 
assert(isInt<16>(ByteOffset)); diff --git llvm/lib/Target/AMDGPU/SIFixVGPRCopies.cpp llvm/lib/Target/AMDGPU/SIFixVGPRCopies.cpp index 08272a9ddfd3..d0d679221eee 100644 --- llvm/lib/Target/AMDGPU/SIFixVGPRCopies.cpp +++ llvm/lib/Target/AMDGPU/SIFixVGPRCopies.cpp @@ -11,6 +11,7 @@ /// //===----------------------------------------------------------------------===// +#include "SIFixVGPRCopies.h" #include "AMDGPU.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" @@ -22,13 +23,12 @@ using namespace llvm; namespace { -class SIFixVGPRCopies : public MachineFunctionPass { +class SIFixVGPRCopiesLegacy : public MachineFunctionPass { public: static char ID; -public: - SIFixVGPRCopies() : MachineFunctionPass(ID) { - initializeSIFixVGPRCopiesPass(*PassRegistry::getPassRegistry()); + SIFixVGPRCopiesLegacy() : MachineFunctionPass(ID) { + initializeSIFixVGPRCopiesLegacyPass(*PassRegistry::getPassRegistry()); } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -41,15 +41,31 @@ public: StringRef getPassName() const override { return "SI Fix VGPR copies"; } }; +class SIFixVGPRCopies { +public: + bool run(MachineFunction &MF); +}; + } // End anonymous namespace. -INITIALIZE_PASS(SIFixVGPRCopies, DEBUG_TYPE, "SI Fix VGPR copies", false, false) +INITIALIZE_PASS(SIFixVGPRCopiesLegacy, DEBUG_TYPE, "SI Fix VGPR copies", false, + false) -char SIFixVGPRCopies::ID = 0; +char SIFixVGPRCopiesLegacy::ID = 0; -char &llvm::SIFixVGPRCopiesID = SIFixVGPRCopies::ID; +char &llvm::SIFixVGPRCopiesID = SIFixVGPRCopiesLegacy::ID; + +PreservedAnalyses SIFixVGPRCopiesPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &) { + SIFixVGPRCopies().run(MF); + return PreservedAnalyses::all(); +} + +bool SIFixVGPRCopiesLegacy::runOnMachineFunction(MachineFunction &MF) { + return SIFixVGPRCopies().run(MF); +} -bool SIFixVGPRCopies::runOnMachineFunction(MachineFunction &MF) { +bool SIFixVGPRCopies::run(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); const SIInstrInfo *TII = ST.getInstrInfo(); diff --git llvm/lib/Target/AMDGPU/SIFixVGPRCopies.h llvm/lib/Target/AMDGPU/SIFixVGPRCopies.h new file mode 100644 index 000000000000..7b098b71597f --- /dev/null +++ llvm/lib/Target/AMDGPU/SIFixVGPRCopies.h @@ -0,0 +1,22 @@ +//===- SIFixVGPRCopies.h ----------------------------------------*- C++- *-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_SIFIXVGPRCOPIES_H +#define LLVM_LIB_TARGET_AMDGPU_SIFIXVGPRCOPIES_H + +#include "llvm/CodeGen/MachinePassManager.h" + +namespace llvm { +class SIFixVGPRCopiesPass : public PassInfoMixin<SIFixVGPRCopiesPass> { +public: + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); +}; +} // namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_SIFIXVGPRCOPIES_H diff --git llvm/lib/Target/AMDGPU/SIISelLowering.cpp llvm/lib/Target/AMDGPU/SIISelLowering.cpp index e068b5f0b876..6cf5774fc53b 100644 --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -3157,7 +3157,8 @@ SDValue SITargetLowering::LowerFormalArguments( // possible in registers before passing on stack. 
bool SITargetLowering::CanLowerReturn( CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg, - const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { + const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context, + const Type *RetTy) const { // Replacing returns with sret/stack usage doesn't make sense for shaders. // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn // for shaders. Vector types should be explicitly handled by CC. @@ -7816,9 +7817,8 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT, SIInstrInfo::MO_GOTPCREL32); - - Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext()); - PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS); + PointerType *PtrTy = + PointerType::get(*DAG.getContext(), AMDGPUAS::CONSTANT_ADDRESS); const DataLayout &DataLayout = DAG.getDataLayout(); Align Alignment = DataLayout.getABITypeAlign(PtrTy); MachinePointerInfo PtrInfo = @@ -15876,6 +15876,12 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_, RC = TRI->getAGPRClassForBitWidth(Width); if (RC) { Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC); + if (!Reg) { + // The register class does not contain the requested register, + // e.g., because it is an SGPR pair that would violate alignment + // requirements. + return std::pair(0U, nullptr); + } return std::pair(Reg, RC); } } diff --git llvm/lib/Target/AMDGPU/SIISelLowering.h llvm/lib/Target/AMDGPU/SIISelLowering.h index bbb96d9115a0..1cd7f1b29e07 100644 --- llvm/lib/Target/AMDGPU/SIISelLowering.h +++ llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -392,7 +392,7 @@ public: bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, - LLVMContext &Context) const override; + LLVMContext &Context, const Type *RetTy) const override; SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, diff --git llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp index c663820311b8..d80a5f958273 100644 --- llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp +++ llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp @@ -43,6 +43,9 @@ public: StringRef getPassName() const override { return "SI Lower WWM Copies"; } void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addUsedIfAvailable<LiveIntervalsWrapperPass>(); + AU.addUsedIfAvailable<SlotIndexesWrapperPass>(); + AU.addUsedIfAvailable<VirtRegMapWrapperLegacy>(); AU.setPreservesAll(); MachineFunctionPass::getAnalysisUsage(AU); } diff --git llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp index 93b70fa4ba97..3fb8d5b56049 100644 --- llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp +++ llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include "SIOptimizeExecMasking.h" #include "AMDGPU.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" @@ -23,7 +24,7 @@ using namespace llvm; namespace { -class SIOptimizeExecMasking : public MachineFunctionPass { +class SIOptimizeExecMasking { MachineFunction *MF = nullptr; const GCNSubtarget *ST = nullptr; const SIRegisterInfo *TRI = nullptr; @@ -61,11 +62,16 @@ class SIOptimizeExecMasking : public MachineFunctionPass { void tryRecordOrSaveexecXorSequence(MachineInstr &MI); bool 
optimizeOrSaveexecXorSequences(); +public: + bool run(MachineFunction &MF); +}; + +class SIOptimizeExecMaskingLegacy : public MachineFunctionPass { public: static char ID; - SIOptimizeExecMasking() : MachineFunctionPass(ID) { - initializeSIOptimizeExecMaskingPass(*PassRegistry::getPassRegistry()); + SIOptimizeExecMaskingLegacy() : MachineFunctionPass(ID) { + initializeSIOptimizeExecMaskingLegacyPass(*PassRegistry::getPassRegistry()); } bool runOnMachineFunction(MachineFunction &MF) override; @@ -82,15 +88,28 @@ public: } // End anonymous namespace. -INITIALIZE_PASS_BEGIN(SIOptimizeExecMasking, DEBUG_TYPE, +PreservedAnalyses +SIOptimizeExecMaskingPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &) { + SIOptimizeExecMasking Impl; + + if (!Impl.run(MF)) + return PreservedAnalyses::all(); + + auto PA = getMachineFunctionPassPreservedAnalyses(); + PA.preserveSet<CFGAnalyses>(); + return PA; +} + +INITIALIZE_PASS_BEGIN(SIOptimizeExecMaskingLegacy, DEBUG_TYPE, "SI optimize exec mask operations", false, false) INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass) -INITIALIZE_PASS_END(SIOptimizeExecMasking, DEBUG_TYPE, +INITIALIZE_PASS_END(SIOptimizeExecMaskingLegacy, DEBUG_TYPE, "SI optimize exec mask operations", false, false) -char SIOptimizeExecMasking::ID = 0; +char SIOptimizeExecMaskingLegacy::ID = 0; -char &llvm::SIOptimizeExecMaskingID = SIOptimizeExecMasking::ID; +char &llvm::SIOptimizeExecMaskingLegacyID = SIOptimizeExecMaskingLegacy::ID; /// If \p MI is a copy from exec, return the register copied to. Register SIOptimizeExecMasking::isCopyFromExec(const MachineInstr &MI) const { @@ -786,10 +805,14 @@ bool SIOptimizeExecMasking::optimizeOrSaveexecXorSequences() { return Changed; } -bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { +bool SIOptimizeExecMaskingLegacy::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; + return SIOptimizeExecMasking().run(MF); +} + +bool SIOptimizeExecMasking::run(MachineFunction &MF) { this->MF = &MF; ST = &MF.getSubtarget<GCNSubtarget>(); TRI = ST->getRegisterInfo(); diff --git llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.h llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.h new file mode 100644 index 000000000000..f170a4733279 --- /dev/null +++ llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.h @@ -0,0 +1,23 @@ +//===- SIOptimizeExecMasking.h ----------------------------------*- C++- *-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_SIOPTIMIZEEXECMASKING_H +#define LLVM_LIB_TARGET_AMDGPU_SIOPTIMIZEEXECMASKING_H + +#include "llvm/CodeGen/MachinePassManager.h" + +namespace llvm { +class SIOptimizeExecMaskingPass + : public PassInfoMixin<SIOptimizeExecMaskingPass> { +public: + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); +}; +} // namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_SIOPTIMIZEEXECMASKING_H diff --git llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 263f6497b9a7..38f86ca3e9af 100644 --- llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -1305,7 +1305,7 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) { // Record initial state is block information. 
   BI.InitialState = State;
 
-  for (;;) {
+  for (unsigned Idx = 0;; ++Idx) {
     MachineBasicBlock::iterator Next = II;
     char Needs = StateExact | StateWQM; // Strict mode is disabled by default.
     char OutNeeds = 0;
@@ -1316,6 +1316,10 @@
     if (FirstStrict == IE)
       FirstStrict = II;
 
+    // Adjust needs if this is the first instruction of a WQM-requiring shader.
+    if (IsEntry && Idx == 0 && (BI.InNeeds & StateWQM))
+      Needs = StateWQM;
+
     // First, figure out the allowed states (Needs) based on the propagated
     // flags.
     if (II != IE) {
@@ -1801,6 +1805,9 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
     lowerKillInstrs(true);
     Changed = true;
   } else {
+    // Mark entry for WQM if required.
+    if (GlobalFlags & StateWQM)
+      Blocks[&Entry].InNeeds |= StateWQM;
     // Wave mode switching requires full lowering pass.
     for (auto BII : Blocks)
       processBlock(*BII.first, BII.first == &Entry);
diff --git llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
index 4ad26ee895c7..1e76bf7056cc 100644
--- llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
+++ llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
@@ -232,7 +232,8 @@ void AMDGPUPALMetadata::setEntryPoint(unsigned CC, StringRef Name) {
   if (isLegacy())
     return;
   // Msgpack format.
-  getHwStage(CC)[".entry_point"] = MsgPackDoc.getNode(Name, /*Copy=*/true);
+  getHwStage(CC)[".entry_point_symbol"] =
+      MsgPackDoc.getNode(Name, /*Copy=*/true);
 }
 
 // Set the number of used vgprs in the metadata. This is an optional
diff --git llvm/lib/Target/AMDGPU/VOPCInstructions.td llvm/lib/Target/AMDGPU/VOPCInstructions.td
index bba8aa570d2b..91ad2cafe9b5 100644
--- llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -899,25 +899,40 @@ class VOPC_Class_Profile<list<SchedReadWrite> sched, ValueType src0VT, ValueType
 multiclass VOPC_Class_Profile_t16<list<SchedReadWrite> sched> {
   def NAME : VOPC_Class_Profile<sched, f16>;
-  def _t16 : VOPC_Class_Profile<sched, f16, i16> {
+  def _t16 : VOPC_Class_Profile_Base<sched, f16, f16> {
     let IsTrue16 = 1;
     let IsRealTrue16 = 1;
-    let Src1RC32 = getVregSrcForVT<Src1VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
-    let Src1RC64 = VSrc_b32;
-    let Src0DPP = getVregSrcForVT<Src0VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
-    let Src1DPP = getVregSrcForVT<Src1VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
-    let Src2DPP = getVregSrcForVT<Src2VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
-    let Src0ModDPP = getSrcModDPP_t16<Src0VT>.ret;
-    let Src1ModDPP = getSrcModDPP_t16<Src1VT>.ret;
-    let Src2ModDPP = getSrcModDPP_t16<Src2VT>.ret;
+    let HasOpSel = 1;
+    let HasModifiers = 1; // All instructions at least have OpSel
+    let DstRC = getVALUDstForVT<DstVT, 1 /*IsTrue16*/, 0 /*IsVOP3Encoding*/>.ret;
+    let Src0RC32 = getVOPSrc0ForVT<Src0VT, 1/*IsTrue16*/, 0/*IsFake16*/>.ret;
+    let Src1RC32 = getVregSrcForVT<Src1VT, 1/*IsTrue16*/, 0/*IsFake16*/>.ret;
+    let Src0DPP = getVregSrcForVT<Src0VT, 1/*IsTrue16*/, 0/*IsFake16*/>.ret;
+    let Src1DPP = getVregSrcForVT<Src1VT, 1/*IsTrue16*/, 0/*IsFake16*/>.ret;
+    let Src2DPP = getVregSrcForVT<Src2VT, 1/*IsTrue16*/, 0/*IsFake16*/>.ret;
+    let Src0ModDPP = getSrcModDPP_t16<Src0VT, 0/*IsFake16*/>.ret;
+    let Src1ModDPP = getSrcModDPP_t16<Src1VT, 0/*IsFake16*/>.ret;
+    let Src2ModDPP = getSrcModDPP_t16<Src2VT, 0/*IsFake16*/>.ret;
+    let Src0VOP3DPP = VGPRSrc_16;
+    let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 0/*IsFake16*/>.ret;
+    let Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT, 0/*IsFake16*/>.ret;
+
+    let
DstRC64 = getVALUDstForVT<DstVT, 1/*IsTrue16*/, 1/*IsVOP3Encoding*/>.ret; + let Src0RC64 = getVOP3SrcForVT<Src0VT, 1/*IsTrue16*/>.ret; + let Src1RC64 = getVOP3SrcForVT<Src1VT, 1/*IsTrue16*/>.ret; + let Src2RC64 = getVOP3SrcForVT<Src2VT, 1/*IsTrue16*/>.ret; + let Src0Mod = getSrc0Mod<Src0VT, DstVT, 1/*IsTrue16*/, 0/*IsFake16*/>.ret; + let Src1Mod = getSrcMod<Src1VT, 1/*IsTrue16*/, 0/*IsFake16*/>.ret; + let Src2Mod = getSrcMod<Src2VT, 1/*IsTrue16*/, 0/*IsFake16*/>.ret; + let Src0ModVOP3DPP = getSrc0ModVOP3DPP<Src0VT, DstVT, 0/*IsFake16*/>.ret; + let Src1ModVOP3DPP = getSrcModVOP3DPP<Src1VT, 0/*IsFake16*/>.ret; + let Src2ModVOP3DPP = getSrcModVOP3DPP<Src2VT, 0/*IsFake16*/>.ret; } def _fake16 : VOPC_Class_Profile_Base<sched, f16, f16> { let IsTrue16 = 1; let DstRC = getVALUDstForVT_fake16<DstVT>.ret; - let DstRC64 = getVALUDstForVT<DstVT>.ret; let Src0RC32 = getVOPSrc0ForVT<Src0VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret; let Src1RC32 = getVregSrcForVT<Src1VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret; - let Src1RC64 = VSrc_b32; let Src0DPP = getVregSrcForVT<Src0VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret; let Src1DPP = getVregSrcForVT<Src1VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret; let Src2DPP = getVregSrcForVT<Src2VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret; @@ -927,6 +942,14 @@ multiclass VOPC_Class_Profile_t16<list<SchedReadWrite> sched> { let Src0VOP3DPP = VGPRSrc_32; let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 1/*IsFake16*/>.ret; let Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT, 1/*IsFake16*/>.ret; + + let DstRC64 = getVALUDstForVT<DstVT>.ret; + let Src0RC64 = getVOP3SrcForVT<Src0VT, 0/*IsTrue16*/>.ret; + let Src1RC64 = getVOP3SrcForVT<Src1VT, 0/*IsTrue16*/>.ret; + let Src2RC64 = getVOP3SrcForVT<Src2VT, 0/*IsTrue16*/>.ret; + let Src0Mod = getSrc0Mod<Src0VT, DstVT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret; + let Src1Mod = getSrcMod<Src1VT, 0/*IsTrue16*/, 1/*IsFake16*/>.ret; + let Src2Mod = getSrcMod<Src2VT, 0/*IsTrue16*/, 1/*IsFake16*/>.ret; let Src0ModVOP3DPP = getSrc0ModVOP3DPP<Src0VT, DstVT, 1/*IsFake16*/>.ret; let Src1ModVOP3DPP = getSrcModVOP3DPP<Src1VT, 1/*IsFake16*/>.ret; let Src2ModVOP3DPP = getSrcModVOP3DPP<Src2VT, 1/*IsFake16*/>.ret; @@ -1838,22 +1861,22 @@ multiclass VOPCX_Real_t16_and_fake16_gfx11_gfx12<bits<9> op, string asm_name, defm _fake16: VOPCX_Real_t16_gfx11_gfx12<op, asm_name, OpName#"_fake16", pseudo_mnemonic>; } -defm V_CMP_F_F16_fake16 : VOPC_Real_t16_gfx11<0x000, "v_cmp_f_f16">; +defm V_CMP_F_F16 : VOPC_Real_t16_and_fake16_gfx11<0x000, "v_cmp_f_f16">; defm V_CMP_LT_F16 : VOPC_Real_t16_and_fake16_gfx11_gfx12<0x001, "v_cmp_lt_f16">; -defm V_CMP_EQ_F16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x002, "v_cmp_eq_f16">; -defm V_CMP_LE_F16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x003, "v_cmp_le_f16">; -defm V_CMP_GT_F16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x004, "v_cmp_gt_f16">; -defm V_CMP_LG_F16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x005, "v_cmp_lg_f16">; -defm V_CMP_GE_F16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x006, "v_cmp_ge_f16">; -defm V_CMP_O_F16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x007, "v_cmp_o_f16">; -defm V_CMP_U_F16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x008, "v_cmp_u_f16">; -defm V_CMP_NGE_F16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x009, "v_cmp_nge_f16">; -defm V_CMP_NLG_F16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x00a, "v_cmp_nlg_f16">; -defm V_CMP_NGT_F16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x00b, "v_cmp_ngt_f16">; -defm V_CMP_NLE_F16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x00c, "v_cmp_nle_f16">; -defm V_CMP_NEQ_F16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x00d, "v_cmp_neq_f16">; -defm V_CMP_NLT_F16_fake16 : 
VOPC_Real_t16_gfx11_gfx12<0x00e, "v_cmp_nlt_f16">; -defm V_CMP_T_F16_fake16 : VOPC_Real_t16_gfx11<0x00f, "v_cmp_t_f16", "V_CMP_TRU_F16_fake16", "v_cmp_tru_f16">; +defm V_CMP_EQ_F16 : VOPC_Real_t16_and_fake16_gfx11_gfx12<0x002, "v_cmp_eq_f16">; +defm V_CMP_LE_F16 : VOPC_Real_t16_and_fake16_gfx11_gfx12<0x003, "v_cmp_le_f16">; +defm V_CMP_GT_F16 : VOPC_Real_t16_and_fake16_gfx11_gfx12<0x004, "v_cmp_gt_f16">; +defm V_CMP_LG_F16 : VOPC_Real_t16_and_fake16_gfx11_gfx12<0x005, "v_cmp_lg_f16">; +defm V_CMP_GE_F16 : VOPC_Real_t16_and_fake16_gfx11_gfx12<0x006, "v_cmp_ge_f16">; +defm V_CMP_O_F16 : VOPC_Real_t16_and_fake16_gfx11_gfx12<0x007, "v_cmp_o_f16">; +defm V_CMP_U_F16 : VOPC_Real_t16_and_fake16_gfx11_gfx12<0x008, "v_cmp_u_f16">; +defm V_CMP_NGE_F16 : VOPC_Real_t16_and_fake16_gfx11_gfx12<0x009, "v_cmp_nge_f16">; +defm V_CMP_NLG_F16 : VOPC_Real_t16_and_fake16_gfx11_gfx12<0x00a, "v_cmp_nlg_f16">; +defm V_CMP_NGT_F16 : VOPC_Real_t16_and_fake16_gfx11_gfx12<0x00b, "v_cmp_ngt_f16">; +defm V_CMP_NLE_F16 : VOPC_Real_t16_and_fake16_gfx11_gfx12<0x00c, "v_cmp_nle_f16">; +defm V_CMP_NEQ_F16 : VOPC_Real_t16_and_fake16_gfx11_gfx12<0x00d, "v_cmp_neq_f16">; +defm V_CMP_NLT_F16 : VOPC_Real_t16_and_fake16_gfx11_gfx12<0x00e, "v_cmp_nlt_f16">; +defm V_CMP_T_F16 : VOPC_Real_t16_and_fake16_gfx11<0x00f, "v_cmp_t_f16", "V_CMP_TRU_F16", "v_cmp_tru_f16">; defm V_CMP_F_F32 : VOPC_Real_gfx11<0x010>; defm V_CMP_LT_F32 : VOPC_Real_gfx11_gfx12<0x011>; @@ -1873,18 +1896,18 @@ defm V_CMP_NLT_F32 : VOPC_Real_gfx11_gfx12<0x01e>; defm V_CMP_T_F32 : VOPC_Real_with_name_gfx11<0x01f, "V_CMP_TRU_F32", "v_cmp_t_f32">; defm V_CMP_T_F64 : VOPC_Real_with_name_gfx11<0x02f, "V_CMP_TRU_F64", "v_cmp_t_f64">; -defm V_CMP_LT_I16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x031, "v_cmp_lt_i16">; -defm V_CMP_EQ_I16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x032, "v_cmp_eq_i16">; -defm V_CMP_LE_I16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x033, "v_cmp_le_i16">; -defm V_CMP_GT_I16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x034, "v_cmp_gt_i16">; -defm V_CMP_NE_I16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x035, "v_cmp_ne_i16">; -defm V_CMP_GE_I16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x036, "v_cmp_ge_i16">; -defm V_CMP_LT_U16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x039, "v_cmp_lt_u16">; -defm V_CMP_EQ_U16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x03a, "v_cmp_eq_u16">; -defm V_CMP_LE_U16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x03b, "v_cmp_le_u16">; -defm V_CMP_GT_U16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x03c, "v_cmp_gt_u16">; -defm V_CMP_NE_U16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x03d, "v_cmp_ne_u16">; -defm V_CMP_GE_U16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x03e, "v_cmp_ge_u16">; +defm V_CMP_LT_I16 : VOPC_Real_t16_and_fake16_gfx11_gfx12<0x031, "v_cmp_lt_i16">; +defm V_CMP_EQ_I16 : VOPC_Real_t16_and_fake16_gfx11_gfx12<0x032, "v_cmp_eq_i16">; +defm V_CMP_LE_I16 : VOPC_Real_t16_and_fake16_gfx11_gfx12<0x033, "v_cmp_le_i16">; +defm V_CMP_GT_I16 : VOPC_Real_t16_and_fake16_gfx11_gfx12<0x034, "v_cmp_gt_i16">; +defm V_CMP_NE_I16 : VOPC_Real_t16_and_fake16_gfx11_gfx12<0x035, "v_cmp_ne_i16">; +defm V_CMP_GE_I16 : VOPC_Real_t16_and_fake16_gfx11_gfx12<0x036, "v_cmp_ge_i16">; +defm V_CMP_LT_U16 : VOPC_Real_t16_and_fake16_gfx11_gfx12<0x039, "v_cmp_lt_u16">; +defm V_CMP_EQ_U16 : VOPC_Real_t16_and_fake16_gfx11_gfx12<0x03a, "v_cmp_eq_u16">; +defm V_CMP_LE_U16 : VOPC_Real_t16_and_fake16_gfx11_gfx12<0x03b, "v_cmp_le_u16">; +defm V_CMP_GT_U16 : VOPC_Real_t16_and_fake16_gfx11_gfx12<0x03c, "v_cmp_gt_u16">; +defm V_CMP_NE_U16 : VOPC_Real_t16_and_fake16_gfx11_gfx12<0x03d, "v_cmp_ne_u16">; +defm V_CMP_GE_U16 : 
VOPC_Real_t16_and_fake16_gfx11_gfx12<0x03e, "v_cmp_ge_u16">; defm V_CMP_F_I32 : VOPC_Real_gfx11<0x040>; defm V_CMP_LT_I32 : VOPC_Real_gfx11_gfx12<0x041>; @@ -1920,7 +1943,7 @@ defm V_CMP_NE_U64 : VOPC_Real_gfx11_gfx12<0x05d>; defm V_CMP_GE_U64 : VOPC_Real_gfx11_gfx12<0x05e>; defm V_CMP_T_U64 : VOPC_Real_gfx11<0x05f>; -defm V_CMP_CLASS_F16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x07d, "v_cmp_class_f16">; +defm V_CMP_CLASS_F16 : VOPC_Real_t16_and_fake16_gfx11_gfx12<0x07d, "v_cmp_class_f16">; defm V_CMP_CLASS_F32 : VOPC_Real_gfx11_gfx12<0x07e>; defm V_CMP_CLASS_F64 : VOPC_Real_gfx11_gfx12<0x07f>; diff --git llvm/lib/Target/ARC/ARCISelLowering.cpp llvm/lib/Target/ARC/ARCISelLowering.cpp index 5ab27681361d..b133e4e5299a 100644 --- llvm/lib/Target/ARC/ARCISelLowering.cpp +++ llvm/lib/Target/ARC/ARCISelLowering.cpp @@ -630,7 +630,8 @@ SDValue ARCTargetLowering::LowerCallArguments( bool ARCTargetLowering::CanLowerReturn( CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg, - const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { + const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context, + const Type *RetTy) const { SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context); if (!CCInfo.CheckReturn(Outs, RetCC_ARC)) diff --git llvm/lib/Target/ARC/ARCISelLowering.h llvm/lib/Target/ARC/ARCISelLowering.h index e070ed8752cc..716a72455e82 100644 --- llvm/lib/Target/ARC/ARCISelLowering.h +++ llvm/lib/Target/ARC/ARCISelLowering.h @@ -112,7 +112,7 @@ private: bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &ArgsFlags, - LLVMContext &Context) const override; + LLVMContext &Context, const Type *RetTy) const override; bool mayBeEmittedAsTailCall(const CallInst *CI) const override; }; diff --git llvm/lib/Target/ARM/A15SDOptimizer.cpp llvm/lib/Target/ARM/A15SDOptimizer.cpp index be87707a297d..bb9a0a2bdf98 100644 --- llvm/lib/Target/ARM/A15SDOptimizer.cpp +++ llvm/lib/Target/ARM/A15SDOptimizer.cpp @@ -142,9 +142,10 @@ bool A15SDOptimizer::usesRegClass(MachineOperand &MO, } unsigned A15SDOptimizer::getDPRLaneFromSPR(unsigned SReg) { - unsigned DReg = TRI->getMatchingSuperReg(SReg, ARM::ssub_1, - &ARM::DPRRegClass); - if (DReg != ARM::NoRegister) return ARM::ssub_1; + MCRegister DReg = + TRI->getMatchingSuperReg(SReg, ARM::ssub_1, &ARM::DPRRegClass); + if (DReg) + return ARM::ssub_1; return ARM::ssub_0; } diff --git llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index ae54bad0a055..2bca2c08c345 100644 --- llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -825,7 +825,7 @@ unsigned ARMBaseInstrInfo::getInstBundleLength(const MachineInstr &MI) const { void ARMBaseInstrInfo::copyFromCPSR(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - unsigned DestReg, bool KillSrc, + MCRegister DestReg, bool KillSrc, const ARMSubtarget &Subtarget) const { unsigned Opc = Subtarget.isThumb() ? (Subtarget.isMClass() ? ARM::t2MRS_M : ARM::t2MRS_AR) @@ -845,7 +845,7 @@ void ARMBaseInstrInfo::copyFromCPSR(MachineBasicBlock &MBB, void ARMBaseInstrInfo::copyToCPSR(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - unsigned SrcReg, bool KillSrc, + MCRegister SrcReg, bool KillSrc, const ARMSubtarget &Subtarget) const { unsigned Opc = Subtarget.isThumb() ? (Subtarget.isMClass() ? 
ARM::t2MSR_M : ARM::t2MSR_AR) @@ -1727,10 +1727,10 @@ bool ARMBaseInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return false; const TargetRegisterInfo *TRI = &getRegisterInfo(); - unsigned DstRegD = TRI->getMatchingSuperReg(DstRegS, ARM::ssub_0, - &ARM::DPRRegClass); - unsigned SrcRegD = TRI->getMatchingSuperReg(SrcRegS, ARM::ssub_0, - &ARM::DPRRegClass); + MCRegister DstRegD = + TRI->getMatchingSuperReg(DstRegS, ARM::ssub_0, &ARM::DPRRegClass); + MCRegister SrcRegD = + TRI->getMatchingSuperReg(SrcRegS, ARM::ssub_0, &ARM::DPRRegClass); if (!DstRegD || !SrcRegD) return false; @@ -2594,7 +2594,7 @@ bool llvm::tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget, // Now try to find enough space in the reglist to allocate NumBytes. for (int CurRegEnc = FirstRegEnc - 1; CurRegEnc >= 0 && RegsNeeded; --CurRegEnc) { - unsigned CurReg = RegClass->getRegister(CurRegEnc); + MCRegister CurReg = RegClass->getRegister(CurRegEnc); if (IsT1PushPop && CurRegEnc > TRI->getEncodingValue(ARM::R7)) continue; if (!IsPop) { @@ -5089,13 +5089,14 @@ ARMBaseInstrInfo::getExecutionDomain(const MachineInstr &MI) const { return std::make_pair(ExeGeneric, 0); } -static unsigned getCorrespondingDRegAndLane(const TargetRegisterInfo *TRI, - unsigned SReg, unsigned &Lane) { - unsigned DReg = TRI->getMatchingSuperReg(SReg, ARM::ssub_0, &ARM::DPRRegClass); +static MCRegister getCorrespondingDRegAndLane(const TargetRegisterInfo *TRI, + unsigned SReg, unsigned &Lane) { + MCRegister DReg = + TRI->getMatchingSuperReg(SReg, ARM::ssub_0, &ARM::DPRRegClass); Lane = 0; - if (DReg != ARM::NoRegister) - return DReg; + if (DReg) + return DReg; Lane = 1; DReg = TRI->getMatchingSuperReg(SReg, ARM::ssub_1, &ARM::DPRRegClass); @@ -5120,12 +5121,13 @@ static unsigned getCorrespondingDRegAndLane(const TargetRegisterInfo *TRI, /// (including the case where the DPR itself is defined), it should not. /// static bool getImplicitSPRUseForDPRUse(const TargetRegisterInfo *TRI, - MachineInstr &MI, unsigned DReg, - unsigned Lane, unsigned &ImplicitSReg) { + MachineInstr &MI, MCRegister DReg, + unsigned Lane, + MCRegister &ImplicitSReg) { // If the DPR is defined or used already, the other SPR lane will be chained // correctly, so there is nothing to be done. if (MI.definesRegister(DReg, TRI) || MI.readsRegister(DReg, TRI)) { - ImplicitSReg = 0; + ImplicitSReg = MCRegister(); return true; } @@ -5142,13 +5144,14 @@ static bool getImplicitSPRUseForDPRUse(const TargetRegisterInfo *TRI, // If the register is known not to be live, there is no need to add an // implicit-use. - ImplicitSReg = 0; + ImplicitSReg = MCRegister(); return true; } void ARMBaseInstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const { - unsigned DstReg, SrcReg, DReg; + unsigned DstReg, SrcReg; + MCRegister DReg; unsigned Lane; MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI); const TargetRegisterInfo *TRI = &getRegisterInfo(); @@ -5218,7 +5221,7 @@ void ARMBaseInstrInfo::setExecutionDomain(MachineInstr &MI, DReg = getCorrespondingDRegAndLane(TRI, DstReg, Lane); - unsigned ImplicitSReg; + MCRegister ImplicitSReg; if (!getImplicitSPRUseForDPRUse(TRI, MI, DReg, Lane, ImplicitSReg)) break; @@ -5237,7 +5240,7 @@ void ARMBaseInstrInfo::setExecutionDomain(MachineInstr &MI, // The narrower destination must be marked as set to keep previous chains // in place. 
MIB.addReg(DstReg, RegState::Define | RegState::Implicit); - if (ImplicitSReg != 0) + if (ImplicitSReg) MIB.addReg(ImplicitSReg, RegState::Implicit); break; } @@ -5249,11 +5252,12 @@ void ARMBaseInstrInfo::setExecutionDomain(MachineInstr &MI, DstReg = MI.getOperand(0).getReg(); SrcReg = MI.getOperand(1).getReg(); - unsigned DstLane = 0, SrcLane = 0, DDst, DSrc; + unsigned DstLane = 0, SrcLane = 0; + MCRegister DDst, DSrc; DDst = getCorrespondingDRegAndLane(TRI, DstReg, DstLane); DSrc = getCorrespondingDRegAndLane(TRI, SrcReg, SrcLane); - unsigned ImplicitSReg; + MCRegister ImplicitSReg; if (!getImplicitSPRUseForDPRUse(TRI, MI, DSrc, SrcLane, ImplicitSReg)) break; @@ -5273,7 +5277,7 @@ void ARMBaseInstrInfo::setExecutionDomain(MachineInstr &MI, // more, so add them in manually. MIB.addReg(DstReg, RegState::Implicit | RegState::Define); MIB.addReg(SrcReg, RegState::Implicit); - if (ImplicitSReg != 0) + if (ImplicitSReg) MIB.addReg(ImplicitSReg, RegState::Implicit); break; } @@ -5297,7 +5301,7 @@ void ARMBaseInstrInfo::setExecutionDomain(MachineInstr &MI, // On the first instruction, both DSrc and DDst may be undef if present. // Specifically when the original instruction didn't have them as an // <imp-use>. - unsigned CurReg = SrcLane == 1 && DstLane == 1 ? DSrc : DDst; + MCRegister CurReg = SrcLane == 1 && DstLane == 1 ? DSrc : DDst; bool CurUndef = !MI.readsRegister(CurReg, TRI); NewMIB.addReg(CurReg, getUndefRegState(CurUndef)); @@ -5402,8 +5406,8 @@ unsigned ARMBaseInstrInfo::getPartialRegUpdateClearance( return 0; } else if (ARM::SPRRegClass.contains(Reg)) { // Physical register: MI must define the full D-reg. - unsigned DReg = TRI->getMatchingSuperReg(Reg, ARM::ssub_0, - &ARM::DPRRegClass); + MCRegister DReg = + TRI->getMatchingSuperReg(Reg, ARM::ssub_0, &ARM::DPRRegClass); if (!DReg || !MI.definesRegister(DReg, TRI)) return 0; } diff --git llvm/lib/Target/ARM/ARMBaseInstrInfo.h llvm/lib/Target/ARM/ARMBaseInstrInfo.h index b6f20e6f99a0..9422e12c5dfc 100644 --- llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -201,10 +201,10 @@ public: int &FrameIndex) const override; void copyToCPSR(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - unsigned SrcReg, bool KillSrc, + MCRegister SrcReg, bool KillSrc, const ARMSubtarget &Subtarget) const; void copyFromCPSR(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - unsigned DestReg, bool KillSrc, + MCRegister DestReg, bool KillSrc, const ARMSubtarget &Subtarget) const; void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, diff --git llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp index 291bfc0610f8..22ebe175ff62 100644 --- llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -334,12 +334,12 @@ ARMBaseRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, } // Get the other register in a GPRPair. -static MCPhysReg getPairedGPR(MCPhysReg Reg, bool Odd, - const MCRegisterInfo *RI) { +static MCRegister getPairedGPR(MCRegister Reg, bool Odd, + const MCRegisterInfo *RI) { for (MCPhysReg Super : RI->superregs(Reg)) if (ARM::GPRPairRegClass.contains(Super)) return RI->getSubReg(Super, Odd ? ARM::gsub_1 : ARM::gsub_0); - return 0; + return MCRegister(); } // Resolve the RegPairEven / RegPairOdd register allocator hints. 
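A pattern repeated throughout these ARM hunks is replacing raw unsigned register values with MCRegister, whose explicit bool conversion lets the no-register sentinel be tested as if (!Reg) instead of comparing against 0 or ARM::NoRegister. A standalone sketch of the idiom (a simplified stand-in, not llvm::MCRegister itself; the class and function names are illustrative):

    // Simplified stand-in for llvm::MCRegister: id 0 means "no register",
    // and the explicit operator bool tests exactly that, so sentinel checks
    // no longer depend on magic constants at call sites.
    class MCRegisterSketch {
      unsigned Id = 0;

    public:
      constexpr MCRegisterSketch() = default;
      constexpr explicit MCRegisterSketch(unsigned Id) : Id(Id) {}
      constexpr unsigned id() const { return Id; }
      constexpr explicit operator bool() const { return Id != 0; }
    };

    // Usage mirroring getPairedGPR: return a default-constructed value when
    // no pair exists; callers then write `if (!Paired) continue;`.
    MCRegisterSketch findPairedSketch(bool HavePair) {
      return HavePair ? MCRegisterSketch(42) : MCRegisterSketch();
    }
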
@@ -390,7 +390,7 @@ bool ARMBaseRegisterInfo::getRegAllocationHints( if (Reg == PairedPhys || (getEncodingValue(Reg) & 1) != Odd) continue; // Don't provide hints that are paired to a reserved register. - MCPhysReg Paired = getPairedGPR(Reg, !Odd, this); + MCRegister Paired = getPairedGPR(Reg, !Odd, this); if (!Paired || MRI.isReserved(Paired)) continue; Hints.push_back(Reg); diff --git llvm/lib/Target/ARM/ARMBaseRegisterInfo.h llvm/lib/Target/ARM/ARMBaseRegisterInfo.h index 478c32fa724f..68a28043fd32 100644 --- llvm/lib/Target/ARM/ARMBaseRegisterInfo.h +++ llvm/lib/Target/ARM/ARMBaseRegisterInfo.h @@ -41,7 +41,7 @@ namespace ARMRI { } // end namespace ARMRI -static inline bool isCalleeSavedRegister(unsigned Reg, +static inline bool isCalleeSavedRegister(MCRegister Reg, const MCPhysReg *CSRegs) { for (unsigned i = 0; CSRegs[i]; ++i) if (Reg == CSRegs[i]) diff --git llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp index 2e5dc09c00ce..9e1f3fcbdc43 100644 --- llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -516,8 +516,8 @@ static const NEONLdStTableEntry *LookupNEONLdSt(unsigned Opcode) { /// corresponding to the specified register spacing. Not all of the results /// are necessarily valid, e.g., a Q register only has 2 D subregisters. static void GetDSubRegs(unsigned Reg, NEONRegSpacing RegSpc, - const TargetRegisterInfo *TRI, unsigned &D0, - unsigned &D1, unsigned &D2, unsigned &D3) { + const TargetRegisterInfo *TRI, MCRegister &D0, + MCRegister &D1, MCRegister &D2, MCRegister &D3) { if (RegSpc == SingleSpc || RegSpc == SingleLowSpc) { D0 = TRI->getSubReg(Reg, ARM::dsub_0); D1 = TRI->getSubReg(Reg, ARM::dsub_1); @@ -585,11 +585,11 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) { SubRegIndex = ARM::dsub_1; } Register SubReg = TRI->getSubReg(DstReg, SubRegIndex); - unsigned DstRegPair = TRI->getMatchingSuperReg(SubReg, ARM::dsub_0, - &ARM::DPairSpcRegClass); + MCRegister DstRegPair = + TRI->getMatchingSuperReg(SubReg, ARM::dsub_0, &ARM::DPairSpcRegClass); MIB.addReg(DstRegPair, RegState::Define | getDeadRegState(DstIsDead)); } else { - unsigned D0, D1, D2, D3; + MCRegister D0, D1, D2, D3; GetDSubRegs(DstReg, RegSpc, TRI, D0, D1, D2, D3); MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead)); if (NumRegs > 1 && TableEntry->copyAllListRegs) @@ -715,7 +715,7 @@ void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI) { bool SrcIsKill = MI.getOperand(OpIdx).isKill(); bool SrcIsUndef = MI.getOperand(OpIdx).isUndef(); Register SrcReg = MI.getOperand(OpIdx++).getReg(); - unsigned D0, D1, D2, D3; + MCRegister D0, D1, D2, D3; GetDSubRegs(SrcReg, RegSpc, TRI, D0, D1, D2, D3); MIB.addReg(D0, getUndefRegState(SrcIsUndef)); if (NumRegs > 1 && TableEntry->copyAllListRegs) @@ -769,7 +769,7 @@ void ARMExpandPseudo::ExpandLaneOp(MachineBasicBlock::iterator &MBBI) { } assert(Lane < RegElts && "out of range lane for VLD/VST-lane"); - unsigned D0 = 0, D1 = 0, D2 = 0, D3 = 0; + MCRegister D0, D1, D2, D3; unsigned DstReg = 0; bool DstIsDead = false; if (TableEntry->IsLoad) { @@ -851,7 +851,7 @@ void ARMExpandPseudo::ExpandVTBL(MachineBasicBlock::iterator &MBBI, bool SrcIsKill = MI.getOperand(OpIdx).isKill(); Register SrcReg = MI.getOperand(OpIdx++).getReg(); - unsigned D0, D1, D2, D3; + MCRegister D0, D1, D2, D3; GetDSubRegs(SrcReg, SingleSpc, TRI, D0, D1, D2, D3); MIB.addReg(D0); @@ -1547,7 +1547,7 @@ void ARMExpandPseudo::CMSESaveClearFPRegsV8( } else { // For big-endian targets we 
need to load the two subregisters of Reg // manually because VLDRD would load them in wrong order - unsigned SReg0 = TRI->getSubReg(Reg, ARM::ssub_0); + MCRegister SReg0 = TRI->getSubReg(Reg, ARM::ssub_0); BuildMI(MBB, MBBI, DL, TII->get(ARM::VLDRS), SReg0) .addReg(ARM::SP) .addImm((Reg - ARM::D0) * 2) diff --git llvm/lib/Target/ARM/ARMFrameLowering.cpp llvm/lib/Target/ARM/ARMFrameLowering.cpp index 8b94bfac9b0c..3393c55f1639 100644 --- llvm/lib/Target/ARM/ARMFrameLowering.cpp +++ llvm/lib/Target/ARM/ARMFrameLowering.cpp @@ -1917,8 +1917,8 @@ static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB, // 16-byte aligned vst1.64 with 4 d-regs and address writeback. // The writeback is only needed when emitting two vst1.64 instructions. if (NumAlignedDPRCS2Regs >= 6) { - unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0, - &ARM::QQPRRegClass); + MCRegister SupReg = + TRI->getMatchingSuperReg(NextReg, ARM::dsub_0, &ARM::QQPRRegClass); MBB.addLiveIn(SupReg); BuildMI(MBB, MI, DL, TII.get(ARM::VST1d64Qwb_fixed), ARM::R4) .addReg(ARM::R4, RegState::Kill) @@ -1936,8 +1936,8 @@ static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB, // 16-byte aligned vst1.64 with 4 d-regs, no writeback. if (NumAlignedDPRCS2Regs >= 4) { - unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0, - &ARM::QQPRRegClass); + MCRegister SupReg = + TRI->getMatchingSuperReg(NextReg, ARM::dsub_0, &ARM::QQPRRegClass); MBB.addLiveIn(SupReg); BuildMI(MBB, MI, DL, TII.get(ARM::VST1d64Q)) .addReg(ARM::R4) @@ -1951,8 +1951,8 @@ static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB, // 16-byte aligned vst1.64 with 2 d-regs. if (NumAlignedDPRCS2Regs >= 2) { - unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0, - &ARM::QPRRegClass); + MCRegister SupReg = + TRI->getMatchingSuperReg(NextReg, ARM::dsub_0, &ARM::QPRRegClass); MBB.addLiveIn(SupReg); BuildMI(MBB, MI, DL, TII.get(ARM::VST1q64)) .addReg(ARM::R4) @@ -2049,8 +2049,8 @@ static void emitAlignedDPRCS2Restores(MachineBasicBlock &MBB, // 16-byte aligned vld1.64 with 4 d-regs and writeback. if (NumAlignedDPRCS2Regs >= 6) { - unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0, - &ARM::QQPRRegClass); + MCRegister SupReg = + TRI->getMatchingSuperReg(NextReg, ARM::dsub_0, &ARM::QQPRRegClass); BuildMI(MBB, MI, DL, TII.get(ARM::VLD1d64Qwb_fixed), NextReg) .addReg(ARM::R4, RegState::Define) .addReg(ARM::R4, RegState::Kill) @@ -2067,8 +2067,8 @@ static void emitAlignedDPRCS2Restores(MachineBasicBlock &MBB, // 16-byte aligned vld1.64 with 4 d-regs, no writeback. if (NumAlignedDPRCS2Regs >= 4) { - unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0, - &ARM::QQPRRegClass); + MCRegister SupReg = + TRI->getMatchingSuperReg(NextReg, ARM::dsub_0, &ARM::QQPRRegClass); BuildMI(MBB, MI, DL, TII.get(ARM::VLD1d64Q), NextReg) .addReg(ARM::R4) .addImm(16) @@ -2080,8 +2080,8 @@ static void emitAlignedDPRCS2Restores(MachineBasicBlock &MBB, // 16-byte aligned vld1.64 with 2 d-regs. 
if (NumAlignedDPRCS2Regs >= 2) { - unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0, - &ARM::QPRRegClass); + MCRegister SupReg = + TRI->getMatchingSuperReg(NextReg, ARM::dsub_0, &ARM::QPRRegClass); BuildMI(MBB, MI, DL, TII.get(ARM::VLD1q64), SupReg) .addReg(ARM::R4) .addImm(16) diff --git llvm/lib/Target/ARM/ARMISelLowering.cpp llvm/lib/Target/ARM/ARMISelLowering.cpp index 2e517c21fc4a..bd8d6079e1ba 100644 --- llvm/lib/Target/ARM/ARMISelLowering.cpp +++ llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -3241,7 +3241,7 @@ bool ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, - LLVMContext &Context) const { + LLVMContext &Context, const Type *RetTy) const { SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg)); diff --git llvm/lib/Target/ARM/ARMISelLowering.h llvm/lib/Target/ARM/ARMISelLowering.h index 3c1a414af859..9fad056edd3f 100644 --- llvm/lib/Target/ARM/ARMISelLowering.h +++ llvm/lib/Target/ARM/ARMISelLowering.h @@ -965,7 +965,7 @@ class VectorType; bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, - LLVMContext &Context) const override; + LLVMContext &Context, const Type *RetTy) const override; SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, diff --git llvm/lib/Target/ARM/ARMSubtarget.cpp llvm/lib/Target/ARM/ARMSubtarget.cpp index 07207e29bf7d..893084785e6f 100644 --- llvm/lib/Target/ARM/ARMSubtarget.cpp +++ llvm/lib/Target/ARM/ARMSubtarget.cpp @@ -478,7 +478,7 @@ unsigned ARMSubtarget::getGPRAllocationOrder(const MachineFunction &MF) const { } bool ARMSubtarget::ignoreCSRForAllocationOrder(const MachineFunction &MF, - unsigned PhysReg) const { + MCRegister PhysReg) const { // To minimize code size in Thumb2, we prefer the usage of low regs (lower // cost per use) so we can use narrow encoding. By default, caller-saved // registers (e.g. 
lr, r12) are always allocated first, regardless of diff --git llvm/lib/Target/ARM/ARMSubtarget.h llvm/lib/Target/ARM/ARMSubtarget.h index 611eeac9ef71..7329d3f2055f 100644 --- llvm/lib/Target/ARM/ARMSubtarget.h +++ llvm/lib/Target/ARM/ARMSubtarget.h @@ -523,7 +523,7 @@ public: } bool ignoreCSRForAllocationOrder(const MachineFunction &MF, - unsigned PhysReg) const override; + MCRegister PhysReg) const override; unsigned getGPRAllocationOrder(const MachineFunction &MF) const; }; diff --git llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 639f3bf8fc62..6b3fa0479806 100644 --- llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -83,9 +83,8 @@ static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign, if (!isPowerOf2_32(Alignment)) return nullptr; - auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0), - PointerType::get(II.getType(), 0)); - return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment)); + return Builder.CreateAlignedLoad(II.getType(), II.getArgOperand(0), + Align(Alignment)); } bool ARMTTIImpl::areInlineCompatible(const Function *Caller, diff --git llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp index d0e759d3356f..6e2886a19292 100644 --- llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp +++ llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -955,7 +955,7 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm, bool ARMAsmBackend::shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, - const MCValue &Target, + const MCValue &Target, const uint64_t, const MCSubtargetInfo *STI) { const MCSymbolRefExpr *A = Target.getSymA(); const MCSymbol *Sym = A ? 
&A->getSymbol() : nullptr; diff --git llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h index f33cd8b7c242..2932e68cd98e 100644 --- llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h +++ llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h @@ -36,7 +36,7 @@ public: const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override; bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, - const MCValue &Target, + const MCValue &Target, const uint64_t Value, const MCSubtargetInfo *STI) override; unsigned adjustFixupValue(const MCAssembler &Asm, const MCFixup &Fixup, diff --git llvm/lib/Target/AVR/AVRISelLowering.cpp llvm/lib/Target/AVR/AVRISelLowering.cpp index 07c79f6f227b..c73ff83d2978 100644 --- llvm/lib/Target/AVR/AVRISelLowering.cpp +++ llvm/lib/Target/AVR/AVRISelLowering.cpp @@ -1670,7 +1670,8 @@ SDValue AVRTargetLowering::LowerCallResult( bool AVRTargetLowering::CanLowerReturn( CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, - const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { + const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context, + const Type *RetTy) const { if (CallConv == CallingConv::AVR_BUILTIN) { SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); diff --git llvm/lib/Target/AVR/AVRISelLowering.h llvm/lib/Target/AVR/AVRISelLowering.h index f60579593453..cd45444e2bc3 100644 --- llvm/lib/Target/AVR/AVRISelLowering.h +++ llvm/lib/Target/AVR/AVRISelLowering.h @@ -172,7 +172,7 @@ private: bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, - LLVMContext &Context) const override; + LLVMContext &Context, const Type *RetTy) const override; SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, diff --git llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp index fd35f8fcb8e7..fbed25157a44 100644 --- llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp +++ llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp @@ -31,22 +31,6 @@ namespace adjust { using namespace llvm; -static void signed_width(unsigned Width, uint64_t Value, - std::string Description, const MCFixup &Fixup, - MCContext *Ctx) { - if (!isIntN(Width, Value)) { - std::string Diagnostic = "out of range " + Description; - - int64_t Min = minIntN(Width); - int64_t Max = maxIntN(Width); - - Diagnostic += " (expected an integer in the range " + std::to_string(Min) + - " to " + std::to_string(Max) + ")"; - - Ctx->reportError(Fixup.getLoc(), Diagnostic); - } -} - static void unsigned_width(unsigned Width, uint64_t Value, std::string Description, const MCFixup &Fixup, MCContext *Ctx) { @@ -74,8 +58,8 @@ static void adjustBranch(unsigned Size, const MCFixup &Fixup, uint64_t &Value, } /// Adjusts the value of a relative branch target before fixup application. -static void adjustRelativeBranch(unsigned Size, const MCFixup &Fixup, - uint64_t &Value, MCContext *Ctx) { +static bool adjustRelativeBranch(unsigned Size, const MCFixup &Fixup, + uint64_t &Value, const MCSubtargetInfo *STI) { // Jumps are relative to the current instruction. Value -= 2; @@ -83,8 +67,9 @@ static void adjustRelativeBranch(unsigned Size, const MCFixup &Fixup, // one. 
Size += 1; - if (!isIntN(Size, Value) && - Ctx->getSubtargetInfo()->hasFeature(AVR::FeatureWrappingRjmp)) { + assert(STI && "STI cannot be NULL"); + + if (!isIntN(Size, Value) && STI->hasFeature(AVR::FeatureWrappingRjmp)) { const int32_t FlashSize = 0x2000; int32_t SignedValue = Value; @@ -96,10 +81,14 @@ static void adjustRelativeBranch(unsigned Size, const MCFixup &Fixup, } } - signed_width(Size, Value, std::string("branch target"), Fixup, Ctx); + if (!isIntN(Size, Value)) { + return false; + } // Rightshifts the value by one. AVR::fixups::adjustBranchTarget(Value); + + return true; } /// 22-bit absolute fixup. @@ -126,7 +115,9 @@ static void fixup_call(unsigned Size, const MCFixup &Fixup, uint64_t &Value, /// Offset of 0 (so the result is left shifted by 3 bits before application). static void fixup_7_pcrel(unsigned Size, const MCFixup &Fixup, uint64_t &Value, MCContext *Ctx) { - adjustRelativeBranch(Size, Fixup, Value, Ctx); + if (!adjustRelativeBranch(Size, Fixup, Value, Ctx->getSubtargetInfo())) { + llvm_unreachable("should've been emitted as a relocation"); + } // Because the value may be negative, we must mask out the sign bits Value &= 0x7f; @@ -140,7 +131,9 @@ static void fixup_7_pcrel(unsigned Size, const MCFixup &Fixup, uint64_t &Value, /// Offset of 0 (so the result isn't left-shifted before application). static void fixup_13_pcrel(unsigned Size, const MCFixup &Fixup, uint64_t &Value, MCContext *Ctx) { - adjustRelativeBranch(Size, Fixup, Value, Ctx); + if (!adjustRelativeBranch(Size, Fixup, Value, Ctx->getSubtargetInfo())) { + llvm_unreachable("should've been emitted as a relocation"); + } // Because the value may be negative, we must mask out the sign bits Value &= 0xfff; @@ -181,7 +174,7 @@ static void fixup_port5(const MCFixup &Fixup, uint64_t &Value, MCContext *Ctx) { Value <<= 3; } -/// 6-bit port number fixup on the `IN` family of instructions. +/// 6-bit port number fixup on the IN family of instructions. /// /// Resolves to: /// 1011 0AAd dddd AAAA @@ -512,14 +505,25 @@ bool AVRAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count, bool AVRAsmBackend::shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target, + const uint64_t Value, const MCSubtargetInfo *STI) { switch ((unsigned)Fixup.getKind()) { default: return Fixup.getKind() >= FirstLiteralRelocationKind; + case AVR::fixup_7_pcrel: - case AVR::fixup_13_pcrel: - // Always resolve relocations for PC-relative branches - return false; + case AVR::fixup_13_pcrel: { + uint64_t ValueEx = Value; + uint64_t Size = AVRAsmBackend::getFixupKindInfo(Fixup.getKind()).TargetSize; + + // If the jump is too large to encode, fall back to a relocation. + // + // Note that trying to actually link that relocation *would* fail, but the + // hope is that the module we're currently compiling won't actually be + // linked into the final binary.
+ return !adjust::adjustRelativeBranch(Size, Fixup, ValueEx, STI); + } + case AVR::fixup_call: return true; } diff --git llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h index 233731959032..1a9ae94f2f49 100644 --- llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h +++ llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h @@ -53,7 +53,7 @@ public: const MCSubtargetInfo *STI) const override; bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, - const MCValue &Target, + const MCValue &Target, const uint64_t Value, const MCSubtargetInfo *STI) override; private: diff --git llvm/lib/Target/CSKY/CSKYISelLowering.cpp llvm/lib/Target/CSKY/CSKYISelLowering.cpp index c3fc9f9ead5e..4cea262d40a3 100644 --- llvm/lib/Target/CSKY/CSKYISelLowering.cpp +++ llvm/lib/Target/CSKY/CSKYISelLowering.cpp @@ -426,7 +426,8 @@ SDValue CSKYTargetLowering::LowerFormalArguments( bool CSKYTargetLowering::CanLowerReturn( CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg, - const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { + const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context, + const Type *RetTy) const { SmallVector<CCValAssign, 16> CSKYLocs; CCState CCInfo(CallConv, IsVarArg, MF, CSKYLocs, Context); return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)); diff --git llvm/lib/Target/CSKY/CSKYISelLowering.h llvm/lib/Target/CSKY/CSKYISelLowering.h index d59481af3c5b..0accfcad1879 100644 --- llvm/lib/Target/CSKY/CSKYISelLowering.h +++ llvm/lib/Target/CSKY/CSKYISelLowering.h @@ -61,7 +61,7 @@ private: bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, - LLVMContext &Context) const override; + LLVMContext &Context, const Type *RetTy) const override; SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, diff --git llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp index dd06971e1cf9..ebe12fa6afd1 100644 --- llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp +++ llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp @@ -262,6 +262,7 @@ bool CSKYAsmBackend::mayNeedRelaxation(const MCInst &Inst, bool CSKYAsmBackend::shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target, + const uint64_t /*Value*/, const MCSubtargetInfo * /*STI*/) { if (Fixup.getKind() >= FirstLiteralRelocationKind) return true; diff --git llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h index 4b659f401d25..faa84a6ef71d 100644 --- llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h +++ llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h @@ -52,7 +52,7 @@ public: const MCSubtargetInfo *STI) const override; bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, - const MCValue &Target, + const MCValue &Target, const uint64_t Value, const MCSubtargetInfo *STI) override; std::unique_ptr<MCObjectTargetWriter> diff --git llvm/lib/Target/DirectX/DXILShaderFlags.cpp llvm/lib/Target/DirectX/DXILShaderFlags.cpp index b1ff975d4dae..6a15bac153d8 100644 --- llvm/lib/Target/DirectX/DXILShaderFlags.cpp +++ llvm/lib/Target/DirectX/DXILShaderFlags.cpp @@ -64,11 +64,22 @@ void ModuleShaderFlags::updateFunctionFlags(ComputedShaderFlags &CSF, switch (II->getIntrinsicID()) { default: break; + case Intrinsic::dx_resource_handlefrombinding: + switch 
(DRTM[cast<TargetExtType>(II->getType())].getResourceKind()) { + case dxil::ResourceKind::StructuredBuffer: + case dxil::ResourceKind::RawBuffer: + CSF.EnableRawAndStructuredBuffers = true; + break; + default: + break; + } + break; case Intrinsic::dx_resource_load_typedbuffer: { dxil::ResourceTypeInfo &RTI = DRTM[cast<TargetExtType>(II->getArgOperand(0)->getType())]; if (RTI.isTyped()) CSF.TypedUAVLoadAdditionalFormats |= RTI.getTyped().ElementCount > 1; + break; } } } diff --git llvm/lib/Target/Hexagon/HexagonISelLowering.cpp llvm/lib/Target/Hexagon/HexagonISelLowering.cpp index a19f9749cd9e..12ca0c505bd0 100644 --- llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -182,7 +182,7 @@ bool HexagonTargetLowering::CanLowerReturn( CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, - LLVMContext &Context) const { + LLVMContext &Context, const Type *RetTy) const { SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context); diff --git llvm/lib/Target/Hexagon/HexagonISelLowering.h llvm/lib/Target/Hexagon/HexagonISelLowering.h index 3fd961f5a746..aaa9c65c1e07 100644 --- llvm/lib/Target/Hexagon/HexagonISelLowering.h +++ llvm/lib/Target/Hexagon/HexagonISelLowering.h @@ -249,7 +249,7 @@ public: bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, - LLVMContext &Context) const override; + LLVMContext &Context, const Type *RetTy) const override; SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, diff --git llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp index 7864d45d594a..98b1dde8fa3f 100644 --- llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp +++ llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp @@ -201,7 +201,7 @@ public: } bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, - const MCValue &Target, + const MCValue &Target, const uint64_t, const MCSubtargetInfo *STI) override { switch(Fixup.getTargetKind()) { default: diff --git llvm/lib/Target/Lanai/LanaiISelLowering.cpp llvm/lib/Target/Lanai/LanaiISelLowering.cpp index da55b7b8c6d6..e0792b36ce4d 100644 --- llvm/lib/Target/Lanai/LanaiISelLowering.cpp +++ llvm/lib/Target/Lanai/LanaiISelLowering.cpp @@ -527,7 +527,8 @@ SDValue LanaiTargetLowering::LowerCCCArguments( bool LanaiTargetLowering::CanLowerReturn( CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg, - const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { + const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context, + const Type *RetTy) const { SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context); diff --git llvm/lib/Target/Lanai/LanaiISelLowering.h llvm/lib/Target/Lanai/LanaiISelLowering.h index 5fa5444b5161..ebec2525b93c 100644 --- llvm/lib/Target/Lanai/LanaiISelLowering.h +++ llvm/lib/Target/Lanai/LanaiISelLowering.h @@ -93,7 +93,7 @@ public: bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, - LLVMContext &Context) const override; + LLVMContext &Context, const Type *RetTy) const override; Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override; diff --git 
llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp index 0218934ea334..c2d73a260b1c 100644 --- llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp +++ llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp @@ -187,18 +187,21 @@ bool LoongArchPreRAExpandPseudo::expandPcalau12iInstPair( MachineInstr &MI = *MBBI; DebugLoc DL = MI.getDebugLoc(); + const auto &STI = MF->getSubtarget<LoongArchSubtarget>(); + bool EnableRelax = STI.hasFeature(LoongArch::FeatureRelax); + Register DestReg = MI.getOperand(0).getReg(); Register ScratchReg = MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass); MachineOperand &Symbol = MI.getOperand(1); BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCALAU12I), ScratchReg) - .addDisp(Symbol, 0, FlagsHi); + .addDisp(Symbol, 0, LoongArchII::encodeFlags(FlagsHi, EnableRelax)); MachineInstr *SecondMI = BuildMI(MBB, MBBI, DL, TII->get(SecondOpcode), DestReg) .addReg(ScratchReg) - .addDisp(Symbol, 0, FlagsLo); + .addDisp(Symbol, 0, LoongArchII::encodeFlags(FlagsLo, EnableRelax)); if (MI.hasOneMemOperand()) SecondMI->addMemOperand(*MF, *MI.memoperands_begin()); @@ -481,6 +484,7 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSDesc( unsigned ADD = STI.is64Bit() ? LoongArch::ADD_D : LoongArch::ADD_W; unsigned ADDI = STI.is64Bit() ? LoongArch::ADDI_D : LoongArch::ADDI_W; unsigned LD = STI.is64Bit() ? LoongArch::LD_D : LoongArch::LD_W; + bool EnableRelax = STI.hasFeature(LoongArch::FeatureRelax); Register DestReg = MI.getOperand(0).getReg(); Register Tmp1Reg = @@ -488,7 +492,9 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSDesc( MachineOperand &Symbol = MI.getOperand(Large ? 2 : 1); BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCALAU12I), Tmp1Reg) - .addDisp(Symbol, 0, LoongArchII::MO_DESC_PC_HI); + .addDisp(Symbol, 0, + LoongArchII::encodeFlags(LoongArchII::MO_DESC_PC_HI, + EnableRelax && !Large)); if (Large) { // Code Sequence: @@ -526,19 +532,25 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSDesc( // pcalau12i $a0, %desc_pc_hi20(sym) // addi.w/d $a0, $a0, %desc_pc_lo12(sym) // ld.w/d $ra, $a0, %desc_ld(sym) - // jirl $ra, $ra, %desc_ld(sym) - // add.d $dst, $a0, $tp + // jirl $ra, $ra, %desc_call(sym) + // add.w/d $dst, $a0, $tp BuildMI(MBB, MBBI, DL, TII->get(ADDI), LoongArch::R4) .addReg(Tmp1Reg) - .addDisp(Symbol, 0, LoongArchII::MO_DESC_PC_LO); + .addDisp( + Symbol, 0, + LoongArchII::encodeFlags(LoongArchII::MO_DESC_PC_LO, EnableRelax)); } BuildMI(MBB, MBBI, DL, TII->get(LD), LoongArch::R1) .addReg(LoongArch::R4) - .addDisp(Symbol, 0, LoongArchII::MO_DESC_LD); + .addDisp(Symbol, 0, + LoongArchII::encodeFlags(LoongArchII::MO_DESC_LD, + EnableRelax && !Large)); BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PseudoDESC_CALL), LoongArch::R1) .addReg(LoongArch::R1) - .addDisp(Symbol, 0, LoongArchII::MO_DESC_CALL); + .addDisp(Symbol, 0, + LoongArchII::encodeFlags(LoongArchII::MO_DESC_CALL, + EnableRelax && !Large)); BuildMI(MBB, MBBI, DL, TII->get(ADD), DestReg) .addReg(LoongArch::R4) .addReg(LoongArch::R2); diff --git llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp index d330f9535560..cb0fb9bc9c7f 100644 --- llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp +++ llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp @@ -245,6 +245,28 @@ bool LoongArchDAGToDAGISel::selectNonFIBaseAddr(SDValue Addr, SDValue &Base) { return true; } +bool LoongArchDAGToDAGISel::SelectAddrRegImm12(SDValue Addr, SDValue &Base, + SDValue &Offset) { + 
SDLoc DL(Addr); + MVT VT = Addr.getSimpleValueType(); + + // The address is the result of an ADD. Here we only consider reg+simm12. + if (CurDAG->isBaseWithConstantOffset(Addr)) { + int64_t Imm = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue(); + if (isInt<12>(Imm)) { + Base = Addr.getOperand(0); + Offset = CurDAG->getTargetConstant(SignExtend64<12>(Imm), DL, VT); + return true; + } + } + + // Otherwise, we treat Addr as the base address and use constant 0 as the + // offset. + Base = Addr; + Offset = CurDAG->getTargetConstant(0, DL, VT); + return true; +} + bool LoongArchDAGToDAGISel::selectShiftMask(SDValue N, unsigned ShiftWidth, SDValue &ShAmt) { // Shift instructions on LoongArch only read the lower 5 or 6 bits of the diff --git llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.h llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.h index 363b4f0ca7cf..8a7eba418d80 100644 --- llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.h +++ llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.h @@ -43,6 +43,7 @@ public: bool SelectBaseAddr(SDValue Addr, SDValue &Base); bool SelectAddrConstant(SDValue Addr, SDValue &Base, SDValue &Offset); bool selectNonFIBaseAddr(SDValue Addr, SDValue &Base); + bool SelectAddrRegImm12(SDValue Addr, SDValue &Base, SDValue &Offset); bool selectShiftMask(SDValue N, unsigned ShiftWidth, SDValue &ShAmt); bool selectShiftMaskGRLen(SDValue N, SDValue &ShAmt) { diff --git llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 96e6f71344a7..f9f1b097623e 100644 --- llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -99,6 +99,8 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + setOperationAction(ISD::PREFETCH, MVT::Other, Legal); + // Expand bitreverse.i16 with native-width bitrev and shift for now, before // we get to know which of sll and revb.2h is faster. setOperationAction(ISD::BITREVERSE, MVT::i8, Custom); @@ -466,11 +468,10 @@ SDValue LoongArchTargetLowering::lowerBITREVERSE(SDValue Op, for (unsigned int i = 0; i < NewEltNum; i++) { SDValue Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, NewSrc, DAG.getConstant(i, DL, MVT::i64)); - SDValue RevOp = DAG.getNode((ResTy == MVT::v16i8 || ResTy == MVT::v32i8) - ? LoongArchISD::BITREV_8B - : ISD::BITREVERSE, - DL, MVT::i64, Op); - Ops.push_back(RevOp); + unsigned RevOp = (ResTy == MVT::v16i8 || ResTy == MVT::v32i8) + ?
(unsigned)LoongArchISD::BITREV_8B + : (unsigned)ISD::BITREVERSE; + Ops.push_back(DAG.getNode(RevOp, DL, MVT::i64, Op)); } SDValue Res = DAG.getNode(ISD::BITCAST, DL, ResTy, DAG.getBuildVector(NewVT, DL, Ops)); @@ -5676,7 +5677,8 @@ LoongArchTargetLowering::LowerCall(CallLoweringInfo &CLI, bool LoongArchTargetLowering::CanLowerReturn( CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg, - const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { + const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context, + const Type *RetTy) const { SmallVector<CCValAssign> RVLocs; CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context); diff --git llvm/lib/Target/LoongArch/LoongArchISelLowering.h llvm/lib/Target/LoongArch/LoongArchISelLowering.h index e619cb69f333..e1bab9ebdd3f 100644 --- llvm/lib/Target/LoongArch/LoongArchISelLowering.h +++ llvm/lib/Target/LoongArch/LoongArchISelLowering.h @@ -183,7 +183,7 @@ public: bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, - LLVMContext &Context) const override; + LLVMContext &Context, const Type *RetTy) const override; SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL, diff --git llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp index 32bc8bb80129..4e49ba6e339a 100644 --- llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp +++ llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp @@ -455,6 +455,83 @@ bool LoongArchInstrInfo::isSchedulingBoundary(const MachineInstr &MI, break; } + const auto &STI = MF.getSubtarget<LoongArchSubtarget>(); + if (STI.hasFeature(LoongArch::FeatureRelax)) { + // When linker relaxation is enabled, the following instruction patterns are + // prohibited from being reordered: + // + // * pcalau12i $a0, %pc_hi20(s) + // addi.w/d $a0, $a0, %pc_lo12(s) + // + // * pcalau12i $a0, %got_pc_hi20(s) + // ld.w/d $a0, $a0, %got_pc_lo12(s) + // + // * pcalau12i $a0, %ld_pc_hi20(s) | %gd_pc_hi20(s) + // addi.w/d $a0, $a0, %got_pc_lo12(s) + // + // * pcalau12i $a0, %desc_pc_hi20(s) + // addi.w/d $a0, $a0, %desc_pc_lo12(s) + // ld.w/d $ra, $a0, %desc_ld(s) + // jirl $ra, $ra, %desc_call(s) + unsigned AddiOp = STI.is64Bit() ? LoongArch::ADDI_D : LoongArch::ADDI_W; + unsigned LdOp = STI.is64Bit() ?
LoongArch::LD_D : LoongArch::LD_W; + switch (MI.getOpcode()) { + case LoongArch::PCALAU12I: { + auto MO0 = LoongArchII::getDirectFlags(MI.getOperand(1)); + auto SecondOp = std::next(MII); + if (MO0 == LoongArchII::MO_DESC_PC_HI) { + if (SecondOp == MIE || SecondOp->getOpcode() != AddiOp) + break; + auto Ld = std::next(SecondOp); + if (Ld == MIE || Ld->getOpcode() != LdOp) + break; + auto MO1 = LoongArchII::getDirectFlags(SecondOp->getOperand(2)); + auto MO2 = LoongArchII::getDirectFlags(Ld->getOperand(2)); + if (MO1 == LoongArchII::MO_DESC_PC_LO && MO2 == LoongArchII::MO_DESC_LD) + return true; + break; + } + if (SecondOp == MIE || + (SecondOp->getOpcode() != AddiOp && SecondOp->getOpcode() != LdOp)) + break; + auto MO1 = LoongArchII::getDirectFlags(SecondOp->getOperand(2)); + if (MO0 == LoongArchII::MO_PCREL_HI && SecondOp->getOpcode() == AddiOp && + MO1 == LoongArchII::MO_PCREL_LO) + return true; + if (MO0 == LoongArchII::MO_GOT_PC_HI && SecondOp->getOpcode() == LdOp && + MO1 == LoongArchII::MO_GOT_PC_LO) + return true; + if ((MO0 == LoongArchII::MO_LD_PC_HI || + MO0 == LoongArchII::MO_GD_PC_HI) && + SecondOp->getOpcode() == AddiOp && MO1 == LoongArchII::MO_GOT_PC_LO) + return true; + break; + } + case LoongArch::ADDI_W: + case LoongArch::ADDI_D: { + auto MO = LoongArchII::getDirectFlags(MI.getOperand(2)); + if (MO == LoongArchII::MO_PCREL_LO || MO == LoongArchII::MO_GOT_PC_LO) + return true; + break; + } + case LoongArch::LD_W: + case LoongArch::LD_D: { + auto MO = LoongArchII::getDirectFlags(MI.getOperand(2)); + if (MO == LoongArchII::MO_GOT_PC_LO) + return true; + break; + } + case LoongArch::PseudoDESC_CALL: { + auto MO = LoongArchII::getDirectFlags(MI.getOperand(2)); + if (MO == LoongArchII::MO_DESC_CALL) + return true; + break; + } + default: + break; + } + } + return false; } @@ -630,7 +707,8 @@ bool LoongArchInstrInfo::reverseBranchCondition( std::pair<unsigned, unsigned> LoongArchInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { - return std::make_pair(TF, 0u); + const unsigned Mask = LoongArchII::MO_DIRECT_FLAG_MASK; + return std::make_pair(TF & Mask, TF & ~Mask); } ArrayRef<std::pair<unsigned, const char *>> @@ -656,20 +734,29 @@ LoongArchInstrInfo::getSerializableDirectMachineOperandTargetFlags() const { {MO_IE_PC_LO, "loongarch-ie-pc-lo"}, {MO_IE_PC64_LO, "loongarch-ie-pc64-lo"}, {MO_IE_PC64_HI, "loongarch-ie-pc64-hi"}, + {MO_LD_PC_HI, "loongarch-ld-pc-hi"}, + {MO_GD_PC_HI, "loongarch-gd-pc-hi"}, + {MO_CALL36, "loongarch-call36"}, {MO_DESC_PC_HI, "loongarch-desc-pc-hi"}, {MO_DESC_PC_LO, "loongarch-desc-pc-lo"}, {MO_DESC64_PC_LO, "loongarch-desc64-pc-lo"}, {MO_DESC64_PC_HI, "loongarch-desc64-pc-hi"}, {MO_DESC_LD, "loongarch-desc-ld"}, {MO_DESC_CALL, "loongarch-desc-call"}, - {MO_LD_PC_HI, "loongarch-ld-pc-hi"}, - {MO_GD_PC_HI, "loongarch-gd-pc-hi"}, {MO_LE_HI_R, "loongarch-le-hi-r"}, {MO_LE_ADD_R, "loongarch-le-add-r"}, {MO_LE_LO_R, "loongarch-le-lo-r"}}; return ArrayRef(TargetFlags); } +ArrayRef<std::pair<unsigned, const char *>> +LoongArchInstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { + using namespace LoongArchII; + static const std::pair<unsigned, const char *> TargetFlags[] = { + {MO_RELAX, "loongarch-relax"}}; + return ArrayRef(TargetFlags); +} + // Returns true if this is the sext.w pattern, addi.w rd, rs, 0. 
bool LoongArch::isSEXT_W(const MachineInstr &MI) { return MI.getOpcode() == LoongArch::ADDI_W && MI.getOperand(1).isReg() && diff --git llvm/lib/Target/LoongArch/LoongArchInstrInfo.h llvm/lib/Target/LoongArch/LoongArchInstrInfo.h index ef9970783107..a5b31878bfa1 100644 --- llvm/lib/Target/LoongArch/LoongArchInstrInfo.h +++ llvm/lib/Target/LoongArch/LoongArchInstrInfo.h @@ -91,6 +91,9 @@ public: ArrayRef<std::pair<unsigned, const char *>> getSerializableDirectMachineOperandTargetFlags() const override; + ArrayRef<std::pair<unsigned, const char *>> + getSerializableBitmaskMachineOperandTargetFlags() const override; + protected: const LoongArchSubtarget &STI; }; diff --git llvm/lib/Target/LoongArch/LoongArchInstrInfo.td llvm/lib/Target/LoongArch/LoongArchInstrInfo.td index 2101aa058305..62cb6fa1d88a 100644 --- llvm/lib/Target/LoongArch/LoongArchInstrInfo.td +++ llvm/lib/Target/LoongArch/LoongArchInstrInfo.td @@ -522,6 +522,7 @@ def HI16ForAddu16idAddiPair: SDNodeXForm<imm, [{ def BaseAddr : ComplexPattern<iPTR, 1, "SelectBaseAddr">; def AddrConstant : ComplexPattern<iPTR, 2, "SelectAddrConstant">; def NonFIBaseAddr : ComplexPattern<iPTR, 1, "selectNonFIBaseAddr">; +def AddrRegImm : ComplexPattern<iPTR, 2, "SelectAddrRegImm12">; def fma_nsz : PatFrag<(ops node:$fj, node:$fk, node:$fa), (fma node:$fj, node:$fk, node:$fa), [{ @@ -2011,6 +2012,14 @@ class PseudoMaskedAMMinMax def PseudoMaskedAtomicLoadMax32 : PseudoMaskedAMMinMax; def PseudoMaskedAtomicLoadMin32 : PseudoMaskedAMMinMax; +// Data prefetch + +// TODO: Support the preldx instruction. +def : Pat<(prefetch (AddrRegImm GPR:$rj, simm12:$imm12), (i32 0), timm, (i32 1)), + (PRELD 0, GPR:$rj, simm12:$imm12)>; // data prefetch for loads +def : Pat<(prefetch (AddrRegImm GPR:$rj, simm12:$imm12), (i32 1), timm, (i32 1)), + (PRELD 8, GPR:$rj, simm12:$imm12)>; // data prefetch for stores + /// Compare and exchange class PseudoCmpXchg diff --git llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp index d1de0609f24c..d87ed068ebff 100644 --- llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp +++ llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp @@ -27,7 +27,7 @@ static MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym, MCContext &Ctx = AP.OutContext; LoongArchMCExpr::VariantKind Kind; - switch (MO.getTargetFlags()) { + switch (LoongArchII::getDirectFlags(MO)) { default: llvm_unreachable("Unknown target flag on GV operand"); case LoongArchII::MO_None: @@ -134,7 +134,7 @@ static MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym, ME, MCConstantExpr::create(MO.getOffset(), Ctx), Ctx); if (Kind != LoongArchMCExpr::VK_LoongArch_None) - ME = LoongArchMCExpr::create(ME, Kind, Ctx); + ME = LoongArchMCExpr::create(ME, Kind, Ctx, LoongArchII::hasRelaxFlag(MO)); return MCOperand::createExpr(ME); } diff --git llvm/lib/Target/LoongArch/LoongArchMergeBaseOffset.cpp llvm/lib/Target/LoongArch/LoongArchMergeBaseOffset.cpp index e9455fdd23ba..7f98f7718a53 100644 --- llvm/lib/Target/LoongArch/LoongArchMergeBaseOffset.cpp +++ llvm/lib/Target/LoongArch/LoongArchMergeBaseOffset.cpp @@ -105,7 +105,7 @@ bool LoongArchMergeBaseOffsetOpt::detectFoldable(MachineInstr &Hi20, return false; const MachineOperand &Hi20Op1 = Hi20.getOperand(1); - if (Hi20Op1.getTargetFlags() != LoongArchII::MO_PCREL_HI) + if (LoongArchII::getDirectFlags(Hi20Op1) != LoongArchII::MO_PCREL_HI) return false; auto isGlobalOrCPIOrBlockAddress = [](const MachineOperand &Op) { @@ -157,7 +157,7 @@ bool
LoongArchMergeBaseOffsetOpt::detectFoldable(MachineInstr &Hi20, const MachineOperand &Lo12Op2 = Lo12->getOperand(2); assert(Hi20.getOpcode() == LoongArch::PCALAU12I); - if (Lo12Op2.getTargetFlags() != LoongArchII::MO_PCREL_LO || + if (LoongArchII::getDirectFlags(Lo12Op2) != LoongArchII::MO_PCREL_LO || !(isGlobalOrCPIOrBlockAddress(Lo12Op2) || Lo12Op2.isMCSymbol()) || Lo12Op2.getOffset() != 0) return false; @@ -597,9 +597,28 @@ bool LoongArchMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi20, if (!isInt<32>(NewOffset)) return false; + // If this pass optimizes the code sequence successfully, the MO_RELAX + // bitmask target-flag should be removed from it. + // + // For example: + // pcalau12i $a0, %pc_hi20(symbol) + // addi.d $a0, $a0, %pc_lo12(symbol) + // ld.w $a0, $a0, 0 + // + // => + // + // pcalau12i $a0, %pc_hi20(symbol) + // ld.w $a0, $a0, %pc_lo12(symbol) + // + // The original code sequence can be relaxed by the linker, but the optimized + // sequence cannot be relaxed any more, so the MO_RELAX flag must not be + // carried by these operands. Hi20.getOperand(1).setOffset(NewOffset); + Hi20.getOperand(1).setTargetFlags( + LoongArchII::getDirectFlags(Hi20.getOperand(1))); MachineOperand &ImmOp = Lo12.getOperand(2); ImmOp.setOffset(NewOffset); + ImmOp.setTargetFlags(LoongArchII::getDirectFlags(ImmOp)); if (Lo20 && Hi12) { Lo20->getOperand(2).setOffset(NewOffset); Hi12->getOperand(2).setOffset(NewOffset); @@ -617,15 +636,16 @@ bool LoongArchMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi20, switch (ImmOp.getType()) { case MachineOperand::MO_GlobalAddress: MO.ChangeToGA(ImmOp.getGlobal(), ImmOp.getOffset(), - ImmOp.getTargetFlags()); + LoongArchII::getDirectFlags(ImmOp)); break; case MachineOperand::MO_MCSymbol: - MO.ChangeToMCSymbol(ImmOp.getMCSymbol(), ImmOp.getTargetFlags()); + MO.ChangeToMCSymbol(ImmOp.getMCSymbol(), + LoongArchII::getDirectFlags(ImmOp)); MO.setOffset(ImmOp.getOffset()); break; case MachineOperand::MO_BlockAddress: MO.ChangeToBA(ImmOp.getBlockAddress(), ImmOp.getOffset(), - ImmOp.getTargetFlags()); + LoongArchII::getDirectFlags(ImmOp)); break; default: report_fatal_error("unsupported machine operand type"); diff --git llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp index b611365f608a..62b08be5435c 100644 --- llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp +++ llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp @@ -38,6 +38,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLoongArchTarget() { initializeLoongArchMergeBaseOffsetOptPass(*PR); initializeLoongArchOptWInstrsPass(*PR); initializeLoongArchPreRAExpandPseudoPass(*PR); + initializeLoongArchExpandPseudoPass(*PR); initializeLoongArchDAGToDAGISelLegacyPass(*PR); } diff --git llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp index 5fbc7c734168..cbc9c3f3beca 100644 --- llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp +++ llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp @@ -89,4 +89,10 @@ LoongArchTTIImpl::getPopcntSupport(unsigned TyWidth) { return ST->hasExtLSX() ? TTI::PSK_FastHardware : TTI::PSK_Software; } +unsigned LoongArchTTIImpl::getCacheLineSize() const { return 64; } + +unsigned LoongArchTTIImpl::getPrefetchDistance() const { return 200; } + +bool LoongArchTTIImpl::enableWritePrefetching() const { return true; } + // TODO: Implement more hooks to provide TTI machinery for LoongArch.
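The three TTI overrides just above (64-byte cache line, prefetch distance 200, write prefetching enabled) are what allow LoopDataPrefetch to insert software prefetches on LoongArch, and the new ISD::PREFETCH legality plus the PRELD patterns in LoongArchInstrInfo.td select the resulting intrinsic. A minimal sketch of emitting such a prefetch through the IR API follows; it is a standalone illustration, not part of the patch, it assumes Intrinsic::getDeclaration with this signature (newer trees rename it getOrInsertDeclaration), and selection still requires a reg+simm12 address:

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Intrinsics.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;

    int main() {
      LLVMContext Ctx;
      Module M("preld-demo", Ctx);
      IRBuilder<> B(Ctx);

      FunctionType *FTy = FunctionType::get(B.getVoidTy(), {B.getPtrTy()}, false);
      Function *F = Function::Create(FTy, Function::ExternalLinkage, "touch", M);
      B.SetInsertPoint(BasicBlock::Create(Ctx, "entry", F));

      // llvm.prefetch(ptr, rw, locality, cachetype). rw=0 should match the
      // "(i32 0)" pattern and select "PRELD 0" (load hint); rw=1 would select
      // "PRELD 8" (store hint). cachetype=1 marks a data prefetch.
      Function *Pf =
          Intrinsic::getDeclaration(&M, Intrinsic::prefetch, {B.getPtrTy()});
      B.CreateCall(Pf, {F->getArg(0), B.getInt32(0), B.getInt32(3), B.getInt32(1)});
      B.CreateRetVoid();

      M.print(outs(), nullptr);
      return 0;
    }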
diff --git llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h index f7ce75173be2..b3edf131c584 100644 --- llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h +++ llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h @@ -47,6 +47,10 @@ public: const char *getRegisterClassName(unsigned ClassID) const; TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth); + unsigned getCacheLineSize() const override; + unsigned getPrefetchDistance() const override; + bool enableWritePrefetching() const override; + // TODO: Implement more hooks to provide TTI machinery for LoongArch. }; diff --git llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp index 0c24008301d0..eb4f6edc117a 100644 --- llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp +++ llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp @@ -251,6 +251,7 @@ bool LoongArchAsmBackend::shouldInsertFixupForCodeAlign(MCAssembler &Asm, bool LoongArchAsmBackend::shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target, + const uint64_t, const MCSubtargetInfo *STI) { if (Fixup.getKind() >= FirstLiteralRelocationKind) return true; diff --git llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h index 9df4ff22625c..adbfd01410a4 100644 --- llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h +++ llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h @@ -57,7 +57,7 @@ public: MCAlignFragment &AF) override; bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, - const MCValue &Target, + const MCValue &Target, const uint64_t Value, const MCSubtargetInfo *STI) override; unsigned getNumFixupKinds() const override { diff --git llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h index 23699043b992..833cd0626162 100644 --- llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h +++ llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h @@ -17,6 +17,7 @@ #include "MCTargetDesc/LoongArchMCTargetDesc.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/TargetParser/SubtargetFeature.h" @@ -58,8 +59,31 @@ enum { MO_LE_ADD_R, MO_LE_LO_R, // TODO: Add more flags. + + // Used to differentiate between target-specific "direct" flags and "bitmask" + // flags. A machine operand can only have one "direct" flag, but can have + // multiple "bitmask" flags. + MO_DIRECT_FLAG_MASK = 0x3f, + + MO_RELAX = 0x40 }; +// Given a MachineOperand that may carry "bitmask" flags, such as MO_RELAX, +// return the LoongArch target-specific "direct" flags. +static inline unsigned getDirectFlags(const MachineOperand &MO) { + return MO.getTargetFlags() & MO_DIRECT_FLAG_MASK; +} + +// Add the MO_RELAX "bitmask" flag when FeatureRelax is enabled. +static inline unsigned encodeFlags(unsigned Flags, bool Relax) { + return Flags | (Relax ? MO_RELAX : 0); +} + +// \returns true if the given MachineOperand has the MO_RELAX "bitmask" flag. +static inline bool hasRelaxFlag(const MachineOperand &MO) { + return MO.getTargetFlags() & MO_RELAX; +} + // Target-specific flags of LAInst. // All definitions must match LoongArchInstrFormats.td.
enum { diff --git llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp index 04d57f0fe745..02ec321857e5 100644 --- llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp +++ llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp @@ -249,6 +249,7 @@ LoongArchMCCodeEmitter::getExprOpValue(const MCInst &MI, const MCOperand &MO, break; case LoongArchMCExpr::VK_LoongArch_CALL36: FixupKind = LoongArch::fixup_loongarch_call36; + RelaxCandidate = true; break; case LoongArchMCExpr::VK_LoongArch_TLS_DESC_PC_HI20: FixupKind = LoongArch::fixup_loongarch_tls_desc_pc_hi20; diff --git llvm/lib/Target/M68k/M68kISelLowering.cpp llvm/lib/Target/M68k/M68kISelLowering.cpp index 4297325cf0e6..39b307b28889 100644 --- llvm/lib/Target/M68k/M68kISelLowering.cpp +++ llvm/lib/Target/M68k/M68kISelLowering.cpp @@ -1060,7 +1060,8 @@ SDValue M68kTargetLowering::LowerFormalArguments( bool M68kTargetLowering::CanLowerReturn( CallingConv::ID CCID, MachineFunction &MF, bool IsVarArg, - const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { + const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context, + const Type *RetTy) const { SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CCID, IsVarArg, MF, RVLocs, Context); return CCInfo.CheckReturn(Outs, RetCC_M68k); diff --git llvm/lib/Target/M68k/M68kISelLowering.h llvm/lib/Target/M68k/M68kISelLowering.h index d00907775f92..e01f333316db 100644 --- llvm/lib/Target/M68k/M68kISelLowering.h +++ llvm/lib/Target/M68k/M68kISelLowering.h @@ -271,7 +271,7 @@ private: bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, - LLVMContext &Context) const override; + LLVMContext &Context, const Type *RetTy) const override; /// Lower the result values of a call into the /// appropriate copies out of appropriate physical registers. 
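Restated in isolation, the flag scheme from LoongArchBaseInfo.h above is pure bit manipulation: the low six bits hold exactly one direct flag and bit 0x40 is the OR-able relax marker, which is also what the new decomposeMachineOperandsTargetFlags relies on. A self-contained toy of the round trip (MO_GOT_PC_HI's value here is illustrative, not the real enum value):

    #include <cassert>
    #include <utility>

    enum : unsigned { MO_DIRECT_FLAG_MASK = 0x3f, MO_RELAX = 0x40 };
    constexpr unsigned MO_GOT_PC_HI = 0x5; // illustrative direct-flag value

    // Mirrors LoongArchII::encodeFlags / getDirectFlags / hasRelaxFlag.
    constexpr unsigned encodeFlags(unsigned Flags, bool Relax) {
      return Flags | (Relax ? MO_RELAX : 0);
    }
    constexpr unsigned getDirectFlags(unsigned TF) {
      return TF & MO_DIRECT_FLAG_MASK;
    }
    constexpr bool hasRelaxFlag(unsigned TF) { return (TF & MO_RELAX) != 0; }

    // Mirrors the new decomposeMachineOperandsTargetFlags: one direct flag,
    // plus a separate bitmask part that MIR serialization prints by name.
    constexpr std::pair<unsigned, unsigned> decompose(unsigned TF) {
      return {TF & MO_DIRECT_FLAG_MASK, TF & ~MO_DIRECT_FLAG_MASK};
    }

    int main() {
      unsigned TF = encodeFlags(MO_GOT_PC_HI, /*Relax=*/true);
      assert(getDirectFlags(TF) == MO_GOT_PC_HI && hasRelaxFlag(TF));
      assert(decompose(TF).second == MO_RELAX);
      assert(!hasRelaxFlag(encodeFlags(MO_GOT_PC_HI, /*Relax=*/false)));
      return 0;
    }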
diff --git llvm/lib/Target/MSP430/MSP430ISelLowering.cpp llvm/lib/Target/MSP430/MSP430ISelLowering.cpp index 31b793e9c0f2..28d782543b33 100644 --- llvm/lib/Target/MSP430/MSP430ISelLowering.cpp +++ llvm/lib/Target/MSP430/MSP430ISelLowering.cpp @@ -723,7 +723,8 @@ MSP430TargetLowering::CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, - LLVMContext &Context) const { + LLVMContext &Context, + const Type *RetTy) const { SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context); return CCInfo.CheckReturn(Outs, RetCC_MSP430); diff --git llvm/lib/Target/MSP430/MSP430ISelLowering.h llvm/lib/Target/MSP430/MSP430ISelLowering.h index 667ad6033861..d1263e453dda 100644 --- llvm/lib/Target/MSP430/MSP430ISelLowering.h +++ llvm/lib/Target/MSP430/MSP430ISelLowering.h @@ -171,7 +171,7 @@ namespace llvm { MachineFunction &MF, bool IsVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, - LLVMContext &Context) const override; + LLVMContext &Context, const Type *RetTy) const override; SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, diff --git llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp index 6001d9d51d16..4af6768b13cc 100644 --- llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp +++ llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp @@ -542,6 +542,7 @@ bool MipsAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count, bool MipsAsmBackend::shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target, + const uint64_t, const MCSubtargetInfo *STI) { if (Fixup.getKind() >= FirstLiteralRelocationKind) return true; diff --git llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h index 799dd569f1ad..3a2c5e824a53 100644 --- llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h +++ llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h @@ -55,7 +55,7 @@ public: const MCSubtargetInfo *STI) const override; bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, - const MCValue &Target, + const MCValue &Target, const uint64_t Value, const MCSubtargetInfo *STI) override; bool isMicroMips(const MCSymbol *Sym) const override; diff --git llvm/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h llvm/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h index aa35e7db6bda..b9a2af334123 100644 --- llvm/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h +++ llvm/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h @@ -92,7 +92,12 @@ namespace MipsII { MO_CALL_LO16, /// Helper operand used to generate R_MIPS_JALR - MO_JALR + MO_JALR, + + /// MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the + /// reference is actually to the "__imp_FOO" symbol. This is used for + /// dllimport linkage on windows. 
+ MO_DLLIMPORT = 0x20, }; enum { diff --git llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp index 655898817582..add36d87b9ef 100644 --- llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp +++ llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp @@ -19,6 +19,7 @@ #include "MipsMCNaCl.h" #include "MipsTargetStreamer.h" #include "TargetInfo/MipsTargetInfo.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCELFStreamer.h" #include "llvm/MC/MCInstrAnalysis.h" @@ -44,6 +45,86 @@ using namespace llvm; #define GET_REGINFO_MC_DESC #include "MipsGenRegisterInfo.inc" +void MIPS_MC::initLLVMToCVRegMapping(MCRegisterInfo *MRI) { + // Mapping from CodeView to MC register id. + static const struct { + codeview::RegisterId CVReg; + MCPhysReg Reg; + } RegMap[] = { + {codeview::RegisterId::MIPS_ZERO, Mips::ZERO}, + {codeview::RegisterId::MIPS_AT, Mips::AT}, + {codeview::RegisterId::MIPS_V0, Mips::V0}, + {codeview::RegisterId::MIPS_V1, Mips::V1}, + {codeview::RegisterId::MIPS_A0, Mips::A0}, + {codeview::RegisterId::MIPS_A1, Mips::A1}, + {codeview::RegisterId::MIPS_A2, Mips::A2}, + {codeview::RegisterId::MIPS_A3, Mips::A3}, + {codeview::RegisterId::MIPS_T0, Mips::T0}, + {codeview::RegisterId::MIPS_T1, Mips::T1}, + {codeview::RegisterId::MIPS_T2, Mips::T2}, + {codeview::RegisterId::MIPS_T3, Mips::T3}, + {codeview::RegisterId::MIPS_T4, Mips::T4}, + {codeview::RegisterId::MIPS_T5, Mips::T5}, + {codeview::RegisterId::MIPS_T6, Mips::T6}, + {codeview::RegisterId::MIPS_T7, Mips::T7}, + {codeview::RegisterId::MIPS_S0, Mips::S0}, + {codeview::RegisterId::MIPS_S1, Mips::S1}, + {codeview::RegisterId::MIPS_S2, Mips::S2}, + {codeview::RegisterId::MIPS_S3, Mips::S3}, + {codeview::RegisterId::MIPS_S4, Mips::S4}, + {codeview::RegisterId::MIPS_S5, Mips::S5}, + {codeview::RegisterId::MIPS_S6, Mips::S6}, + {codeview::RegisterId::MIPS_S7, Mips::S7}, + {codeview::RegisterId::MIPS_T8, Mips::T8}, + {codeview::RegisterId::MIPS_T9, Mips::T9}, + {codeview::RegisterId::MIPS_K0, Mips::K0}, + {codeview::RegisterId::MIPS_K1, Mips::K1}, + {codeview::RegisterId::MIPS_GP, Mips::GP}, + {codeview::RegisterId::MIPS_SP, Mips::SP}, + {codeview::RegisterId::MIPS_S8, Mips::FP}, + {codeview::RegisterId::MIPS_RA, Mips::RA}, + {codeview::RegisterId::MIPS_LO, Mips::HI0}, + {codeview::RegisterId::MIPS_HI, Mips::LO0}, + {codeview::RegisterId::MIPS_Fir, Mips::FCR0}, + {codeview::RegisterId::MIPS_Psr, Mips::COP012}, // CP0.Status + {codeview::RegisterId::MIPS_F0, Mips::F0}, + {codeview::RegisterId::MIPS_F1, Mips::F1}, + {codeview::RegisterId::MIPS_F2, Mips::F2}, + {codeview::RegisterId::MIPS_F3, Mips::F3}, + {codeview::RegisterId::MIPS_F4, Mips::F4}, + {codeview::RegisterId::MIPS_F5, Mips::F5}, + {codeview::RegisterId::MIPS_F6, Mips::F6}, + {codeview::RegisterId::MIPS_F7, Mips::F7}, + {codeview::RegisterId::MIPS_F8, Mips::F8}, + {codeview::RegisterId::MIPS_F9, Mips::F9}, + {codeview::RegisterId::MIPS_F10, Mips::F10}, + {codeview::RegisterId::MIPS_F11, Mips::F11}, + {codeview::RegisterId::MIPS_F12, Mips::F12}, + {codeview::RegisterId::MIPS_F13, Mips::F13}, + {codeview::RegisterId::MIPS_F14, Mips::F14}, + {codeview::RegisterId::MIPS_F15, Mips::F15}, + {codeview::RegisterId::MIPS_F16, Mips::F16}, + {codeview::RegisterId::MIPS_F17, Mips::F17}, + {codeview::RegisterId::MIPS_F18, Mips::F18}, + {codeview::RegisterId::MIPS_F19, Mips::F19}, + {codeview::RegisterId::MIPS_F20, Mips::F20}, + {codeview::RegisterId::MIPS_F21, Mips::F21}, + 
{codeview::RegisterId::MIPS_F22, Mips::F22}, + {codeview::RegisterId::MIPS_F23, Mips::F23}, + {codeview::RegisterId::MIPS_F24, Mips::F24}, + {codeview::RegisterId::MIPS_F25, Mips::F25}, + {codeview::RegisterId::MIPS_F26, Mips::F26}, + {codeview::RegisterId::MIPS_F27, Mips::F27}, + {codeview::RegisterId::MIPS_F28, Mips::F28}, + {codeview::RegisterId::MIPS_F29, Mips::F29}, + {codeview::RegisterId::MIPS_F30, Mips::F30}, + {codeview::RegisterId::MIPS_F31, Mips::F31}, + {codeview::RegisterId::MIPS_Fsr, Mips::FCR31}, + }; + for (const auto &I : RegMap) + MRI->mapLLVMRegToCVReg(I.Reg, static_cast<int>(I.CVReg)); +} + namespace { class MipsWinCOFFTargetStreamer : public MipsTargetStreamer { public: diff --git llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h index c5293b03b0ac..f3e3e6e8d107 100644 --- llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h +++ llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h @@ -57,6 +57,8 @@ createMipsELFObjectWriter(const Triple &TT, bool IsN32); std::unique_ptr<MCObjectTargetWriter> createMipsWinCOFFObjectWriter(); namespace MIPS_MC { +void initLLVMToCVRegMapping(MCRegisterInfo *MRI); + StringRef selectMipsCPU(const Triple &TT, StringRef CPU); } diff --git llvm/lib/Target/Mips/MCTargetDesc/MipsWinCOFFObjectWriter.cpp llvm/lib/Target/Mips/MCTargetDesc/MipsWinCOFFObjectWriter.cpp index 94187c71ba70..cbe74f87a221 100644 --- llvm/lib/Target/Mips/MCTargetDesc/MipsWinCOFFObjectWriter.cpp +++ llvm/lib/Target/Mips/MCTargetDesc/MipsWinCOFFObjectWriter.cpp @@ -40,6 +40,10 @@ unsigned MipsWinCOFFObjectWriter::getRelocType(MCContext &Ctx, switch (FixupKind) { case FK_Data_4: return COFF::IMAGE_REL_MIPS_REFWORD; + case FK_SecRel_2: + return COFF::IMAGE_REL_MIPS_SECTION; + case FK_SecRel_4: + return COFF::IMAGE_REL_MIPS_SECREL; case Mips::fixup_Mips_26: return COFF::IMAGE_REL_MIPS_JMPADDR; case Mips::fixup_Mips_HI16: diff --git llvm/lib/Target/Mips/MipsCCState.cpp llvm/lib/Target/Mips/MipsCCState.cpp index 76acfa97c3b4..781bb7c8c7e6 100644 --- llvm/lib/Target/Mips/MipsCCState.cpp +++ llvm/lib/Target/Mips/MipsCCState.cpp @@ -95,14 +95,13 @@ void MipsCCState::PreAnalyzeCallResultForF128( /// Identify lowered values that originated from f128 or float arguments and /// record this for use by RetCC_MipsN. -void MipsCCState::PreAnalyzeReturnForF128( - const SmallVectorImpl<ISD::OutputArg> &Outs) { - const MachineFunction &MF = getMachineFunction(); +void MipsCCState::PreAnalyzeCallReturnForF128( + const SmallVectorImpl<ISD::OutputArg> &Outs, const Type *RetTy) { for (unsigned i = 0; i < Outs.size(); ++i) { OriginalArgWasF128.push_back( - originalTypeIsF128(MF.getFunction().getReturnType(), nullptr)); + originalTypeIsF128(RetTy, nullptr)); OriginalArgWasFloat.push_back( - MF.getFunction().getReturnType()->isFloatingPointTy()); + RetTy->isFloatingPointTy()); } } diff --git llvm/lib/Target/Mips/MipsCCState.h llvm/lib/Target/Mips/MipsCCState.h index bbb5225d5f67..4229da564630 100644 --- llvm/lib/Target/Mips/MipsCCState.h +++ llvm/lib/Target/Mips/MipsCCState.h @@ -49,7 +49,7 @@ private: /// Identify lowered values that originated from f128 arguments and record /// this for use by RetCC_MipsN. - void PreAnalyzeReturnForF128(const SmallVectorImpl<ISD::OutputArg> &Outs); + void PreAnalyzeCallReturnForF128(const SmallVectorImpl<ISD::OutputArg> &Outs, const Type *RetTy); /// Identify lowered values that originated from f128 arguments and record /// this. 
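The motivation for threading RetTy through (and for the PreAnalyzeReturnForF128 to PreAnalyzeCallReturnForF128 rename) is that the callee's return type is not always recoverable from the MachineFunction: for a libcall created during lowering, MF.getFunction().getReturnType() describes the caller, not the call. A small illustration in plain C++, assuming a MIPS64 N64 target where long double is an IEEE binary128:

    // caller() returns void, yet lowering the addition emits a call to the
    // compiler-rt routine __addtf3, whose result is f128. Classifying that
    // result for RetCC_MipsN must use the call's own return type (RetTy);
    // consulting the enclosing function's return type would find nothing
    // f128-related here.
    void caller(long double *out, long double a, long double b) {
      *out = a + b; // lowers to a call of __addtf3 returning f128
    }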
@@ -167,10 +167,11 @@ public: void PreAnalyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, CCAssignFn Fn) { + const MachineFunction &MF = getMachineFunction(); OriginalArgWasFloat.clear(); OriginalArgWasF128.clear(); OriginalArgWasFloatVector.clear(); - PreAnalyzeReturnForF128(Outs); + PreAnalyzeCallReturnForF128(Outs, MF.getFunction().getReturnType()); PreAnalyzeReturnForVectorFloat(Outs); } @@ -182,7 +183,8 @@ public: bool CheckReturn(const SmallVectorImpl<ISD::OutputArg> &ArgsFlags, CCAssignFn Fn) { - PreAnalyzeReturnForF128(ArgsFlags); + const MachineFunction &MF = getMachineFunction(); + PreAnalyzeCallReturnForF128(ArgsFlags, MF.getFunction().getReturnType()); PreAnalyzeReturnForVectorFloat(ArgsFlags); bool Return = CCState::CheckReturn(ArgsFlags, Fn); OriginalArgWasFloat.clear(); @@ -191,6 +193,16 @@ public: return Return; } + bool CheckCallReturn(const SmallVectorImpl<ISD::OutputArg> &ArgsFlags, + CCAssignFn Fn, const Type *RetTy) { + PreAnalyzeCallReturnForF128(ArgsFlags, RetTy); + PreAnalyzeReturnForVectorFloat(ArgsFlags); + bool Return = CCState::CheckReturn(ArgsFlags, Fn); + OriginalArgWasFloat.clear(); + OriginalArgWasF128.clear(); + OriginalArgWasFloatVector.clear(); + return Return; + } bool WasOriginalArgF128(unsigned ValNo) { return OriginalArgWasF128[ValNo]; } bool WasOriginalArgFloat(unsigned ValNo) { return OriginalArgWasFloat[ValNo]; diff --git llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp index 3a9421fae0f6..258010d33118 100644 --- llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp +++ llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp @@ -402,7 +402,7 @@ void RegDefsUses::addLiveOut(const MachineBasicBlock &MBB, for (const MachineBasicBlock *S : MBB.successors()) if (S != &SuccBB) for (const auto &LI : S->liveins()) - Uses.set(LI.PhysReg); + Uses.set(LI.PhysReg.id()); } bool RegDefsUses::update(const MachineInstr &MI, unsigned Begin, unsigned End) { diff --git llvm/lib/Target/Mips/MipsISelLowering.cpp llvm/lib/Target/Mips/MipsISelLowering.cpp index d5f38c414e70..7c4257c222c0 100644 --- llvm/lib/Target/Mips/MipsISelLowering.cpp +++ llvm/lib/Target/Mips/MipsISelLowering.cpp @@ -2146,6 +2146,14 @@ SDValue MipsTargetLowering::lowerGlobalAddress(SDValue Op, GlobalAddressSDNode *N = cast<GlobalAddressSDNode>(Op); const GlobalValue *GV = N->getGlobal(); + if (GV->hasDLLImportStorageClass()) { + assert(Subtarget.isTargetWindows() && + "Windows is the only supported COFF target"); + return getDllimportVariable( + N, SDLoc(N), Ty, DAG, DAG.getEntryNode(), + MachinePointerInfo::getGOT(DAG.getMachineFunction())); + } + if (!isPositionIndependent()) { const MipsTargetObjectFile *TLOF = static_cast<const MipsTargetObjectFile *>( @@ -3501,7 +3509,14 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { - if (IsPIC) { + if (Subtarget.isTargetCOFF() && + G->getGlobal()->hasDLLImportStorageClass()) { + assert(Subtarget.isTargetWindows() && + "Windows is the only supported COFF target"); + auto PtrInfo = MachinePointerInfo(); + Callee = DAG.getLoad(Ty, DL, Chain, + getDllimportSymbol(G, SDLoc(G), Ty, DAG), PtrInfo); + } else if (IsPIC) { const GlobalValue *Val = G->getGlobal(); InternalLinkage = Val->hasInternalLinkage(); @@ -3864,10 +3879,10 @@ bool MipsTargetLowering::CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, - LLVMContext &Context) const { + LLVMContext &Context, const Type 
*RetTy) const { SmallVector<CCValAssign, 16> RVLocs; MipsCCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context); - return CCInfo.CheckReturn(Outs, RetCC_Mips); + return CCInfo.CheckCallReturn(Outs, RetCC_Mips, RetTy); } bool MipsTargetLowering::shouldSignExtendTypeInLibCall(Type *Ty, diff --git llvm/lib/Target/Mips/MipsISelLowering.h llvm/lib/Target/Mips/MipsISelLowering.h index 655a347679ad..ee1ab6a17a91 100644 --- llvm/lib/Target/Mips/MipsISelLowering.h +++ llvm/lib/Target/Mips/MipsISelLowering.h @@ -489,6 +489,33 @@ class TargetRegisterClass; DAG.getNode(MipsISD::GPRel, DL, DAG.getVTList(Ty), GPRel)); } + // This method creates the following nodes, which are necessary for + // loading a dllimported symbol: + // + // (lw (add (shl(%high(sym), 16), %low(sym)))) + template <class NodeTy> + SDValue getDllimportSymbol(NodeTy *N, const SDLoc &DL, EVT Ty, + SelectionDAG &DAG) const { + SDValue Hi = + getTargetNode(N, Ty, DAG, MipsII::MO_ABS_HI | MipsII::MO_DLLIMPORT); + SDValue Lo = + getTargetNode(N, Ty, DAG, MipsII::MO_ABS_LO | MipsII::MO_DLLIMPORT); + return DAG.getNode(ISD::ADD, DL, Ty, DAG.getNode(MipsISD::Lo, DL, Ty, Lo), + DAG.getNode(MipsISD::Hi, DL, Ty, Hi)); + } + + // This method creates the following nodes, which are necessary for + // loading a dllimported global variable: + // + // (lw (lw (add (shl(%high(sym), 16), %low(sym))))) + template <class NodeTy> + SDValue getDllimportVariable(NodeTy *N, const SDLoc &DL, EVT Ty, + SelectionDAG &DAG, SDValue Chain, + const MachinePointerInfo &PtrInfo) const { + return DAG.getLoad(Ty, DL, Chain, getDllimportSymbol(N, DL, Ty, DAG), + PtrInfo); + } + /// This function fills Ops, which is the list of operands that will later /// be used when a function call node is created. It also generates /// copyToReg nodes to set up argument registers.
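Seen from the source level, the two helpers above, together with the LowerCall change in MipsISelLowering.cpp, are what make dllimport references work on the COFF Mips target this series builds up. A sketch of code exercising both paths, assuming an MSVC-compatible frontend:

    // Data: getDllimportVariable loads the __imp_ slot, then loads the value.
    __declspec(dllimport) extern int shared_counter;
    // Code: LowerCall loads the __imp_ slot and calls through the result.
    __declspec(dllimport) int shared_increment(int);

    int bump() { return shared_increment(shared_counter); }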
@@ -615,7 +642,7 @@ class TargetRegisterClass; bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, - LLVMContext &Context) const override; + LLVMContext &Context, const Type *RetTy) const override; SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, diff --git llvm/lib/Target/Mips/MipsMCInstLower.cpp llvm/lib/Target/Mips/MipsMCInstLower.cpp index b0642f3d1ff2..e01d0d1e65cf 100644 --- llvm/lib/Target/Mips/MipsMCInstLower.cpp +++ llvm/lib/Target/Mips/MipsMCInstLower.cpp @@ -18,6 +18,7 @@ #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineOperand.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/Support/ErrorHandling.h" @@ -38,8 +39,16 @@ MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO, MipsMCExpr::MipsExprKind TargetKind = MipsMCExpr::MEK_None; bool IsGpOff = false; const MCSymbol *Symbol; + SmallString<128> Name; + unsigned TargetFlags = MO.getTargetFlags(); - switch(MO.getTargetFlags()) { + if (TargetFlags & MipsII::MO_DLLIMPORT) { + // Handle dllimport linkage + Name += "__imp_"; + TargetFlags &= ~MipsII::MO_DLLIMPORT; + } + + switch (TargetFlags) { default: llvm_unreachable("Invalid target flag!"); case MipsII::MO_NO_FLAG: @@ -125,7 +134,8 @@ MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO, break; case MachineOperand::MO_GlobalAddress: - Symbol = AsmPrinter.getSymbol(MO.getGlobal()); + AsmPrinter.getNameWithPrefix(Name, MO.getGlobal()); + Symbol = Ctx->getOrCreateSymbol(Name); Offset += MO.getOffset(); break; diff --git llvm/lib/Target/Mips/MipsRegisterInfo.cpp llvm/lib/Target/Mips/MipsRegisterInfo.cpp index c9eff0e0285f..4bfc35420b40 100644 --- llvm/lib/Target/Mips/MipsRegisterInfo.cpp +++ llvm/lib/Target/Mips/MipsRegisterInfo.cpp @@ -37,7 +37,9 @@ using namespace llvm; #define GET_REGINFO_TARGET_DESC #include "MipsGenRegisterInfo.inc" -MipsRegisterInfo::MipsRegisterInfo() : MipsGenRegisterInfo(Mips::RA) {} +MipsRegisterInfo::MipsRegisterInfo() : MipsGenRegisterInfo(Mips::RA) { + MIPS_MC::initLLVMToCVRegMapping(this); +} unsigned MipsRegisterInfo::getPICCallReg() { return Mips::T9; } diff --git llvm/lib/Target/Mips/MipsSEISelLowering.cpp llvm/lib/Target/Mips/MipsSEISelLowering.cpp index 818b1683bb86..1d1b0f9c6ae2 100644 --- llvm/lib/Target/Mips/MipsSEISelLowering.cpp +++ llvm/lib/Target/Mips/MipsSEISelLowering.cpp @@ -2736,7 +2736,7 @@ static SDValue lowerVECTOR_SHUFFLE_ILVOD(SDValue Op, EVT ResTy, else return SDValue(); - return DAG.getNode(MipsISD::ILVOD, SDLoc(Op), ResTy, Wt, Ws); + return DAG.getNode(MipsISD::ILVOD, SDLoc(Op), ResTy, Ws, Wt); } // Lower VECTOR_SHUFFLE into ILVR (if possible). 
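Stepping back to the MipsMCInstLower.cpp hunk above: dllimport references are redirected to the import-address-table entry by prepending __imp_ to the symbol name and clearing MO_DLLIMPORT before the relocation-kind switch runs. A rough standalone model of that flag-peeling pattern follows; the flag values here are invented placeholders, not the real MipsII constants:

#include <iostream>
#include <string>

constexpr unsigned MO_ABS_HI = 0x4;     // placeholder value
constexpr unsigned MO_DLLIMPORT = 0x80; // placeholder value

// Peel off the dllimport bit, rename to the IAT slot, and keep the remaining
// bits for the relocation switch -- the same shape as LowerSymbolOperand.
std::string lowerSymbolName(std::string Name, unsigned &TargetFlags) {
  if (TargetFlags & MO_DLLIMPORT) {
    Name.insert(0, "__imp_");
    TargetFlags &= ~MO_DLLIMPORT;
  }
  return Name;
}

int main() {
  unsigned Flags = MO_ABS_HI | MO_DLLIMPORT;
  std::cout << lowerSymbolName("ExitProcess", Flags) << "\n"; // __imp_ExitProcess
  std::cout << std::hex << Flags << "\n";                     // 4
}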
diff --git llvm/lib/Target/Mips/MipsSubtarget.h llvm/lib/Target/Mips/MipsSubtarget.h index c048ab29d5f9..85cf45d4702a 100644 --- llvm/lib/Target/Mips/MipsSubtarget.h +++ llvm/lib/Target/Mips/MipsSubtarget.h @@ -301,6 +301,7 @@ public: return (HasSym32 && isABI_N64()) || isABI_N32() || isABI_O32(); } bool isSingleFloat() const { return IsSingleFloat; } + bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); } bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } bool hasVFPU() const { return HasVFPU; } bool inMips16Mode() const { return InMips16Mode; } @@ -356,6 +357,7 @@ public: bool os16() const { return Os16; } bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); } + bool isTargetWindows() const { return TargetTriple.isOSWindows(); } bool isXRaySupported() const override { return true; } diff --git llvm/lib/Target/NVPTX/NVPTX.td llvm/lib/Target/NVPTX/NVPTX.td index 9af8715ef52a..3ca8b4d29407 100644 --- llvm/lib/Target/NVPTX/NVPTX.td +++ llvm/lib/Target/NVPTX/NVPTX.td @@ -39,6 +39,7 @@ foreach sm = [20, 21, 30, 32, 35, 37, 50, 52, 53, def SM#sm: FeatureSM<""#sm, !mul(sm, 10)>; def SM90a: FeatureSM<"90a", 901>; +def SM100a: FeatureSM<"100a", 1001>; foreach version = [32, 40, 41, 42, 43, 50, 60, 61, 62, 63, 64, 65, 70, 71, 72, 73, 74, 75, 76, 77, 78, @@ -74,6 +75,7 @@ def : Proc<"sm_89", [SM89, PTX78]>; def : Proc<"sm_90", [SM90, PTX78]>; def : Proc<"sm_90a", [SM90a, PTX80]>; def : Proc<"sm_100", [SM100, PTX86]>; +def : Proc<"sm_100a", [SM100a, PTX86]>; def NVPTXInstrInfo : InstrInfo { } diff --git llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp index 76c6c8fb38d6..75fcf6829c50 100644 --- llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp +++ llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp @@ -154,8 +154,7 @@ Value *GenericToNVVM::remapConstant(Module *M, Function *F, Constant *C, if (I != GVMap.end()) { GlobalVariable *GV = I->second; NewValue = Builder.CreateAddrSpaceCast( - GV, - PointerType::get(GV->getValueType(), llvm::ADDRESS_SPACE_GENERIC)); + GV, PointerType::get(GV->getContext(), llvm::ADDRESS_SPACE_GENERIC)); } } else if (isa<ConstantAggregate>(C)) { // If any element in the constant vector or aggregate C is or uses a global diff --git llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index c40c09c204fd..ed493d50712a 100644 --- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -2804,8 +2804,8 @@ SDValue NVPTXTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { Tmp1 = DAG.getStore(VAListLoad.getValue(1), DL, Tmp1, Tmp2, MachinePointerInfo(V)); - const Value *SrcV = - Constant::getNullValue(PointerType::get(Ty, ADDRESS_SPACE_LOCAL)); + const Value *SrcV = Constant::getNullValue( + PointerType::get(*DAG.getContext(), ADDRESS_SPACE_LOCAL)); // Load the actual argument out of the pointer VAList return DAG.getLoad(VT, DL, Tmp1, VAList, MachinePointerInfo(SrcV)); @@ -3194,8 +3194,8 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( SDValue VecAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, DAG.getConstant(Offsets[VecIdx], dl, PtrVT)); - Value *srcValue = Constant::getNullValue(PointerType::get( - EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM)); + Value *srcValue = Constant::getNullValue( + PointerType::get(F->getContext(), ADDRESS_SPACE_PARAM)); const MaybeAlign PartAlign = [&]() -> MaybeAlign { if (aggregateIsPacked) diff --git llvm/lib/Target/NVPTX/NVPTXLowerAlloca.cpp 
llvm/lib/Target/NVPTX/NVPTXLowerAlloca.cpp index be144c5ab7ba..a7544ce2df1a 100644 --- llvm/lib/Target/NVPTX/NVPTXLowerAlloca.cpp +++ llvm/lib/Target/NVPTX/NVPTXLowerAlloca.cpp @@ -90,13 +90,15 @@ bool NVPTXLowerAlloca::runOnFunction(Function &F) { // addrspacecast to ADDRESS_SPACE_GENERIC. if (AllocAddrSpace == ADDRESS_SPACE_GENERIC) { auto ASCastToLocalAS = new AddrSpaceCastInst( - allocaInst, PointerType::get(ETy, ADDRESS_SPACE_LOCAL), ""); + allocaInst, + PointerType::get(ETy->getContext(), ADDRESS_SPACE_LOCAL), ""); ASCastToLocalAS->insertAfter(allocaInst); AllocaInLocalAS = ASCastToLocalAS; } auto AllocaInGenericAS = new AddrSpaceCastInst( - AllocaInLocalAS, PointerType::get(ETy, ADDRESS_SPACE_GENERIC), ""); + AllocaInLocalAS, + PointerType::get(ETy->getContext(), ADDRESS_SPACE_GENERIC), ""); AllocaInGenericAS->insertAfter(AllocaInLocalAS); for (Use &AllocaUse : llvm::make_early_inc_range(allocaInst->uses())) { diff --git llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp index ceb9d852d8ec..c763b54c8dbf 100644 --- llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp +++ llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp @@ -594,8 +594,8 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM, UsesToUpdate.push_back(&U); Value *ArgInParamAS = new AddrSpaceCastInst( - Arg, PointerType::get(StructType, ADDRESS_SPACE_PARAM), Arg->getName(), - FirstInst); + Arg, PointerType::get(StructType->getContext(), ADDRESS_SPACE_PARAM), + Arg->getName(), FirstInst); for (Use *U : UsesToUpdate) convertToParamAS(U, ArgInParamAS, HasCvtaParam, IsGridConstant); LLVM_DEBUG(dbgs() << "No need to copy or cast " << *Arg << "\n"); diff --git llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp index 91c565ea00fb..d1b136429d3a 100644 --- llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp +++ llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp @@ -171,7 +171,7 @@ void NVPTXRegisterInfo::addToDebugRegisterMap( } int64_t NVPTXRegisterInfo::getDwarfRegNum(MCRegister RegNum, bool isEH) const { - if (Register::isPhysicalRegister(RegNum)) { + if (RegNum.isPhysical()) { std::string name = NVPTXInstPrinter::getRegisterName(RegNum.id()); // In NVPTXFrameLowering.cpp, we do arrange for %Depot to be accessible from // %SP. 
Using the %Depot register doesn't provide any debug info in diff --git llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp index ef36af1a5e66..9077aa7de5a3 100644 --- llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp +++ llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp @@ -161,7 +161,7 @@ public: } bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, - const MCValue &Target, + const MCValue &Target, const uint64_t, const MCSubtargetInfo *STI) override { MCFixupKind Kind = Fixup.getKind(); switch ((unsigned)Kind) { diff --git llvm/lib/Target/PowerPC/PPCISelLowering.cpp llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 691107abf3e8..4ca328bd9a9b 100644 --- llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -7868,7 +7868,8 @@ bool PPCTargetLowering::CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, - LLVMContext &Context) const { + LLVMContext &Context, + const Type *RetTy) const { SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); return CCInfo.CheckReturn( diff --git llvm/lib/Target/PowerPC/PPCISelLowering.h llvm/lib/Target/PowerPC/PPCISelLowering.h index 5d692e3fcae9..cc01cab7a208 100644 --- llvm/lib/Target/PowerPC/PPCISelLowering.h +++ llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -1373,7 +1373,7 @@ namespace llvm { bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, - LLVMContext &Context) const override; + LLVMContext &Context, const Type *RetTy) const override; SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, diff --git llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp index 019d4cfa33fb..b60a91be8240 100644 --- llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -531,7 +531,7 @@ bool PPCRegisterInfo::requiresVirtualBaseRegisters( bool PPCRegisterInfo::isCallerPreservedPhysReg(MCRegister PhysReg, const MachineFunction &MF) const { - assert(Register::isPhysicalRegister(PhysReg)); + assert(PhysReg.isPhysical()); const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); const MachineFrameInfo &MFI = MF.getFrameInfo(); diff --git llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp index 0cb1ef0a66b6..8facf62cfa12 100644 --- llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp +++ llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp @@ -115,6 +115,7 @@ RISCVAsmBackend::getFixupKindInfo(MCFixupKind Kind) const { bool RISCVAsmBackend::shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target, + const uint64_t, const MCSubtargetInfo *STI) { if (Fixup.getKind() >= FirstLiteralRelocationKind) return true; @@ -570,7 +571,7 @@ bool RISCVAsmBackend::evaluateTargetFixup(const MCAssembler &Asm, Value = Asm.getSymbolOffset(SA) + AUIPCTarget.getConstant(); Value -= Asm.getFragmentOffset(*AUIPCDF) + AUIPCFixup->getOffset(); - if (shouldForceRelocation(Asm, *AUIPCFixup, AUIPCTarget, STI)) { + if (shouldForceRelocation(Asm, *AUIPCFixup, AUIPCTarget, Value, STI)) { WasForced = true; return false; } diff --git llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h index 
886e7efe76bc..275f5bb94232 100644 --- llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h +++ llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h @@ -65,7 +65,7 @@ public: createObjectTargetWriter() const override; bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, - const MCValue &Target, + const MCValue &Target, const uint64_t Value, const MCSubtargetInfo *STI) override; bool fixupNeedsRelaxationAdvanced(const MCAssembler &Asm, diff --git llvm/lib/Target/RISCV/RISCVISelLowering.cpp llvm/lib/Target/RISCV/RISCVISelLowering.cpp index de100c683a94..d1a5a7602914 100644 --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1524,13 +1524,17 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setTargetDAGCombine({ISD::ZERO_EXTEND, ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT}); if (Subtarget.hasVInstructions()) - setTargetDAGCombine({ISD::FCOPYSIGN, ISD::MGATHER, ISD::MSCATTER, - ISD::VP_GATHER, ISD::VP_SCATTER, ISD::SRA, ISD::SRL, - ISD::SHL, ISD::STORE, ISD::SPLAT_VECTOR, - ISD::BUILD_VECTOR, ISD::CONCAT_VECTORS, - ISD::EXPERIMENTAL_VP_REVERSE, ISD::MUL, - ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM, - ISD::INSERT_VECTOR_ELT, ISD::ABS, ISD::CTPOP, + setTargetDAGCombine({ISD::FCOPYSIGN, ISD::MGATHER, + ISD::MSCATTER, ISD::VP_GATHER, + ISD::VP_SCATTER, ISD::SRA, + ISD::SRL, ISD::SHL, + ISD::STORE, ISD::SPLAT_VECTOR, + ISD::BUILD_VECTOR, ISD::CONCAT_VECTORS, + ISD::VP_STORE, ISD::EXPERIMENTAL_VP_REVERSE, + ISD::MUL, ISD::SDIV, + ISD::UDIV, ISD::SREM, + ISD::UREM, ISD::INSERT_VECTOR_ELT, + ISD::ABS, ISD::CTPOP, ISD::VECTOR_SHUFFLE}); if (Subtarget.hasVendorXTHeadMemPair()) setTargetDAGCombine({ISD::LOAD, ISD::STORE}); @@ -5729,14 +5733,14 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, } bool RISCVTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const { - // Support splats for any type. These should type legalize well. - if (ShuffleVectorSDNode::isSplatMask(M.data(), VT)) - return true; - // Only support legal VTs for other shuffles for now. if (!isTypeLegal(VT)) return false; + // Support splats for any type. These should type legalize well. + if (ShuffleVectorSDNode::isSplatMask(M.data(), VT)) + return true; + MVT SVT = VT.getSimpleVT(); // Not for i1 vectors. @@ -16294,6 +16298,65 @@ static SDValue performVP_REVERSECombine(SDNode *N, SelectionDAG &DAG, return Ret; } +static SDValue performVP_STORECombine(SDNode *N, SelectionDAG &DAG, + const RISCVSubtarget &Subtarget) { + // Fold: + // vp.store(vp.reverse(VAL), ADDR, MASK) -> vp.strided.store(VAL, NEW_ADDR, + // -1, MASK) + auto *VPStore = cast<VPStoreSDNode>(N); + + if (VPStore->getValue().getOpcode() != ISD::EXPERIMENTAL_VP_REVERSE) + return SDValue(); + + SDValue VPReverse = VPStore->getValue(); + EVT ReverseVT = VPReverse->getValueType(0); + + // We do not have a strided_store version for masks, and the EVL of vp.reverse + // and vp.store should always be the same. + if (!ReverseVT.getVectorElementType().isByteSized() || + VPStore->getVectorLength() != VPReverse.getOperand(2) || + !VPReverse.hasOneUse()) + return SDValue(); + + SDValue StoreMask = VPStore->getMask(); + // If Mask is all ones, then the store is unmasked and can be reversed. + if (!isOneOrOneSplat(StoreMask)) { + // If the mask is not all ones, we can reverse the store if the mask was + // also reversed by an unmasked vp.reverse with the same EVL.
+ if (StoreMask.getOpcode() != ISD::EXPERIMENTAL_VP_REVERSE || + !isOneOrOneSplat(StoreMask.getOperand(1)) || + StoreMask.getOperand(2) != VPStore->getVectorLength()) + return SDValue(); + StoreMask = StoreMask.getOperand(0); + } + + // Base = StoreAddr + (NumElem - 1) * ElemWidthByte + SDLoc DL(N); + MVT XLenVT = Subtarget.getXLenVT(); + SDValue NumElem = VPStore->getVectorLength(); + uint64_t ElemWidthByte = VPReverse.getValueType().getScalarSizeInBits() / 8; + + SDValue Temp1 = DAG.getNode(ISD::SUB, DL, XLenVT, NumElem, + DAG.getConstant(1, DL, XLenVT)); + SDValue Temp2 = DAG.getNode(ISD::MUL, DL, XLenVT, Temp1, + DAG.getConstant(ElemWidthByte, DL, XLenVT)); + SDValue Base = + DAG.getNode(ISD::ADD, DL, XLenVT, VPStore->getBasePtr(), Temp2); + SDValue Stride = DAG.getConstant(-ElemWidthByte, DL, XLenVT); + + MachineFunction &MF = DAG.getMachineFunction(); + MachinePointerInfo PtrInfo(VPStore->getAddressSpace()); + MachineMemOperand *MMO = MF.getMachineMemOperand( + PtrInfo, VPStore->getMemOperand()->getFlags(), + LocationSize::beforeOrAfterPointer(), VPStore->getAlign()); + + return DAG.getStridedStoreVP( + VPStore->getChain(), DL, VPReverse.getOperand(0), Base, + VPStore->getOffset(), Stride, StoreMask, VPStore->getVectorLength(), + VPStore->getMemoryVT(), MMO, VPStore->getAddressingMode(), + VPStore->isTruncatingStore(), VPStore->isCompressingStore()); +} + // Convert from one FMA opcode to another based on whether we are negating the // multiply result and/or the accumulator. // NOTE: Only supports RVV operations with VL. @@ -18474,6 +18537,8 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, } case ISD::EXPERIMENTAL_VP_REVERSE: return performVP_REVERSECombine(N, DAG, Subtarget); + case ISD::VP_STORE: + return performVP_STORECombine(N, DAG, Subtarget); case ISD::BITCAST: { assert(Subtarget.useRVVForFixedLengthVectors()); SDValue N0 = N->getOperand(0); @@ -20544,7 +20609,8 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, bool RISCVTargetLowering::CanLowerReturn( CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg, - const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { + const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context, + const Type *RetTy) const { SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context); diff --git llvm/lib/Target/RISCV/RISCVISelLowering.h llvm/lib/Target/RISCV/RISCVISelLowering.h index 892c1cd96ca6..21747cc35320 100644 --- llvm/lib/Target/RISCV/RISCVISelLowering.h +++ llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -766,7 +766,7 @@ public: bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, - LLVMContext &Context) const override; + LLVMContext &Context, const Type *RetTy) const override; SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL, diff --git llvm/lib/Target/SPIRV/SPIRVAPI.cpp llvm/lib/Target/SPIRV/SPIRVAPI.cpp index a1ee4aada853..4c806fd7c988 100644 --- llvm/lib/Target/SPIRV/SPIRVAPI.cpp +++ llvm/lib/Target/SPIRV/SPIRVAPI.cpp @@ -134,9 +134,8 @@ SPIRVTranslateModule(Module *M, std::string &SpirvObj, std::string &ErrMsg, TargetOptions Options; std::optional<Reloc::Model> RM; std::optional<CodeModel::Model> CM; - std::unique_ptr<TargetMachine> Target = - std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine( - TargetTriple.getTriple(), "", "", 
Options, RM, CM, OLevel)); + std::unique_ptr<TargetMachine> Target(TheTarget->createTargetMachine( + TargetTriple.getTriple(), "", "", Options, RM, CM, OLevel)); if (!Target) { ErrMsg = "Could not allocate target machine!"; return false; @@ -158,10 +157,10 @@ SPIRVTranslateModule(Module *M, std::string &SpirvObj, std::string &ErrMsg, TargetLibraryInfoImpl TLII(Triple(M->getTargetTriple())); legacy::PassManager PM; PM.add(new TargetLibraryInfoWrapperPass(TLII)); - MachineModuleInfoWrapperPass *MMIWP = - new MachineModuleInfoWrapperPass(Target.get()); + std::unique_ptr<MachineModuleInfoWrapperPass> MMIWP( + new MachineModuleInfoWrapperPass(Target.get())); const_cast<TargetLoweringObjectFile *>(Target->getObjFileLowering()) - ->Initialize(MMIWP->getMMI().getContext(), *Target); + ->Initialize(MMIWP.get()->getMMI().getContext(), *Target); SmallString<4096> OutBuffer; raw_svector_ostream OutStream(OutBuffer); diff --git llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp index e236d646e66f..b52c793e57e9 100644 --- llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp +++ llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp @@ -535,15 +535,15 @@ extern Register insertAssignInstr(Register Reg, Type *Ty, SPIRVType *SpirvTy, static SPIRV::MemorySemantics::MemorySemantics getSPIRVMemSemantics(std::memory_order MemOrder) { switch (MemOrder) { - case std::memory_order::memory_order_relaxed: + case std::memory_order_relaxed: return SPIRV::MemorySemantics::None; - case std::memory_order::memory_order_acquire: + case std::memory_order_acquire: return SPIRV::MemorySemantics::Acquire; - case std::memory_order::memory_order_release: + case std::memory_order_release: return SPIRV::MemorySemantics::Release; - case std::memory_order::memory_order_acq_rel: + case std::memory_order_acq_rel: return SPIRV::MemorySemantics::AcquireRelease; - case std::memory_order::memory_order_seq_cst: + case std::memory_order_seq_cst: return SPIRV::MemorySemantics::SequentiallyConsistent; default: report_fatal_error("Unknown CL memory scope"); @@ -611,8 +611,7 @@ static Register buildMemSemanticsReg(Register SemanticsRegister, Semantics = getSPIRVMemSemantics(Order) | getMemSemanticsForStorageClass(GR->getPointerStorageClass(PtrRegister)); - - if (Order == Semantics) { + if (static_cast<unsigned>(Order) == Semantics) { MRI->setRegClass(SemanticsRegister, &SPIRV::iIDRegClass); return SemanticsRegister; } @@ -757,9 +756,9 @@ static bool buildAtomicCompareExchangeInst( static_cast<std::memory_order>(getIConstVal(Call->Arguments[4], MRI)); MemSemEqual = getSPIRVMemSemantics(MemOrdEq) | MemSemStorage; MemSemUnequal = getSPIRVMemSemantics(MemOrdNeq) | MemSemStorage; - if (MemOrdEq == MemSemEqual) + if (static_cast<unsigned>(MemOrdEq) == MemSemEqual) MemSemEqualReg = Call->Arguments[3]; - if (MemOrdNeq == MemSemEqual) + if (static_cast<unsigned>(MemOrdNeq) == MemSemEqual) MemSemUnequalReg = Call->Arguments[4]; } if (!MemSemEqualReg.isValid()) diff --git llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp index 44b6f5f8d507..78f6b188c45c 100644 --- llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp +++ llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp @@ -544,13 +544,7 @@ bool SPIRVCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const auto *ST = static_cast<const SPIRVSubtarget *>(&MF.getSubtarget()); bool isFunctionDecl = CF && CF->isDeclaration(); - bool canUseOpenCL = ST->canUseExtInstSet(SPIRV::InstructionSet::OpenCL_std); - bool canUseGLSL = ST->canUseExtInstSet(SPIRV::InstructionSet::GLSL_std_450); - 
assert(canUseGLSL != canUseOpenCL && - "Scenario where both sets are enabled is not supported."); - - if (isFunctionDecl && !DemangledName.empty() && - (canUseGLSL || canUseOpenCL)) { + if (isFunctionDecl && !DemangledName.empty()) { if (ResVReg.isValid()) { if (!GR->getSPIRVTypeForVReg(ResVReg)) { const Type *RetTy = OrigRetTy; @@ -607,11 +601,9 @@ bool SPIRVCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, GR->getPointerSize())); } } - auto instructionSet = canUseOpenCL ? SPIRV::InstructionSet::OpenCL_std - : SPIRV::InstructionSet::GLSL_std_450; if (auto Res = - SPIRV::lowerBuiltin(DemangledName, instructionSet, MIRBuilder, - ResVReg, OrigRetTy, ArgVRegs, GR)) + SPIRV::lowerBuiltin(DemangledName, ST->getPreferredInstructionSet(), + MIRBuilder, ResVReg, OrigRetTy, ArgVRegs, GR)) return *Res; } diff --git llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp index 1c1acd29ee0e..702206b8e0dc 100644 --- llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp +++ llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp @@ -74,7 +74,6 @@ class SPIRVEmitIntrinsics DenseMap<Instruction *, Constant *> AggrConsts; DenseMap<Instruction *, Type *> AggrConstTypes; DenseSet<Instruction *> AggrStores; - SPIRV::InstructionSet::InstructionSet InstrSet; // map of function declarations to <pointer arg index => element type> DenseMap<Function *, SmallVector<std::pair<unsigned, Type *>>> FDeclPtrTys; @@ -896,8 +895,9 @@ bool SPIRVEmitIntrinsics::deduceOperandElementTypeCalledFunction( getOclOrSpirvBuiltinDemangledName(CalledF->getName()); if (DemangledName.length() > 0 && !StringRef(DemangledName).starts_with("llvm.")) { - auto [Grp, Opcode, ExtNo] = - SPIRV::mapBuiltinToOpcode(DemangledName, InstrSet); + const SPIRVSubtarget &ST = TM->getSubtarget<SPIRVSubtarget>(*CalledF); + auto [Grp, Opcode, ExtNo] = SPIRV::mapBuiltinToOpcode( + DemangledName, ST.getPreferredInstructionSet()); if (Opcode == SPIRV::OpGroupAsyncCopy) { for (unsigned i = 0, PtrCnt = 0; i < CI->arg_size() && PtrCnt < 2; ++i) { Value *Op = CI->getArgOperand(i); @@ -2317,8 +2317,6 @@ bool SPIRVEmitIntrinsics::runOnFunction(Function &Func) { const SPIRVSubtarget &ST = TM->getSubtarget<SPIRVSubtarget>(Func); GR = ST.getSPIRVGlobalRegistry(); - InstrSet = ST.isOpenCLEnv() ? 
SPIRV::InstructionSet::OpenCL_std - : SPIRV::InstructionSet::GLSL_std_450; if (!CurrF) HaveFunPtrs = @@ -2475,8 +2473,9 @@ void SPIRVEmitIntrinsics::parseFunDeclarations(Module &M) { if (DemangledName.empty()) continue; // allow only OpGroupAsyncCopy use case at the moment - auto [Grp, Opcode, ExtNo] = - SPIRV::mapBuiltinToOpcode(DemangledName, InstrSet); + const SPIRVSubtarget &ST = TM->getSubtarget<SPIRVSubtarget>(F); + auto [Grp, Opcode, ExtNo] = SPIRV::mapBuiltinToOpcode( + DemangledName, ST.getPreferredInstructionSet()); if (Opcode != SPIRV::OpGroupAsyncCopy) continue; // find pointer arguments diff --git llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp index ecf9b6ddae1f..028699e56a94 100644 --- llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp +++ llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp @@ -22,6 +22,7 @@ #include "SPIRVSubtarget.h" #include "SPIRVTargetMachine.h" #include "SPIRVUtils.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/IR/IRBuilder.h" @@ -30,7 +31,6 @@ #include "llvm/IR/IntrinsicsSPIRV.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/LowerMemIntrinsics.h" -#include <charconv> #include <regex> using namespace llvm; @@ -228,9 +228,7 @@ static SmallVector<Metadata *> parseAnnotation(Value *I, } else { MDsItem.push_back(MDString::get(Ctx, Item)); } - } else if (int32_t Num; - std::from_chars(Item.data(), Item.data() + Item.size(), Num) - .ec == std::errc{}) { + } else if (int32_t Num; llvm::to_integer(StringRef(Item), Num, 10)) { MDsItem.push_back( ConstantAsMetadata::get(ConstantInt::get(Int32Ty, Num))); } else { diff --git llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp index fc35a3e06c43..a476b51c3120 100644 --- llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp +++ llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp @@ -111,6 +111,14 @@ bool SPIRVSubtarget::canUseExtInstSet( return AvailableExtInstSets.contains(E); } +SPIRV::InstructionSet::InstructionSet +SPIRVSubtarget::getPreferredInstructionSet() const { + if (isOpenCLEnv()) + return SPIRV::InstructionSet::OpenCL_std; + else + return SPIRV::InstructionSet::GLSL_std_450; +} + bool SPIRVSubtarget::isAtLeastSPIRVVer(VersionTuple VerToCompareTo) const { return isAtLeastVer(SPIRVVersion, VerToCompareTo); } diff --git llvm/lib/Target/SPIRV/SPIRVSubtarget.h llvm/lib/Target/SPIRV/SPIRVSubtarget.h index 984ba953e874..e587739a7636 100644 --- llvm/lib/Target/SPIRV/SPIRVSubtarget.h +++ llvm/lib/Target/SPIRV/SPIRVSubtarget.h @@ -96,6 +96,7 @@ public: } bool canUseExtension(SPIRV::Extension::Extension E) const; bool canUseExtInstSet(SPIRV::InstructionSet::InstructionSet E) const; + SPIRV::InstructionSet::InstructionSet getPreferredInstructionSet() const; SPIRVGlobalRegistry *getSPIRVGlobalRegistry() const { return GR.get(); } diff --git llvm/lib/Target/SPIRV/SPIRVUtils.h llvm/lib/Target/SPIRV/SPIRVUtils.h index 60649eac6281..fd4809825706 100644 --- llvm/lib/Target/SPIRV/SPIRVUtils.h +++ llvm/lib/Target/SPIRV/SPIRVUtils.h @@ -22,6 +22,7 @@ #include "llvm/IR/TypedPointerType.h" #include <queue> #include <string> +#include <unordered_map> #include <unordered_set> namespace llvm { diff --git llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp index 66826fadddd2..597b9a779e84 100644 --- llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp +++ 
llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp @@ -273,7 +273,7 @@ namespace { } bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, - const MCValue &Target, + const MCValue &Target, const uint64_t, const MCSubtargetInfo *STI) override { if (Fixup.getKind() >= FirstLiteralRelocationKind) return true; diff --git llvm/lib/Target/Sparc/SparcISelLowering.cpp llvm/lib/Target/Sparc/SparcISelLowering.cpp index 03a74b625430..d0cd38cf7236 100644 --- llvm/lib/Target/Sparc/SparcISelLowering.cpp +++ llvm/lib/Target/Sparc/SparcISelLowering.cpp @@ -234,7 +234,8 @@ static unsigned toCallerWindow(unsigned Reg) { bool SparcTargetLowering::CanLowerReturn( CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, - const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { + const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context, + const Type *RetTy) const { SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); return CCInfo.CheckReturn(Outs, Subtarget->is64Bit() ? RetCC_Sparc64 diff --git llvm/lib/Target/Sparc/SparcISelLowering.h llvm/lib/Target/Sparc/SparcISelLowering.h index cc672074a4be..1bee5f4cfe84 100644 --- llvm/lib/Target/Sparc/SparcISelLowering.h +++ llvm/lib/Target/Sparc/SparcISelLowering.h @@ -153,7 +153,7 @@ namespace llvm { bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, - LLVMContext &Context) const override; + LLVMContext &Context, const Type *RetTy) const override; SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, diff --git llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp index 599afed2199f..b892c9ea6960 100644 --- llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp +++ llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp @@ -75,7 +75,8 @@ enum MemoryKind { BDXMem, BDLMem, BDRMem, - BDVMem + BDVMem, + LXAMem }; class SystemZOperand : public MCParsedAsmOperand { @@ -339,6 +340,13 @@ public: addExpr(Inst, Mem.Disp); Inst.addOperand(MCOperand::createReg(Mem.Index)); } + void addLXAAddrOperands(MCInst &Inst, unsigned N) const { + assert(N == 3 && "Invalid number of operands"); + assert(isMem(LXAMem) && "Invalid operand type"); + Inst.addOperand(MCOperand::createReg(Mem.Base)); + addExpr(Inst, Mem.Disp); + Inst.addOperand(MCOperand::createReg(Mem.Index)); + } void addImmTLSOperands(MCInst &Inst, unsigned N) const { assert(N == 2 && "Invalid number of operands"); assert(Kind == KindImmTLS && "Invalid operand type"); @@ -376,6 +384,7 @@ public: bool isBDLAddr64Disp12Len8() const { return isMemDisp12Len8(GR64Reg); } bool isBDRAddr64Disp12() const { return isMemDisp12(BDRMem, GR64Reg); } bool isBDVAddr64Disp12() const { return isMemDisp12(BDVMem, GR64Reg); } + bool isLXAAddr64Disp20() const { return isMemDisp20(LXAMem, GR64Reg); } bool isU1Imm() const { return isImm(0, 1); } bool isU2Imm() const { return isImm(0, 3); } bool isU3Imm() const { return isImm(0, 7); } @@ -582,6 +591,9 @@ public: ParseStatus parseBDVAddr64(OperandVector &Operands) { return parseAddress(Operands, BDVMem, GR64Reg); } + ParseStatus parseLXAAddr64(OperandVector &Operands) { + return parseAddress(Operands, LXAMem, GR64Reg); + } ParseStatus parsePCRel12(OperandVector &Operands) { return parsePCRel(Operands, -(1LL << 12), (1LL << 12) - 1, false); } @@ -1144,15 +1156,20 @@ ParseStatus 
SystemZAsmParser::parseAddress(OperandVector &Operands, return Error(StartLoc, "invalid use of indexed addressing"); break; case BDXMem: + case LXAMem: // If we have Reg1, it must be an address register. if (HaveReg1) { + const unsigned *IndexRegs = Regs; + if (MemKind == LXAMem) + IndexRegs = SystemZMC::GR32Regs; + if (parseAddressRegister(Reg1)) return ParseStatus::Failure; // If there are two registers, the first one is the index and the // second is the base. If there is only a single register, it is // used as base with GAS and as index with HLASM. if (HaveReg2 || isParsingHLASM()) - Index = Reg1.Num == 0 ? 0 : Regs[Reg1.Num]; + Index = Reg1.Num == 0 ? 0 : IndexRegs[Reg1.Num]; else Base = Reg1.Num == 0 ? 0 : Regs[Reg1.Num]; } @@ -1278,6 +1295,8 @@ bool SystemZAsmParser::ParseDirectiveInsn(SMLoc L) { ResTy = parseBDAddr64(Operands); else if (Kind == MCK_BDVAddr64Disp12) ResTy = parseBDVAddr64(Operands); + else if (Kind == MCK_LXAAddr64Disp20) + ResTy = parseLXAAddr64(Operands); else if (Kind == MCK_PCRel32) ResTy = parsePCRel32(Operands); else if (Kind == MCK_PCRel16) @@ -1324,6 +1343,8 @@ bool SystemZAsmParser::ParseDirectiveInsn(SMLoc L) { ZOperand.addBDXAddrOperands(Inst, 3); else if (ZOperand.isMem(BDVMem)) ZOperand.addBDVAddrOperands(Inst, 3); + else if (ZOperand.isMem(LXAMem)) + ZOperand.addLXAAddrOperands(Inst, 3); else if (ZOperand.isImm()) ZOperand.addImmOperands(Inst, 1); else diff --git llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp index fe0f38747656..85366dfa6c4b 100644 --- llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp +++ llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp @@ -234,6 +234,12 @@ void SystemZInstPrinterCommon::printBDVAddrOperand(const MCInst *MI, int OpNum, MI->getOperand(OpNum + 2).getReg(), O); } +void SystemZInstPrinterCommon::printLXAAddrOperand(const MCInst *MI, int OpNum, + raw_ostream &O) { + printAddress(&MAI, MI->getOperand(OpNum).getReg(), MI->getOperand(OpNum + 1), + MI->getOperand(OpNum + 2).getReg(), O); +} + void SystemZInstPrinterCommon::printCond4Operand(const MCInst *MI, int OpNum, raw_ostream &O) { static const char *const CondNames[] = {"o", "h", "nle", "l", "nhe", diff --git llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.h llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.h index 1a11e421691a..304aa03d988d 100644 --- llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.h +++ llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.h @@ -58,6 +58,7 @@ protected: void printBDLAddrOperand(const MCInst *MI, int OpNum, raw_ostream &O); void printBDRAddrOperand(const MCInst *MI, int OpNum, raw_ostream &O); void printBDVAddrOperand(const MCInst *MI, int OpNum, raw_ostream &O); + void printLXAAddrOperand(const MCInst *MI, int OpNum, raw_ostream &O); void printU1ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O); void printU2ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O); void printU3ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O); diff --git llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp index ed174f7ac01f..cbf322a94704 100644 --- llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp +++ llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp @@ -116,7 +116,7 @@ public: std::optional<MCFixupKind> getFixupKind(StringRef Name) const override; const MCFixupKindInfo 
&getFixupKindInfo(MCFixupKind Kind) const override; bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, - const MCValue &Target, + const MCValue &Target, const uint64_t Value, const MCSubtargetInfo *STI) override; void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target, MutableArrayRef<char> Data, @@ -161,7 +161,7 @@ SystemZMCAsmBackend::getFixupKindInfo(MCFixupKind Kind) const { bool SystemZMCAsmBackend::shouldForceRelocation(const MCAssembler &, const MCFixup &Fixup, - const MCValue &, + const MCValue &, const uint64_t, const MCSubtargetInfo *STI) { return Fixup.getKind() >= FirstLiteralRelocationKind; } diff --git llvm/lib/Target/SystemZ/SystemZFeatures.td llvm/lib/Target/SystemZ/SystemZFeatures.td index e6b95d32c29f..ec1a7beeab21 100644 --- llvm/lib/Target/SystemZ/SystemZFeatures.td +++ llvm/lib/Target/SystemZ/SystemZFeatures.td @@ -346,6 +346,45 @@ def Arch14NewFeatures : SystemZFeatureList<[ FeatureProcessorActivityInstrumentation ]>; +//===----------------------------------------------------------------------===// +// +// New features added in the Fifteenth Edition of the z/Architecture +// +//===----------------------------------------------------------------------===// + +def FeatureMiscellaneousExtensions4 : SystemZFeature< + "miscellaneous-extensions-4", "MiscellaneousExtensions4", (all_of FeatureMiscellaneousExtensions4), + "Assume that the miscellaneous-extensions facility 4 is installed" +>; + +def FeatureVectorEnhancements3 : SystemZFeature< + "vector-enhancements-3", "VectorEnhancements3", (all_of FeatureVectorEnhancements3), + "Assume that the vector enhancements facility 3 is installed" +>; + +def FeatureVectorPackedDecimalEnhancement3 : SystemZFeature< + "vector-packed-decimal-enhancement-3", "VectorPackedDecimalEnhancement3", (all_of FeatureVectorPackedDecimalEnhancement3), + "Assume that the vector packed decimal enhancement facility 3 is installed" +>; + +def FeatureMessageSecurityAssist12 : SystemZFeature< + "message-security-assist-extension12", "MessageSecurityAssist12", (all_of FeatureMessageSecurityAssist12), + "Assume that the message-security-assist extension facility 12 is installed" +>; + +def FeatureConcurrentFunctions : SystemZFeature< + "concurrent-functions", "ConcurrentFunctions", (all_of FeatureConcurrentFunctions), + "Assume that the concurrent-functions facility is installed" +>; + +def Arch15NewFeatures : SystemZFeatureList<[ + FeatureMiscellaneousExtensions4, + FeatureVectorEnhancements3, + FeatureVectorPackedDecimalEnhancement3, + FeatureMessageSecurityAssist12, + FeatureConcurrentFunctions +]>; + //===----------------------------------------------------------------------===// // // Cumulative supported and unsupported feature sets @@ -366,9 +405,13 @@ def Arch13SupportedFeatures : SystemZFeatureAdd<Arch12SupportedFeatures.List, Arch13NewFeatures.List>; def Arch14SupportedFeatures : SystemZFeatureAdd<Arch13SupportedFeatures.List, Arch14NewFeatures.List>; +def Arch15SupportedFeatures + : SystemZFeatureAdd<Arch14SupportedFeatures.List, Arch15NewFeatures.List>; -def Arch14UnsupportedFeatures +def Arch15UnsupportedFeatures : SystemZFeatureList<[]>; +def Arch14UnsupportedFeatures + : SystemZFeatureAdd<Arch15UnsupportedFeatures.List, Arch15NewFeatures.List>; def Arch13UnsupportedFeatures : SystemZFeatureAdd<Arch14UnsupportedFeatures.List, Arch14NewFeatures.List>; def Arch12UnsupportedFeatures diff --git llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp index 
884d3a0614a8..3d90e3f6f678 100644 --- llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp +++ llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp @@ -1001,6 +1001,16 @@ bool SystemZDAGToDAGISel::tryRISBGZero(SDNode *N) { if (Count == 1 && N->getOpcode() != ISD::AND) return false; + // Prefer LOAD LOGICAL INDEXED ADDRESS over RISBG in the case where we + // can use its displacement to pull in an addition. + if (Subtarget->hasMiscellaneousExtensions4() && + RISBG.Rotate >= 1 && RISBG.Rotate <= 4 && + RISBG.Mask == (((uint64_t)1 << 32) - 1) << RISBG.Rotate && + RISBG.Input.getOpcode() == ISD::ADD) + if (auto *C = dyn_cast<ConstantSDNode>(RISBG.Input.getOperand(1))) + if (isInt<20>(C->getSExtValue())) + return false; + // Prefer register extensions like LLC over RISBG. Also prefer to start // out with normal ANDs if one instruction would be enough. We can convert // these ANDs into an RISBG later if a three-address instruction is useful. diff --git llvm/lib/Target/SystemZ/SystemZISelLowering.cpp llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index d664b4a41fce..4040ab6d4510 100644 --- llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -253,15 +253,24 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, setOperationAction(ISD::UMUL_LOHI, MVT::i128, Expand); setOperationAction(ISD::ROTR, MVT::i128, Expand); setOperationAction(ISD::ROTL, MVT::i128, Expand); - setOperationAction(ISD::MUL, MVT::i128, Expand); - setOperationAction(ISD::MULHS, MVT::i128, Expand); - setOperationAction(ISD::MULHU, MVT::i128, Expand); - setOperationAction(ISD::SDIV, MVT::i128, Expand); - setOperationAction(ISD::UDIV, MVT::i128, Expand); - setOperationAction(ISD::SREM, MVT::i128, Expand); - setOperationAction(ISD::UREM, MVT::i128, Expand); - setOperationAction(ISD::CTLZ, MVT::i128, Expand); - setOperationAction(ISD::CTTZ, MVT::i128, Expand); + + // No special instructions for these before arch15. + if (!Subtarget.hasVectorEnhancements3()) { + setOperationAction(ISD::MUL, MVT::i128, Expand); + setOperationAction(ISD::MULHS, MVT::i128, Expand); + setOperationAction(ISD::MULHU, MVT::i128, Expand); + setOperationAction(ISD::SDIV, MVT::i128, Expand); + setOperationAction(ISD::UDIV, MVT::i128, Expand); + setOperationAction(ISD::SREM, MVT::i128, Expand); + setOperationAction(ISD::UREM, MVT::i128, Expand); + setOperationAction(ISD::CTLZ, MVT::i128, Expand); + setOperationAction(ISD::CTTZ, MVT::i128, Expand); + } else { + // Even if we do have a legal 128-bit multiply, we do not + // want 64-bit multiply-high operations to use it. + setOperationAction(ISD::MULHS, MVT::i64, Custom); + setOperationAction(ISD::MULHU, MVT::i64, Custom); + } // Support addition/subtraction with carry. setOperationAction(ISD::UADDO, MVT::i128, Custom); @@ -272,6 +281,11 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, // Use VPOPCT and add up partial results. setOperationAction(ISD::CTPOP, MVT::i128, Custom); + // Additional instructions available with arch15. + if (Subtarget.hasVectorEnhancements3()) { + setOperationAction(ISD::ABS, MVT::i128, Legal); + } + // We have to use libcalls for these. setOperationAction(ISD::FP_TO_UINT, MVT::i128, LibCall); setOperationAction(ISD::FP_TO_SINT, MVT::i128, LibCall); @@ -339,6 +353,13 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Promote); setOperationAction(ISD::CTLZ, MVT::i64, Legal); + // On arch15 we have native support for a 64-bit CTTZ. 
+ if (Subtarget.hasMiscellaneousExtensions4()) { + setOperationAction(ISD::CTTZ, MVT::i32, Promote); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Promote); + setOperationAction(ISD::CTTZ, MVT::i64, Legal); + } + // On z15 we have native support for a 64-bit CTPOP. if (Subtarget.hasMiscellaneousExtensions3()) { setOperationAction(ISD::CTPOP, MVT::i32, Promote); @@ -433,8 +454,15 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Legal); setOperationAction(ISD::ADD, VT, Legal); setOperationAction(ISD::SUB, VT, Legal); - if (VT != MVT::v2i64) + if (VT != MVT::v2i64 || Subtarget.hasVectorEnhancements3()) setOperationAction(ISD::MUL, VT, Legal); + if (Subtarget.hasVectorEnhancements3() && + VT != MVT::v16i8 && VT != MVT::v8i16) { + setOperationAction(ISD::SDIV, VT, Legal); + setOperationAction(ISD::UDIV, VT, Legal); + setOperationAction(ISD::SREM, VT, Legal); + setOperationAction(ISD::UREM, VT, Legal); + } setOperationAction(ISD::ABS, VT, Legal); setOperationAction(ISD::AND, VT, Legal); setOperationAction(ISD::OR, VT, Legal); @@ -2412,7 +2440,8 @@ bool SystemZTargetLowering:: CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, - LLVMContext &Context) const { + LLVMContext &Context, + const Type *RetTy) const { // Special case that we cannot easily detect in RetCC_SystemZ since // i128 may not be a legal type. for (auto &Out : Outs) @@ -2527,6 +2556,7 @@ static bool isIntrinsicWithCC(SDValue Op, unsigned &Opcode, unsigned &CCValid) { case Intrinsic::s390_vceqhs: case Intrinsic::s390_vceqfs: case Intrinsic::s390_vceqgs: + case Intrinsic::s390_vceqqs: Opcode = SystemZISD::VICMPES; CCValid = SystemZ::CCMASK_VCMP; return true; @@ -2535,6 +2565,7 @@ static bool isIntrinsicWithCC(SDValue Op, unsigned &Opcode, unsigned &CCValid) { case Intrinsic::s390_vchhs: case Intrinsic::s390_vchfs: case Intrinsic::s390_vchgs: + case Intrinsic::s390_vchqs: Opcode = SystemZISD::VICMPHS; CCValid = SystemZ::CCMASK_VCMP; return true; @@ -2543,6 +2574,7 @@ static bool isIntrinsicWithCC(SDValue Op, unsigned &Opcode, unsigned &CCValid) { case Intrinsic::s390_vchlhs: case Intrinsic::s390_vchlfs: case Intrinsic::s390_vchlgs: + case Intrinsic::s390_vchlqs: Opcode = SystemZISD::VICMPHLS; CCValid = SystemZ::CCMASK_VCMP; return true; @@ -3222,6 +3254,8 @@ static void adjustICmp128(SelectionDAG &DAG, const SDLoc &DL, return; if (C.Op0.getValueType() != MVT::i128) return; + if (DAG.getSubtarget<SystemZSubtarget>().hasVectorEnhancements3()) + return; // (In-)Equality comparisons can be implemented via VCEQGS. if (C.CCMask == SystemZ::CCMASK_CMP_EQ || @@ -3635,6 +3669,18 @@ SDValue SystemZTargetLowering::lowerVectorSETCC(SelectionDAG &DAG, // matter whether we try the inversion or the swap first, since // there are no cases where both work. default: + // Optimize sign-bit comparisons to signed compares. + if (Mode == CmpMode::Int && (CC == ISD::SETEQ || CC == ISD::SETNE) && + ISD::isConstantSplatVectorAllZeros(CmpOp1.getNode())) { + unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + APInt Mask; + if (CmpOp0.getOpcode() == ISD::AND + && ISD::isConstantSplatVector(CmpOp0.getOperand(1).getNode(), Mask) + && Mask == APInt::getSignMask(EltSize)) { + CC = CC == ISD::SETEQ ? 
ISD::SETGE : ISD::SETLT; + CmpOp0 = CmpOp0.getOperand(0); + } + } if (unsigned Opcode = getVectorComparisonOrInvert(CC, Mode, Invert)) Cmp = getVectorCmp(DAG, Opcode, DL, VT, CmpOp0, CmpOp1, Chain); else { @@ -3733,6 +3779,42 @@ static SDValue getAbsolute(SelectionDAG &DAG, const SDLoc &DL, SDValue Op, return Op; } +static SDValue getI128Select(SelectionDAG &DAG, const SDLoc &DL, + Comparison C, SDValue TrueOp, SDValue FalseOp) { + EVT VT = MVT::i128; + unsigned Op; + + if (C.CCMask == SystemZ::CCMASK_CMP_NE || + C.CCMask == SystemZ::CCMASK_CMP_GE || + C.CCMask == SystemZ::CCMASK_CMP_LE) { + std::swap(TrueOp, FalseOp); + C.CCMask ^= C.CCValid; + } + if (C.CCMask == SystemZ::CCMASK_CMP_LT) { + std::swap(C.Op0, C.Op1); + C.CCMask = SystemZ::CCMASK_CMP_GT; + } + switch (C.CCMask) { + case SystemZ::CCMASK_CMP_EQ: + Op = SystemZISD::VICMPE; + break; + case SystemZ::CCMASK_CMP_GT: + if (C.ICmpType == SystemZICMP::UnsignedOnly) + Op = SystemZISD::VICMPHL; + else + Op = SystemZISD::VICMPH; + break; + default: + llvm_unreachable("Unhandled comparison"); + break; + } + + SDValue Mask = DAG.getNode(Op, DL, VT, C.Op0, C.Op1); + TrueOp = DAG.getNode(ISD::AND, DL, VT, TrueOp, Mask); + FalseOp = DAG.getNode(ISD::AND, DL, VT, FalseOp, DAG.getNOT(DL, Mask, VT)); + return DAG.getNode(ISD::OR, DL, VT, TrueOp, FalseOp); +} + SDValue SystemZTargetLowering::lowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { SDValue CmpOp0 = Op.getOperand(0); @@ -3758,6 +3840,13 @@ SDValue SystemZTargetLowering::lowerSELECT_CC(SDValue Op, return getAbsolute(DAG, DL, FalseOp, C.CCMask & SystemZ::CCMASK_CMP_GT); } + if (Subtarget.hasVectorEnhancements3() && + C.Opcode == SystemZISD::ICMP && + C.Op0.getValueType() == MVT::i128 && + TrueOp.getValueType() == MVT::i128) { + return getI128Select(DAG, DL, C, TrueOp, FalseOp); + } + SDValue CCReg = emitCmp(DAG, DL, C); SDValue Ops[] = {TrueOp, FalseOp, DAG.getTargetConstant(C.CCValid, DL, MVT::i32), @@ -4370,6 +4459,24 @@ SDValue SystemZTargetLowering::lowerGET_DYNAMIC_AREA_OFFSET( return DAG.getNode(SystemZISD::ADJDYNALLOC, DL, MVT::i64); } +SDValue SystemZTargetLowering::lowerMULH(SDValue Op, + SelectionDAG &DAG, + unsigned Opcode) const { + EVT VT = Op.getValueType(); + SDLoc DL(Op); + SDValue Even, Odd; + + // This custom expander is only used on arch15 and later for 64-bit types. + assert(!is32Bit(VT)); + assert(Subtarget.hasMiscellaneousExtensions2()); + + // SystemZISD::xMUL_LOHI returns the low result in the odd register and + // the high result in the even register. Return the latter. 
+ lowerGR128Binary(DAG, DL, VT, Opcode, + Op.getOperand(0), Op.getOperand(1), Even, Odd); + return Even; +} + SDValue SystemZTargetLowering::lowerSMUL_LOHI(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); @@ -5117,24 +5224,28 @@ SystemZTargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::s390_vuphb: case Intrinsic::s390_vuphh: case Intrinsic::s390_vuphf: + case Intrinsic::s390_vuphg: return DAG.getNode(SystemZISD::UNPACK_HIGH, SDLoc(Op), Op.getValueType(), Op.getOperand(1)); case Intrinsic::s390_vuplhb: case Intrinsic::s390_vuplhh: case Intrinsic::s390_vuplhf: + case Intrinsic::s390_vuplhg: return DAG.getNode(SystemZISD::UNPACKL_HIGH, SDLoc(Op), Op.getValueType(), Op.getOperand(1)); case Intrinsic::s390_vuplb: case Intrinsic::s390_vuplhw: case Intrinsic::s390_vuplf: + case Intrinsic::s390_vuplg: return DAG.getNode(SystemZISD::UNPACK_LOW, SDLoc(Op), Op.getValueType(), Op.getOperand(1)); case Intrinsic::s390_vupllb: case Intrinsic::s390_vupllh: case Intrinsic::s390_vupllf: + case Intrinsic::s390_vupllg: return DAG.getNode(SystemZISD::UNPACKL_LOW, SDLoc(Op), Op.getValueType(), Op.getOperand(1)); @@ -6441,6 +6552,10 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op, return lowerDYNAMIC_STACKALLOC(Op, DAG); case ISD::GET_DYNAMIC_AREA_OFFSET: return lowerGET_DYNAMIC_AREA_OFFSET(Op, DAG); + case ISD::MULHS: + return lowerMULH(Op, DAG, SystemZISD::SMUL_LOHI); + case ISD::MULHU: + return lowerMULH(Op, DAG, SystemZISD::UMUL_LOHI); case ISD::SMUL_LOHI: return lowerSMUL_LOHI(Op, DAG); case ISD::UMUL_LOHI: @@ -7945,9 +8060,9 @@ static bool combineCCMask(SDValue &CCReg, int &CCValid, int &CCMask) { auto *FalseVal = dyn_cast<ConstantSDNode>(CompareLHS->getOperand(1)); if (!FalseVal) return false; - if (CompareRHS->getZExtValue() == FalseVal->getZExtValue()) + if (CompareRHS->getAPIntValue() == FalseVal->getAPIntValue()) Invert = !Invert; - else if (CompareRHS->getZExtValue() != TrueVal->getZExtValue()) + else if (CompareRHS->getAPIntValue() != TrueVal->getAPIntValue()) return false; // Compute the effective CC mask for the new branch or select. 
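One clarifying note on getI128Select above: it relies on the standard branchless-select identity (T & M) | (F & ~M), where M is the all-ones/all-zeros mask a VICMP* compare yields. A tiny self-contained model of the identity, using uint64_t in place of i128 purely for illustration:

#include <cassert>
#include <cstdint>

// M is all-ones when the condition holds and all-zeros otherwise -- the same
// shape as the vector-compare mask consumed by the AND/OR sequence above.
uint64_t maskSelect(bool Cond, uint64_t T, uint64_t F) {
  uint64_t M = Cond ? ~uint64_t(0) : 0;
  return (T & M) | (F & ~M);
}

int main() {
  assert(maskSelect(true, 0xAAAA, 0x5555) == 0xAAAA);
  assert(maskSelect(false, 0xAAAA, 0x5555) == 0x5555);
  return 0;
}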
diff --git llvm/lib/Target/SystemZ/SystemZISelLowering.h llvm/lib/Target/SystemZ/SystemZISelLowering.h index d663e4abfb4e..839a55001244 100644 --- llvm/lib/Target/SystemZ/SystemZISelLowering.h +++ llvm/lib/Target/SystemZ/SystemZISelLowering.h @@ -620,7 +620,8 @@ public: bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, - LLVMContext &Context) const override; + LLVMContext &Context, + const Type *RetTy) const override; SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL, @@ -697,6 +698,7 @@ private: SDValue lowerDYNAMIC_STACKALLOC_ELF(SDValue Op, SelectionDAG &DAG) const; SDValue lowerDYNAMIC_STACKALLOC_XPLINK(SDValue Op, SelectionDAG &DAG) const; SDValue lowerGET_DYNAMIC_AREA_OFFSET(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerMULH(SDValue Op, SelectionDAG &DAG, unsigned Opcode) const; SDValue lowerSMUL_LOHI(SDValue Op, SelectionDAG &DAG) const; SDValue lowerUMUL_LOHI(SDValue Op, SelectionDAG &DAG) const; SDValue lowerSDIVREM(SDValue Op, SelectionDAG &DAG) const; diff --git llvm/lib/Target/SystemZ/SystemZInstrFormats.td llvm/lib/Target/SystemZ/SystemZInstrFormats.td index ae8f669e9bab..e16f3ed5f9fb 100644 --- llvm/lib/Target/SystemZ/SystemZInstrFormats.td +++ llvm/lib/Target/SystemZ/SystemZInstrFormats.td @@ -1337,6 +1337,74 @@ class InstVRIi<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern> let Inst{7-0} = op{7-0}; } +class InstVRIj<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<5> V1; + bits<5> V2; + bits<8> I3; + bits<4> M4; + + let Inst{47-40} = op{15-8}; + let Inst{39-36} = V1{3-0}; + let Inst{35-32} = V2{3-0}; + let Inst{31-24} = 0; + let Inst{23-20} = M4; + let Inst{19-12} = I3; + let Inst{11} = V1{4}; + let Inst{10} = V2{4}; + let Inst{9-8} = 0; + let Inst{7-0} = op{7-0}; +} + +class InstVRIk<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<5> V1; + bits<5> V2; + bits<5> V3; + bits<5> V4; + bits<8> I5; + + let Inst{47-40} = op{15-8}; + let Inst{39-36} = V1{3-0}; + let Inst{35-32} = V2{3-0}; + let Inst{31-28} = V3{3-0}; + let Inst{27-24} = 0; + let Inst{23-16} = I5; + let Inst{15-12} = V4{3-0}; + let Inst{11} = V1{4}; + let Inst{10} = V2{4}; + let Inst{9} = V3{4}; + let Inst{8} = V4{4}; + let Inst{7-0} = op{7-0}; +} + +class InstVRIl<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<5> V1; + bits<5> V2; + bits<16> I3; + + let Inst{47-40} = op{15-8}; + let Inst{39-36} = 0; + let Inst{35-32} = V1{3-0}; + let Inst{31-28} = V2{3-0}; + let Inst{27-12} = I3; + let Inst{11} = 0; + let Inst{10} = V1{4}; + let Inst{9} = V2{4}; + let Inst{8} = 0; + let Inst{7-0} = op{7-0}; +} + // Depending on the instruction mnemonic, certain bits may be or-ed into // the M4 value provided as explicit operand. These are passed as m4or. 
class InstVRRa<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern, @@ -1511,11 +1579,13 @@ class InstVRRg<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern> field bits<48> SoftFail = 0; bits<5> V1; + bits<16> I2; let Inst{47-40} = op{15-8}; let Inst{39-36} = 0; let Inst{35-32} = V1{3-0}; - let Inst{31-12} = 0; + let Inst{31-28} = 0; + let Inst{27-12} = I2; let Inst{11} = 0; let Inst{10} = V1{4}; let Inst{9-8} = 0; @@ -2187,6 +2257,9 @@ multiclass MnemonicCondBranchAlias<CondVariant V, string from, string to, // LoadAddress: // One register output operand and one address operand. // +// LoadIndexedAddress: +// One register output operand and one indexed address operand. +// // SideEffectAddress: // One address operand. No output operands, but causes some side effect. // @@ -3079,6 +3152,32 @@ class LoadAddressRIL<string mnemonic, bits<12> opcode, mnemonic#"\t$R1, $RI2", [(set GR64:$R1, (operator pcrel32:$RI2))]>; +multiclass LoadIndexedAddressRXY<string mnemonic, bits<16> opcode, + SDPatternOperator ext, + SDPatternOperator shift = bitconvert> { + def "" : InstRXYa<opcode, (outs GR64:$R1), + (ins (lxaaddr20only $B2, $D2, $X2):$XBD2), + mnemonic#"\t$R1, $XBD2", []>; + + // Patterns matching LXA with displacement. + def : Pat<(add ADDR64:$base, + (shift (i64 (ext (add ADDR32:$index, disp20imm32:$disp))))), + (!cast<Instruction>(NAME) ADDR64:$base, imm32:$disp, ADDR32:$index)>; + def : Pat<(shift (i64 (ext (add ADDR32:$index, disp20imm32:$disp)))), + (!cast<Instruction>(NAME) zero_reg, imm32:$disp, ADDR32:$index)>; + + // Patterns matching LXA without displacement. These are only beneficial + // if we have a non-trivial shift. Also, we need to add some complexity + // to account for the fact that the regular shift patterns have rather + // high complexity values due to allowing base + displacement. 
+ if !ne(shift, bitconvert) then let AddedComplexity = 2 in { + def : Pat<(add ADDR64:$base, (shift (i64 (ext ADDR32:$index)))), + (!cast<Instruction>(NAME) ADDR64:$base, 0, ADDR32:$index)>; + def : Pat<(shift (i64 (ext ADDR32:$index))), + (!cast<Instruction>(NAME) zero_reg, 0, ADDR32:$index)>; + } +} + class UnaryRR<string mnemonic, bits<8> opcode, SDPatternOperator operator, RegisterOperand cls1, RegisterOperand cls2> : InstRR<opcode, (outs cls1:$R1), (ins cls2:$R2), @@ -4453,7 +4552,17 @@ class TestRSL<string mnemonic, bits<16> opcode> class TestVRRg<string mnemonic, bits<16> opcode> : InstVRRg<opcode, (outs), (ins VR128:$V1), - mnemonic#"\t$V1", []>; + mnemonic#"\t$V1", []> { + let I2 = 0; +} + +class TestExtraVRRg<string mnemonic, bits<16> opcode> + : InstVRRg<opcode, (outs), (ins VR128:$V1, imm32zx16:$I2), + mnemonic#"\t$V1, $I2", []>; + +class TestExtraVRIl<string mnemonic, bits<16> opcode> + : InstVRIl<opcode, (outs), (ins VR128:$V1, VR128:$V2, imm32zx16:$I3), + mnemonic#"\t$V1, $V2, $I3", []>; class SideEffectTernarySSc<string mnemonic, bits<8> opcode> : InstSSc<opcode, (outs), (ins (bdladdr12onlylen4 $B1, $D1, $L1):$BDL1, @@ -4675,6 +4784,11 @@ class TernaryVRIi<string mnemonic, bits<16> opcode, RegisterOperand cls> (ins cls:$R2, imm32zx8:$I3, imm32zx4:$M4), mnemonic#"\t$V1, $R2, $I3, $M4", []>; +class TernaryVRIj<string mnemonic, bits<16> opcode> + : InstVRIj<opcode, (outs VR128:$V1), + (ins VR128:$V2, imm32zx8:$I3, imm32zx4:$M4), + mnemonic#"\t$V1, $V2, $I3, $M4", []>; + class TernaryVRRa<string mnemonic, bits<16> opcode, SDPatternOperator operator, TypedReg tr1, TypedReg tr2, bits<4> type, bits<4> m4or> : InstVRRa<opcode, (outs tr1.op:$V1), @@ -4748,6 +4862,26 @@ class TernaryVRRc<string mnemonic, bits<16> opcode, SDPatternOperator operator, let M6 = 0; } +class TernaryVRRcInt<string mnemonic, bits<16> opcode, + SDPatternOperator operator, TypedReg tr1, TypedReg tr2, + bits<4> type = 0> + : InstVRRc<opcode, (outs tr1.op:$V1), + (ins tr2.op:$V2, tr2.op:$V3, imm32zx4:$M5), + mnemonic#"\t$V1, $V2, $V3, $M5", + [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V2), + (tr2.vt tr2.op:$V3), + imm32zx4_timm:$M5))]> { + let M4 = type; + let M6 = 0; +} + +class TernaryVRRcIntGeneric<string mnemonic, bits<16> opcode> + : InstVRRc<opcode, (outs VR128:$V1), + (ins VR128:$V2, VR128:$V3, imm32zx4:$M4, imm32zx4:$M5), + mnemonic#"\t$V1, $V2, $V3, $M4, $M5", []> { + let M6 = 0; +} + class TernaryVRRcFloat<string mnemonic, bits<16> opcode, SDPatternOperator operator, TypedReg tr1, TypedReg tr2, bits<4> type = 0, bits<4> m5 = 0> @@ -4926,6 +5060,16 @@ class QuaternaryVRIg<string mnemonic, bits<16> opcode> imm32zx8:$I4, imm32zx4:$M5), mnemonic#"\t$V1, $V2, $I3, $I4, $M5", []>; +class QuaternaryVRIk<string mnemonic, bits<16> opcode, + SDPatternOperator operator, TypedReg tr> + : InstVRIk<opcode, (outs VR128:$V1), + (ins VR128:$V2, VR128:$V3, VR128:$V4, imm32zx8:$I5), + mnemonic#"\t$V1, $V2, $V3, $V4, $I5", + [(set (tr.vt tr.op:$V1), (operator (tr.vt tr.op:$V2), + (tr.vt tr.op:$V3), + (tr.vt tr.op:$V4), + imm32zx8_timm:$I5))]>; + class QuaternaryVRRd<string mnemonic, bits<16> opcode, SDPatternOperator operator, TypedReg tr1, TypedReg tr2, TypedReg tr3, TypedReg tr4, bits<4> type, diff --git llvm/lib/Target/SystemZ/SystemZInstrInfo.td llvm/lib/Target/SystemZ/SystemZInstrInfo.td index e70ae5dadcb0..adfd0a19859c 100644 --- llvm/lib/Target/SystemZ/SystemZInstrInfo.td +++ llvm/lib/Target/SystemZ/SystemZInstrInfo.td @@ -884,6 +884,39 @@ let isAsCheapAsAMove = 1, isMoveImm = 1, isReMaterializable = 1 in 
def GOT : Alias<6, (outs GR64:$R1), (ins), [(set GR64:$R1, (global_offset_table))]>; +// Load (logical) indexed address. +let Predicates = [FeatureMiscellaneousExtensions4] in { + defm LXAB : LoadIndexedAddressRXY<"lxab", 0xE360, sext32>; + defm LXAH : LoadIndexedAddressRXY<"lxah", 0xE362, sext32, shl1>; + defm LXAF : LoadIndexedAddressRXY<"lxaf", 0xE364, sext32, shl2>; + defm LXAG : LoadIndexedAddressRXY<"lxag", 0xE366, sext32, shl3>; + defm LXAQ : LoadIndexedAddressRXY<"lxaq", 0xE368, sext32, shl4>; + defm LLXAB : LoadIndexedAddressRXY<"llxab", 0xE361, zext32>; + defm LLXAH : LoadIndexedAddressRXY<"llxah", 0xE363, zext32, shl1>; + defm LLXAF : LoadIndexedAddressRXY<"llxaf", 0xE365, zext32, shl2>; + defm LLXAG : LoadIndexedAddressRXY<"llxag", 0xE367, zext32, shl3>; + defm LLXAQ : LoadIndexedAddressRXY<"llxaq", 0xE369, zext32, shl4>; + + // Peepholes to use load (logical) indexed address to implement + // add + shift of an already extended value. + def : Pat<(add ADDR64:$base, (shl1 (assertsext32 ADDR64:$index))), + (LXAH ADDR64:$base, 0, (EXTRACT_SUBREG ADDR64:$index, subreg_l32))>; + def : Pat<(add ADDR64:$base, (shl2 (assertsext32 ADDR64:$index))), + (LXAF ADDR64:$base, 0, (EXTRACT_SUBREG ADDR64:$index, subreg_l32))>; + def : Pat<(add ADDR64:$base, (shl3 (assertsext32 ADDR64:$index))), + (LXAG ADDR64:$base, 0, (EXTRACT_SUBREG ADDR64:$index, subreg_l32))>; + def : Pat<(add ADDR64:$base, (shl4 (assertsext32 ADDR64:$index))), + (LXAQ ADDR64:$base, 0, (EXTRACT_SUBREG ADDR64:$index, subreg_l32))>; + def : Pat<(add ADDR64:$base, (shl1 (assertzext32 ADDR64:$index))), + (LLXAH ADDR64:$base, 0, (EXTRACT_SUBREG ADDR64:$index, subreg_l32))>; + def : Pat<(add ADDR64:$base, (shl2 (assertzext32 ADDR64:$index))), + (LLXAF ADDR64:$base, 0, (EXTRACT_SUBREG ADDR64:$index, subreg_l32))>; + def : Pat<(add ADDR64:$base, (shl3 (assertzext32 ADDR64:$index))), + (LLXAG ADDR64:$base, 0, (EXTRACT_SUBREG ADDR64:$index, subreg_l32))>; + def : Pat<(add ADDR64:$base, (shl4 (assertzext32 ADDR64:$index))), + (LLXAQ ADDR64:$base, 0, (EXTRACT_SUBREG ADDR64:$index, subreg_l32))>; +} + //===----------------------------------------------------------------------===// // Absolute and Negation //===----------------------------------------------------------------------===// @@ -1821,6 +1854,19 @@ let Predicates = [FeatureInterlockedAccess1], Defs = [CC] in { def LPDG : BinarySSF<"lpdg", 0xC85, GR128>; } +// Compare and load. +let Predicates = [FeatureConcurrentFunctions], Defs = [CC] in { + def CAL : BinarySSF<"cal", 0xC86, GR32>; + def CALGF : BinarySSF<"calgf", 0xC8F, GR64>; + def CALG : BinarySSF<"calg", 0xC87, GR64>; +} + +// Perform function with concurrent results. 
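+// PFCR selects one of several subfunctions via an implicit function code in
+// r0, so it is modeled conservatively below (mayLoad, mayStore,
+// hasSideEffects) and given no selection pattern (null_frag); presumably it
+// is reached only via the assembler or target-specific builtins.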
+let Predicates = [FeatureConcurrentFunctions], Uses = [R0D], Defs = [CC], + mayLoad = 1, mayStore = 1, hasSideEffects = 1 in { + def PFCR : BinaryRSY<"pfcr", 0xEB16, null_frag, GR64>; +} + //===----------------------------------------------------------------------===// // Translate and convert //===----------------------------------------------------------------------===// @@ -1910,6 +1956,11 @@ let mayLoad = 1, mayStore = 1, Uses = [R0L, R1D], Defs = [CC] in { let Predicates = [FeatureMessageSecurityAssist9] in def KDSA : SideEffectBinaryMemRRE<"kdsa", 0xB93A, GR64, GR128>; + + let Predicates = [FeatureMessageSecurityAssist12] in { + def KIMDOpt : SideEffectTernaryMemMemRRFc<"kimd", 0xB93E, GR64, GR128, imm32zx4>; + def KLMDOpt : SideEffectTernaryMemMemRRFc<"klmd", 0xB93F, GR64, GR128, imm32zx4>; + } } //===----------------------------------------------------------------------===// @@ -2081,6 +2132,12 @@ let Predicates = [FeatureProcessorAssist] in { // Miscellaneous Instructions. //===----------------------------------------------------------------------===// +// Count leading/trailing zeros. +let Predicates = [FeatureMiscellaneousExtensions4] in { + def CLZG : UnaryRRE<"clzg", 0xB968, ctlz, GR64, GR64>; + def CTZG : UnaryRRE<"ctzg", 0xB969, cttz, GR64, GR64>; +} + // Find leftmost one, AKA count leading zeros. The instruction actually // returns a pair of GR64s, the first giving the number of leading zeros // and the second giving a copy of the source with the leftmost one bit @@ -2099,6 +2156,12 @@ let Predicates = [FeatureMiscellaneousExtensions3] in { let Predicates = [FeaturePopulationCount], Defs = [CC] in def POPCNT : UnaryRRE<"popcnt", 0xB9E1, z_popcnt, GR64, GR64>; +// Bit deposit and bit extract. +let Predicates = [FeatureMiscellaneousExtensions4] in { + def BDEPG : BinaryRRFa<"bdepg", 0xB96D, int_s390_bdepg, GR64, GR64, GR64>; + def BEXTG : BinaryRRFa<"bextg", 0xB96C, int_s390_bextg, GR64, GR64, GR64>; +} + // Search a block of memory for a character. let mayLoad = 1, Defs = [CC] in defm SRST : StringRRE<"srst", 0xB25E, z_search_string>; diff --git llvm/lib/Target/SystemZ/SystemZInstrVector.td llvm/lib/Target/SystemZ/SystemZInstrVector.td index c09f48891c13..edd20a5de8c6 100644 --- llvm/lib/Target/SystemZ/SystemZInstrVector.td +++ llvm/lib/Target/SystemZ/SystemZInstrVector.td @@ -377,6 +377,16 @@ let Predicates = [FeatureVector] in { // Select. def VSEL : TernaryVRRe<"vsel", 0xE78D, null_frag, v128any, v128any>; + + // Blend. + let Predicates = [FeatureVectorEnhancements3] in { + def VBLEND : TernaryVRRdGeneric<"vblend", 0xE789>; + def VBLENDB : TernaryVRRd<"vblendb", 0xE789, null_frag, v128b, v128b, 0>; + def VBLENDH : TernaryVRRd<"vblendh", 0xE789, null_frag, v128h, v128h, 1>; + def VBLENDF : TernaryVRRd<"vblendf", 0xE789, null_frag, v128f, v128f, 2>; + def VBLENDG : TernaryVRRd<"vblendg", 0xE789, null_frag, v128g, v128g, 3>; + def VBLENDQ : TernaryVRRd<"vblendq", 0xE789, null_frag, v128q, v128q, 4>; + } } //===----------------------------------------------------------------------===// @@ -417,29 +427,47 @@ let Predicates = [FeatureVector] in { def : Pat<(z_vsei16_by_parts (v8i16 VR128:$src)), (VSEGH VR128:$src)>; def : Pat<(z_vsei32_by_parts (v4i32 VR128:$src)), (VSEGF VR128:$src)>; + // Generate element masks. 
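+  // As elsewhere in this file, the trailing 0..4 type argument on the defs
+  // below selects the element size (B/H/F/G/Q).  There is no generic DAG
+  // node for these, so they are reachable only through the int_s390_vgem*
+  // intrinsics.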
+ let Predicates = [FeatureVectorEnhancements3] in { + def VGEM : UnaryVRRaGeneric<"vgem", 0xE754>; + def VGEMB : UnaryVRRa<"vgemb", 0xE754, int_s390_vgemb, v128b, v128h, 0>; + def VGEMH : UnaryVRRa<"vgemh", 0xE754, int_s390_vgemh, v128h, v128b, 1>; + def VGEMF : UnaryVRRa<"vgemf", 0xE754, int_s390_vgemf, v128f, v128b, 2>; + def VGEMG : UnaryVRRa<"vgemg", 0xE754, int_s390_vgemg, v128g, v128b, 3>; + def VGEMQ : UnaryVRRa<"vgemq", 0xE754, int_s390_vgemq, v128q, v128b, 4>; + } + // Unpack high. def VUPH : UnaryVRRaGeneric<"vuph", 0xE7D7>; def VUPHB : UnaryVRRa<"vuphb", 0xE7D7, z_unpack_high, v128h, v128b, 0>; def VUPHH : UnaryVRRa<"vuphh", 0xE7D7, z_unpack_high, v128f, v128h, 1>; def VUPHF : UnaryVRRa<"vuphf", 0xE7D7, z_unpack_high, v128g, v128f, 2>; + let Predicates = [FeatureVectorEnhancements3] in + def VUPHG : UnaryVRRa<"vuphg", 0xE7D7, z_unpack_high, v128q, v128g, 3>; // Unpack logical high. def VUPLH : UnaryVRRaGeneric<"vuplh", 0xE7D5>; def VUPLHB : UnaryVRRa<"vuplhb", 0xE7D5, z_unpackl_high, v128h, v128b, 0>; def VUPLHH : UnaryVRRa<"vuplhh", 0xE7D5, z_unpackl_high, v128f, v128h, 1>; def VUPLHF : UnaryVRRa<"vuplhf", 0xE7D5, z_unpackl_high, v128g, v128f, 2>; + let Predicates = [FeatureVectorEnhancements3] in + def VUPLHG : UnaryVRRa<"vuplhg", 0xE7D5, z_unpackl_high, v128q, v128g, 3>; // Unpack low. def VUPL : UnaryVRRaGeneric<"vupl", 0xE7D6>; def VUPLB : UnaryVRRa<"vuplb", 0xE7D6, z_unpack_low, v128h, v128b, 0>; def VUPLHW : UnaryVRRa<"vuplhw", 0xE7D6, z_unpack_low, v128f, v128h, 1>; def VUPLF : UnaryVRRa<"vuplf", 0xE7D6, z_unpack_low, v128g, v128f, 2>; + let Predicates = [FeatureVectorEnhancements3] in + def VUPLG : UnaryVRRa<"vuplg", 0xE7D6, z_unpack_low, v128q, v128g, 3>; // Unpack logical low. def VUPLL : UnaryVRRaGeneric<"vupll", 0xE7D4>; def VUPLLB : UnaryVRRa<"vupllb", 0xE7D4, z_unpackl_low, v128h, v128b, 0>; def VUPLLH : UnaryVRRa<"vupllh", 0xE7D4, z_unpackl_low, v128f, v128h, 1>; def VUPLLF : UnaryVRRa<"vupllf", 0xE7D4, z_unpackl_low, v128g, v128f, 2>; + let Predicates = [FeatureVectorEnhancements3] in + def VUPLLG : UnaryVRRa<"vupllg", 0xE7D4, z_unpackl_low, v128q, v128g, 3>; } //===----------------------------------------------------------------------===// @@ -466,6 +494,31 @@ defm : GenericVectorOps<v2i64, v2i64>; defm : GenericVectorOps<v4f32, v4i32>; defm : GenericVectorOps<v2f64, v2i64>; +multiclass BlendVectorOps<ValueType type, ValueType inttype, + Instruction blend> { + let Predicates = [FeatureVectorEnhancements3] in { + def : Pat<(type (vselect (inttype (z_vicmpl_zero VR128:$x)), + VR128:$y, VR128:$z)), + (blend VR128:$y, VR128:$z, VR128:$x)>; + def : Pat<(type (vselect (inttype (z_vnot (z_vicmpl_zero VR128:$x))), + VR128:$y, VR128:$z)), + (blend VR128:$z, VR128:$y, VR128:$x)>; + } +} + +defm : BlendVectorOps<v16i8, v16i8, VBLENDB>; +defm : BlendVectorOps<v8i16, v8i16, VBLENDH>; +defm : BlendVectorOps<v4i32, v4i32, VBLENDF>; +defm : BlendVectorOps<v2i64, v2i64, VBLENDG>; +defm : BlendVectorOps<v4f32, v4i32, VBLENDF>; +defm : BlendVectorOps<v2f64, v2i64, VBLENDG>; + +let Predicates = [FeatureVectorEnhancements3] in { + def : Pat<(i128 (or (and VR128:$y, (z_vicmph 0, VR128:$x)), + (and VR128:$z, (not (z_vicmph 0, VR128:$x))))), + (VBLENDQ VR128:$y, VR128:$z, VR128:$x)>; +} + //===----------------------------------------------------------------------===// // Integer arithmetic //===----------------------------------------------------------------------===// @@ -513,6 +566,8 @@ let Predicates = [FeatureVector] in { def VAVGH : BinaryVRRc<"vavgh", 0xE7F2, int_s390_vavgh, 
v128h, v128h, 1>; def VAVGF : BinaryVRRc<"vavgf", 0xE7F2, int_s390_vavgf, v128f, v128f, 2>; def VAVGG : BinaryVRRc<"vavgg", 0xE7F2, int_s390_vavgg, v128g, v128g, 3>; + let Predicates = [FeatureVectorEnhancements3] in + def VAVGQ : BinaryVRRc<"vavgq", 0xE7F2, int_s390_vavgq, v128q, v128q, 4>; // Average logical. def VAVGL : BinaryVRRcGeneric<"vavgl", 0xE7F0>; @@ -520,6 +575,8 @@ let Predicates = [FeatureVector] in { def VAVGLH : BinaryVRRc<"vavglh", 0xE7F0, int_s390_vavglh, v128h, v128h, 1>; def VAVGLF : BinaryVRRc<"vavglf", 0xE7F0, int_s390_vavglf, v128f, v128f, 2>; def VAVGLG : BinaryVRRc<"vavglg", 0xE7F0, int_s390_vavglg, v128g, v128g, 3>; + let Predicates = [FeatureVectorEnhancements3] in + def VAVGLQ : BinaryVRRc<"vavglq", 0xE7F0, int_s390_vavglq, v128q, v128q, 4>; } // Checksum. @@ -531,6 +588,8 @@ let Predicates = [FeatureVector] in { def VCLZH : UnaryVRRa<"vclzh", 0xE753, ctlz, v128h, v128h, 1>; def VCLZF : UnaryVRRa<"vclzf", 0xE753, ctlz, v128f, v128f, 2>; def VCLZG : UnaryVRRa<"vclzg", 0xE753, ctlz, v128g, v128g, 3>; + let Predicates = [FeatureVectorEnhancements3] in + def VCLZQ : UnaryVRRa<"vclzq", 0xE753, ctlz, v128q, v128q, 4>; // Count trailing zeros. def VCTZ : UnaryVRRaGeneric<"vctz", 0xE752>; @@ -538,6 +597,38 @@ let Predicates = [FeatureVector] in { def VCTZH : UnaryVRRa<"vctzh", 0xE752, cttz, v128h, v128h, 1>; def VCTZF : UnaryVRRa<"vctzf", 0xE752, cttz, v128f, v128f, 2>; def VCTZG : UnaryVRRa<"vctzg", 0xE752, cttz, v128g, v128g, 3>; + let Predicates = [FeatureVectorEnhancements3] in + def VCTZQ : UnaryVRRa<"vctzq", 0xE752, cttz, v128q, v128q, 4>; + + // Divide. + let Predicates = [FeatureVectorEnhancements3] in { + let hasSideEffects = 1 in { + def VD : TernaryVRRcIntGeneric<"vd", 0xE7B2>; + def VDF : TernaryVRRcInt<"vdf", 0xE7B2, null_frag, v128f, v128f, 2>; + def VDG : TernaryVRRcInt<"vdg", 0xE7B2, null_frag, v128g, v128g, 3>; + def VDQ : TernaryVRRcInt<"vdq", 0xE7B2, null_frag, v128q, v128q, 4>; + } + def : Pat<(v4i32 (sdiv VR128:$x, VR128:$y)), (VDF VR128:$x, VR128:$y, 0)>; + def : Pat<(v2i64 (sdiv VR128:$x, VR128:$y)), (VDG VR128:$x, VR128:$y, 0)>; + def : Pat<(i128 (sdiv VR128:$x, VR128:$y)), (VDQ VR128:$x, VR128:$y, 0)>; + } + + // Divide logical. + let Predicates = [FeatureVectorEnhancements3] in { + let hasSideEffects = 1 in { + def VDL : TernaryVRRcIntGeneric<"vdl", 0xE7B0>; + def VDLF : TernaryVRRcInt<"vdlf", 0xE7B0, null_frag, v128f, v128f, 2>; + def VDLG : TernaryVRRcInt<"vdlg", 0xE7B0, null_frag, v128g, v128g, 3>; + def VDLQ : TernaryVRRcInt<"vdlq", 0xE7B0, null_frag, v128q, v128q, 4>; + } + def : Pat<(v4i32 (udiv VR128:$x, VR128:$y)), (VDLF VR128:$x, VR128:$y, 0)>; + def : Pat<(v2i64 (udiv VR128:$x, VR128:$y)), (VDLG VR128:$x, VR128:$y, 0)>; + def : Pat<(i128 (udiv VR128:$x, VR128:$y)), (VDLQ VR128:$x, VR128:$y, 0)>; + } + + // Evaluate. + let Predicates = [FeatureVectorEnhancements3] in + def VEVAL : QuaternaryVRIk<"veval", 0xE788, int_s390_veval, v128b>; let isCommutable = 1 in { // Not exclusive or. @@ -568,6 +659,8 @@ let Predicates = [FeatureVector] in { def VLCH : UnaryVRRa<"vlch", 0xE7DE, z_vneg, v128h, v128h, 1>; def VLCF : UnaryVRRa<"vlcf", 0xE7DE, z_vneg, v128f, v128f, 2>; def VLCG : UnaryVRRa<"vlcg", 0xE7DE, z_vneg, v128g, v128g, 3>; + let Predicates = [FeatureVectorEnhancements3] in + def VLCQ : UnaryVRRa<"vlcq", 0xE7DE, ineg, v128q, v128q, 4>; // Load positive. 
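+  // ("Load positive" is per-element absolute value, which is why the
+  // patterns below use the generic abs node.)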
def VLP : UnaryVRRaGeneric<"vlp", 0xE7DF>; @@ -575,6 +668,8 @@ let Predicates = [FeatureVector] in { def VLPH : UnaryVRRa<"vlph", 0xE7DF, abs, v128h, v128h, 1>; def VLPF : UnaryVRRa<"vlpf", 0xE7DF, abs, v128f, v128f, 2>; def VLPG : UnaryVRRa<"vlpg", 0xE7DF, abs, v128g, v128g, 3>; + let Predicates = [FeatureVectorEnhancements3] in + def VLPQ : UnaryVRRa<"vlpq", 0xE7DF, abs, v128q, v128q, 4>; let isCommutable = 1 in { // Maximum. @@ -583,6 +678,8 @@ let Predicates = [FeatureVector] in { def VMXH : BinaryVRRc<"vmxh", 0xE7FF, null_frag, v128h, v128h, 1>; def VMXF : BinaryVRRc<"vmxf", 0xE7FF, null_frag, v128f, v128f, 2>; def VMXG : BinaryVRRc<"vmxg", 0xE7FF, null_frag, v128g, v128g, 3>; + let Predicates = [FeatureVectorEnhancements3] in + def VMXQ : BinaryVRRc<"vmxq", 0xE7FF, null_frag, v128q, v128q, 4>; // Maximum logical. def VMXL : BinaryVRRcGeneric<"vmxl", 0xE7FD>; @@ -590,6 +687,8 @@ let Predicates = [FeatureVector] in { def VMXLH : BinaryVRRc<"vmxlh", 0xE7FD, null_frag, v128h, v128h, 1>; def VMXLF : BinaryVRRc<"vmxlf", 0xE7FD, null_frag, v128f, v128f, 2>; def VMXLG : BinaryVRRc<"vmxlg", 0xE7FD, null_frag, v128g, v128g, 3>; + let Predicates = [FeatureVectorEnhancements3] in + def VMXLQ : BinaryVRRc<"vmxlq", 0xE7FD, null_frag, v128q, v128q, 4>; } let isCommutable = 1 in { @@ -599,6 +698,8 @@ let Predicates = [FeatureVector] in { def VMNH : BinaryVRRc<"vmnh", 0xE7FE, null_frag, v128h, v128h, 1>; def VMNF : BinaryVRRc<"vmnf", 0xE7FE, null_frag, v128f, v128f, 2>; def VMNG : BinaryVRRc<"vmng", 0xE7FE, null_frag, v128g, v128g, 3>; + let Predicates = [FeatureVectorEnhancements3] in + def VMNQ : BinaryVRRc<"vmnq", 0xE7FE, null_frag, v128q, v128q, 4>; // Minimum logical. def VMNL : BinaryVRRcGeneric<"vmnl", 0xE7FC>; @@ -606,6 +707,8 @@ let Predicates = [FeatureVector] in { def VMNLH : BinaryVRRc<"vmnlh", 0xE7FC, null_frag, v128h, v128h, 1>; def VMNLF : BinaryVRRc<"vmnlf", 0xE7FC, null_frag, v128f, v128f, 2>; def VMNLG : BinaryVRRc<"vmnlg", 0xE7FC, null_frag, v128g, v128g, 3>; + let Predicates = [FeatureVectorEnhancements3] in + def VMNLQ : BinaryVRRc<"vmnlq", 0xE7FC, null_frag, v128q, v128q, 4>; } let isCommutable = 1 in { @@ -614,42 +717,62 @@ let Predicates = [FeatureVector] in { def VMALB : TernaryVRRd<"vmalb", 0xE7AA, z_muladd, v128b, v128b, 0>; def VMALHW : TernaryVRRd<"vmalhw", 0xE7AA, z_muladd, v128h, v128h, 1>; def VMALF : TernaryVRRd<"vmalf", 0xE7AA, z_muladd, v128f, v128f, 2>; + let Predicates = [FeatureVectorEnhancements3] in { + def VMALG : TernaryVRRd<"vmalg", 0xE7AA, z_muladd, v128g, v128g, 3>; + def VMALQ : TernaryVRRd<"vmalq", 0xE7AA, z_muladd, v128q, v128q, 4>; + } // Multiply and add high. def VMAH : TernaryVRRdGeneric<"vmah", 0xE7AB>; def VMAHB : TernaryVRRd<"vmahb", 0xE7AB, int_s390_vmahb, v128b, v128b, 0>; def VMAHH : TernaryVRRd<"vmahh", 0xE7AB, int_s390_vmahh, v128h, v128h, 1>; def VMAHF : TernaryVRRd<"vmahf", 0xE7AB, int_s390_vmahf, v128f, v128f, 2>; + let Predicates = [FeatureVectorEnhancements3] in { + def VMAHG : TernaryVRRd<"vmahg", 0xE7AB, int_s390_vmahg, v128g, v128g, 3>; + def VMAHQ : TernaryVRRd<"vmahq", 0xE7AB, int_s390_vmahq, v128q, v128q, 4>; + } // Multiply and add logical high. 
def VMALH : TernaryVRRdGeneric<"vmalh", 0xE7A9>; def VMALHB : TernaryVRRd<"vmalhb", 0xE7A9, int_s390_vmalhb, v128b, v128b, 0>; def VMALHH : TernaryVRRd<"vmalhh", 0xE7A9, int_s390_vmalhh, v128h, v128h, 1>; def VMALHF : TernaryVRRd<"vmalhf", 0xE7A9, int_s390_vmalhf, v128f, v128f, 2>; + let Predicates = [FeatureVectorEnhancements3] in { + def VMALHG : TernaryVRRd<"vmalhg", 0xE7A9, int_s390_vmalhg, v128g, v128g, 3>; + def VMALHQ : TernaryVRRd<"vmalhq", 0xE7A9, int_s390_vmalhq, v128q, v128q, 4>; + } // Multiply and add even. def VMAE : TernaryVRRdGeneric<"vmae", 0xE7AE>; def VMAEB : TernaryVRRd<"vmaeb", 0xE7AE, int_s390_vmaeb, v128h, v128b, 0>; def VMAEH : TernaryVRRd<"vmaeh", 0xE7AE, int_s390_vmaeh, v128f, v128h, 1>; def VMAEF : TernaryVRRd<"vmaef", 0xE7AE, int_s390_vmaef, v128g, v128f, 2>; + let Predicates = [FeatureVectorEnhancements3] in + def VMAEG : TernaryVRRd<"vmaeg", 0xE7AE, int_s390_vmaeg, v128q, v128g, 3>; // Multiply and add logical even. def VMALE : TernaryVRRdGeneric<"vmale", 0xE7AC>; def VMALEB : TernaryVRRd<"vmaleb", 0xE7AC, int_s390_vmaleb, v128h, v128b, 0>; def VMALEH : TernaryVRRd<"vmaleh", 0xE7AC, int_s390_vmaleh, v128f, v128h, 1>; def VMALEF : TernaryVRRd<"vmalef", 0xE7AC, int_s390_vmalef, v128g, v128f, 2>; + let Predicates = [FeatureVectorEnhancements3] in + def VMALEG : TernaryVRRd<"vmaleg", 0xE7AC, int_s390_vmaleg, v128q, v128g, 3>; // Multiply and add odd. def VMAO : TernaryVRRdGeneric<"vmao", 0xE7AF>; def VMAOB : TernaryVRRd<"vmaob", 0xE7AF, int_s390_vmaob, v128h, v128b, 0>; def VMAOH : TernaryVRRd<"vmaoh", 0xE7AF, int_s390_vmaoh, v128f, v128h, 1>; def VMAOF : TernaryVRRd<"vmaof", 0xE7AF, int_s390_vmaof, v128g, v128f, 2>; + let Predicates = [FeatureVectorEnhancements3] in + def VMAOG : TernaryVRRd<"vmaog", 0xE7AF, int_s390_vmaog, v128q, v128g, 3>; // Multiply and add logical odd. def VMALO : TernaryVRRdGeneric<"vmalo", 0xE7AD>; def VMALOB : TernaryVRRd<"vmalob", 0xE7AD, int_s390_vmalob, v128h, v128b, 0>; def VMALOH : TernaryVRRd<"vmaloh", 0xE7AD, int_s390_vmaloh, v128f, v128h, 1>; def VMALOF : TernaryVRRd<"vmalof", 0xE7AD, int_s390_vmalof, v128g, v128f, 2>; + let Predicates = [FeatureVectorEnhancements3] in + def VMALOG : TernaryVRRd<"vmalog", 0xE7AD, int_s390_vmalog, v128q, v128g, 3>; } let isCommutable = 1 in { @@ -658,42 +781,66 @@ let Predicates = [FeatureVector] in { def VMHB : BinaryVRRc<"vmhb", 0xE7A3, int_s390_vmhb, v128b, v128b, 0>; def VMHH : BinaryVRRc<"vmhh", 0xE7A3, int_s390_vmhh, v128h, v128h, 1>; def VMHF : BinaryVRRc<"vmhf", 0xE7A3, int_s390_vmhf, v128f, v128f, 2>; + let Predicates = [FeatureVectorEnhancements3] in { + def VMHG : BinaryVRRc<"vmhg", 0xE7A3, int_s390_vmhg, v128g, v128g, 3>; + def VMHQ : BinaryVRRc<"vmhq", 0xE7A3, int_s390_vmhq, v128q, v128q, 4>; + } // Multiply logical high. def VMLH : BinaryVRRcGeneric<"vmlh", 0xE7A1>; def VMLHB : BinaryVRRc<"vmlhb", 0xE7A1, int_s390_vmlhb, v128b, v128b, 0>; def VMLHH : BinaryVRRc<"vmlhh", 0xE7A1, int_s390_vmlhh, v128h, v128h, 1>; def VMLHF : BinaryVRRc<"vmlhf", 0xE7A1, int_s390_vmlhf, v128f, v128f, 2>; + let Predicates = [FeatureVectorEnhancements3] in { + def VMLHG : BinaryVRRc<"vmlhg", 0xE7A1, int_s390_vmlhg, v128g, v128g, 3>; + def VMLHQ : BinaryVRRc<"vmlhq", 0xE7A1, int_s390_vmlhq, v128q, v128q, 4>; + } // Multiply low. 
def VML : BinaryVRRcGeneric<"vml", 0xE7A2>; def VMLB : BinaryVRRc<"vmlb", 0xE7A2, mul, v128b, v128b, 0>; def VMLHW : BinaryVRRc<"vmlhw", 0xE7A2, mul, v128h, v128h, 1>; def VMLF : BinaryVRRc<"vmlf", 0xE7A2, mul, v128f, v128f, 2>; + let Predicates = [FeatureVectorEnhancements3] in { + def VMLG : BinaryVRRc<"vmlg", 0xE7A2, mul, v128g, v128g, 3>; + def VMLQ : BinaryVRRc<"vmlq", 0xE7A2, mul, v128q, v128q, 4>; + } // Multiply even. def VME : BinaryVRRcGeneric<"vme", 0xE7A6>; def VMEB : BinaryVRRc<"vmeb", 0xE7A6, int_s390_vmeb, v128h, v128b, 0>; def VMEH : BinaryVRRc<"vmeh", 0xE7A6, int_s390_vmeh, v128f, v128h, 1>; def VMEF : BinaryVRRc<"vmef", 0xE7A6, int_s390_vmef, v128g, v128f, 2>; + let Predicates = [FeatureVectorEnhancements3] in + def VMEG : BinaryVRRc<"vmeg", 0xE7A6, int_s390_vmeg, v128q, v128g, 3>; // Multiply logical even. def VMLE : BinaryVRRcGeneric<"vmle", 0xE7A4>; def VMLEB : BinaryVRRc<"vmleb", 0xE7A4, int_s390_vmleb, v128h, v128b, 0>; def VMLEH : BinaryVRRc<"vmleh", 0xE7A4, int_s390_vmleh, v128f, v128h, 1>; def VMLEF : BinaryVRRc<"vmlef", 0xE7A4, int_s390_vmlef, v128g, v128f, 2>; + let Predicates = [FeatureVectorEnhancements3] in + def VMLEG : BinaryVRRc<"vmleg", 0xE7A4, int_s390_vmleg, v128q, v128g, 3>; // Multiply odd. def VMO : BinaryVRRcGeneric<"vmo", 0xE7A7>; def VMOB : BinaryVRRc<"vmob", 0xE7A7, int_s390_vmob, v128h, v128b, 0>; def VMOH : BinaryVRRc<"vmoh", 0xE7A7, int_s390_vmoh, v128f, v128h, 1>; def VMOF : BinaryVRRc<"vmof", 0xE7A7, int_s390_vmof, v128g, v128f, 2>; + let Predicates = [FeatureVectorEnhancements3] in + def VMOG : BinaryVRRc<"vmog", 0xE7A7, int_s390_vmog, v128q, v128g, 3>; // Multiply logical odd. def VMLO : BinaryVRRcGeneric<"vmlo", 0xE7A5>; def VMLOB : BinaryVRRc<"vmlob", 0xE7A5, int_s390_vmlob, v128h, v128b, 0>; def VMLOH : BinaryVRRc<"vmloh", 0xE7A5, int_s390_vmloh, v128f, v128h, 1>; def VMLOF : BinaryVRRc<"vmlof", 0xE7A5, int_s390_vmlof, v128g, v128f, 2>; + let Predicates = [FeatureVectorEnhancements3] in + def VMLOG : BinaryVRRc<"vmlog", 0xE7A5, int_s390_vmlog, v128q, v128g, 3>; + } + let Predicates = [FeatureVectorEnhancements3] in { + def : Pat<(i128 (mulhs VR128:$x, VR128:$y)), (VMHQ VR128:$x, VR128:$y)>; + def : Pat<(i128 (mulhu VR128:$x, VR128:$y)), (VMLHQ VR128:$x, VR128:$y)>; } // Multiply sum logical. @@ -730,6 +877,32 @@ let Predicates = [FeatureVector] in { def VPOPCTG : UnaryVRRa<"vpopctg", 0xE750, ctpop, v128g, v128g, 3>; } + // Remainder. + let Predicates = [FeatureVectorEnhancements3] in { + let hasSideEffects = 1 in { + def VR : TernaryVRRcIntGeneric<"vr", 0xE7B3>; + def VRF : TernaryVRRcInt<"vrf", 0xE7B3, null_frag, v128f, v128f, 2>; + def VRG : TernaryVRRcInt<"vrg", 0xE7B3, null_frag, v128g, v128g, 3>; + def VRQ : TernaryVRRcInt<"vrq", 0xE7B3, null_frag, v128q, v128q, 4>; + } + def : Pat<(v4i32 (srem VR128:$x, VR128:$y)), (VRF VR128:$x, VR128:$y, 0)>; + def : Pat<(v2i64 (srem VR128:$x, VR128:$y)), (VRG VR128:$x, VR128:$y, 0)>; + def : Pat<(i128 (srem VR128:$x, VR128:$y)), (VRQ VR128:$x, VR128:$y, 0)>; + } + + // Remainder logical. 
+ let Predicates = [FeatureVectorEnhancements3] in { + let hasSideEffects = 1 in { + def VRL : TernaryVRRcIntGeneric<"vrl", 0xE7B1>; + def VRLF : TernaryVRRcInt<"vrlf", 0xE7B1, null_frag, v128f, v128f, 2>; + def VRLG : TernaryVRRcInt<"vrlg", 0xE7B1, null_frag, v128g, v128g, 3>; + def VRLQ : TernaryVRRcInt<"vrlq", 0xE7B1, null_frag, v128q, v128q, 4>; + } + def : Pat<(v4i32 (urem VR128:$x, VR128:$y)), (VRLF VR128:$x, VR128:$y, 0)>; + def : Pat<(v2i64 (urem VR128:$x, VR128:$y)), (VRLG VR128:$x, VR128:$y, 0)>; + def : Pat<(i128 (urem VR128:$x, VR128:$y)), (VRLQ VR128:$x, VR128:$y, 0)>; + } + // Element rotate left logical (with vector shift amount). def VERLLV : BinaryVRRcGeneric<"verllv", 0xE773>; def VERLLVB : BinaryVRRc<"verllvb", 0xE773, rotl, v128b, v128b, 0>; @@ -887,6 +1060,144 @@ multiclass BitwiseVectorOps<ValueType type, SDPatternOperator not_op> { def : Pat<(type (or VR128:$x, (not_op VR128:$y))), (VOC VR128:$x, VR128:$y)>; } + let Predicates = [FeatureVectorEnhancements3] in { + def : Pat<(type (and VR128:$x, (and VR128:$y, VR128:$z))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 1)>; + def : Pat<(type (and (not_op VR128:$z), (and VR128:$x, VR128:$y))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 2)>; + def : Pat<(type (and VR128:$x, (xor VR128:$y, VR128:$z))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 6)>; + def : Pat<(type (and VR128:$x, (or VR128:$y, VR128:$z))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 7)>; + def : Pat<(type (and VR128:$x, (not_op (or VR128:$y, VR128:$z)))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 8)>; + def : Pat<(type (and VR128:$x, (not_op (xor VR128:$y, VR128:$z)))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 9)>; + def : Pat<(type (and VR128:$x, (or VR128:$y, (not_op VR128:$z)))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 11)>; + def : Pat<(type (and VR128:$x, (not_op (and VR128:$y, VR128:$z)))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 14)>; + def : Pat<(type (and (or VR128:$x, VR128:$y), (xor VR128:$z, (and VR128:$x, VR128:$y)))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 22)>; + def : Pat<(type (or (and VR128:$x, VR128:$y), (and VR128:$z, (or VR128:$x, VR128:$y)))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 23)>; + def : Pat<(type (and (xor VR128:$x, VR128:$y), (xor VR128:$x, VR128:$z))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 24)>; + def : Pat<(type (and (or VR128:$x, VR128:$y), (not_op (xor VR128:$y, VR128:$z)))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 25)>; + def : Pat<(type (and (or VR128:$x, VR128:$y), (xor VR128:$x, VR128:$z))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 26)>; + def : Pat<(type (and (or VR128:$x, VR128:$z), (or VR128:$y, (not_op VR128:$z)))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 27)>; + def : Pat<(type (xor VR128:$x, (and VR128:$y, VR128:$z))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 30)>; + def : Pat<(type (or VR128:$x, (and VR128:$y, VR128:$z))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 31)>; + def : Pat<(type (and (not_op VR128:$z), (xor VR128:$x, VR128:$y))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 40)>; + def : Pat<(type (and (or VR128:$x, VR128:$y), (not_op (xor VR128:$z, (and VR128:$x, VR128:$y))))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 41)>; + def : Pat<(type (and (not_op VR128:$z), (or VR128:$x, VR128:$y))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 42)>; + def : Pat<(type (or (and VR128:$x, VR128:$y), (and (not_op VR128:$z), (or VR128:$x, VR128:$y)))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 43)>; + def : Pat<(type (xor VR128:$y, (or VR128:$x, (and VR128:$y, VR128:$z)))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 44)>; + def : Pat<(type (xor VR128:$x, (and 
VR128:$y, (not_op VR128:$z)))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 45)>; + def : Pat<(type (and (or VR128:$x, VR128:$y), (not_op (and VR128:$y, VR128:$z)))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 46)>; + def : Pat<(type (or VR128:$x, (and VR128:$y, (not_op VR128:$z)))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 47)>; + def : Pat<(type (or (xor VR128:$x, VR128:$y), (and VR128:$x, VR128:$z))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 61)>; + def : Pat<(type (or (xor VR128:$x, VR128:$y), (and VR128:$x, (not_op VR128:$z)))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 62)>; + def : Pat<(type (xor (or VR128:$x, VR128:$y), (or VR128:$z, (and VR128:$x, VR128:$y)))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 104)>; + def : Pat<(type (xor VR128:$x, (xor VR128:$y, VR128:$z))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 105)>; + def : Pat<(type (xor VR128:$z, (or VR128:$x, VR128:$y))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 106)>; + def : Pat<(type (or (and VR128:$x, VR128:$y), (xor VR128:$z, (or VR128:$x, VR128:$y)))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 107)>; + def : Pat<(type (or (xor VR128:$y, VR128:$z), (and VR128:$x, (not_op VR128:$y)))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 110)>; + def : Pat<(type (or VR128:$x, (xor VR128:$y, VR128:$z))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 111)>; + def : Pat<(type (or (xor VR128:$x, VR128:$y), (xor VR128:$x, VR128:$z))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 126)>; + def : Pat<(type (or VR128:$x, (or VR128:$y, VR128:$z))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 127)>; + def : Pat<(type (not_op (or VR128:$x, (or VR128:$y, VR128:$z)))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 128)>; + def : Pat<(type (not_op (or (xor VR128:$x, VR128:$y), (xor VR128:$x, VR128:$z)))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 129)>; + def : Pat<(type (not_op (or VR128:$z, (xor VR128:$x, VR128:$y)))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 130)>; + def : Pat<(type (and (not_op (xor VR128:$x, VR128:$y)), (or VR128:$x, (not_op VR128:$z)))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 131)>; + def : Pat<(type (xor (or VR128:$y, VR128:$z), (or (not_op VR128:$x), (and VR128:$y, VR128:$z)))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 134)>; + def : Pat<(type (not_op (xor VR128:$x, (or VR128:$y, VR128:$z)))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 135)>; + def : Pat<(type (or (not_op (or VR128:$y, VR128:$z)), (and VR128:$x, (and VR128:$y, VR128:$z)))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 137)>; + def : Pat<(type (and (not_op VR128:$z), (or VR128:$x, (not_op VR128:$y)))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 138)>; + def : Pat<(type (or (and VR128:$x, VR128:$y), (not_op (or VR128:$y, VR128:$z)))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 139)>; + def : Pat<(type (or (not_op (or VR128:$y, VR128:$z)), (and VR128:$x, (xor VR128:$y, VR128:$z)))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 142)>; + def : Pat<(type (or VR128:$x, (not_op (or VR128:$y, VR128:$z)))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 143)>; + def : Pat<(type (not_op (xor VR128:$x, (xor VR128:$y, VR128:$z)))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 150)>; + def : Pat<(type (or (and VR128:$x, VR128:$y), (not_op (xor VR128:$z, (or VR128:$x, VR128:$y))))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 151)>; + def : Pat<(type (not_op (or (and VR128:$x, VR128:$y), (xor VR128:$y, VR128:$z)))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 152)>; + def : Pat<(type (xor VR128:$z, (or VR128:$x, (not_op VR128:$y)))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 154)>; + def : Pat<(type (or (and VR128:$x, VR128:$y), (not_op (xor VR128:$y, VR128:$z)))), + (VEVAL 
VR128:$x, VR128:$y, VR128:$z, 155)>; + def : Pat<(type (or (not_op (or VR128:$y, VR128:$z)), (xor VR128:$x, (and VR128:$y, VR128:$z)))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 158)>; + def : Pat<(type (or VR128:$x, (not_op (xor VR128:$y, VR128:$z)))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 159)>; + def : Pat<(type (not_op (or VR128:$z, (and VR128:$x, VR128:$y)))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 168)>; + def : Pat<(type (not_op (xor VR128:$z, (and VR128:$x, VR128:$y)))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 169)>; + def : Pat<(type (or (not_op VR128:$z), (and VR128:$x, VR128:$y))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 171)>; + def : Pat<(type (and (not_op (and VR128:$x, VR128:$y)), (or VR128:$x, (not_op VR128:$z)))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 172)>; + def : Pat<(type (not_op (and (xor VR128:$x, VR128:$z), (or VR128:$y, VR128:$z)))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 173)>; + def : Pat<(type (or (not_op VR128:$z), (and VR128:$x, (not_op VR128:$y)))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 174)>; + def : Pat<(type (or (xor VR128:$x, VR128:$y), (not_op (or VR128:$x, VR128:$z)))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 188)>; + def : Pat<(type (not_op (and (xor VR128:$x, VR128:$z), (xor VR128:$y, VR128:$z)))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 189)>; + def : Pat<(type (or (not_op VR128:$z), (xor VR128:$x, VR128:$y))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 190)>; + def : Pat<(type (or (not_op VR128:$z), (or VR128:$x, VR128:$y))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 191)>; + def : Pat<(type (or (not_op (or VR128:$x, VR128:$y)), (and (not_op VR128:$z), (xor VR128:$x, VR128:$y)))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 232)>; + def : Pat<(type (xor (not_op (and VR128:$x, VR128:$y)), (and VR128:$z, (or VR128:$x, VR128:$y)))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 233)>; + def : Pat<(type (not_op (and VR128:$z, (or VR128:$x, VR128:$y)))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 234)>; + def : Pat<(type (not_op (and VR128:$z, (xor VR128:$x, VR128:$y)))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 235)>; + def : Pat<(type (or VR128:$x, (not_op (and VR128:$y, VR128:$z)))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 239)>; + def : Pat<(type (not_op (and VR128:$x, (and VR128:$y, VR128:$z)))), + (VEVAL VR128:$x, VR128:$y, VR128:$z, 254)>; + } } defm : BitwiseVectorOps<v16i8, z_vnot>; @@ -956,12 +1267,30 @@ defm : IntegerMinMaxVectorOps<v8i16, z_vicmph, VMNH, VMXH>; defm : IntegerMinMaxVectorOps<v4i32, z_vicmph, VMNF, VMXF>; defm : IntegerMinMaxVectorOps<v2i64, z_vicmph, VMNG, VMXG>; +let Predicates = [FeatureVectorEnhancements3] in { + def : Pat<(i128 (or (and VR128:$x, (z_vicmph VR128:$x, VR128:$y)), + (and VR128:$y, (not (z_vicmph VR128:$x, VR128:$y))))), + (VMXQ VR128:$x, VR128:$y)>; + def : Pat<(i128 (or (and VR128:$y, (z_vicmph VR128:$x, VR128:$y)), + (and VR128:$x, (not (z_vicmph VR128:$x, VR128:$y))))), + (VMNQ VR128:$x, VR128:$y)>; +} + // Unsigned min/max. 
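+// (Since i128 is a scalar type there is no vselect form here; the Q
+// patterns below instead match the expanded mask-and-merge idiom
+// (x & m) | (y & ~m), with m the comparison result, directly.)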
defm : IntegerMinMaxVectorOps<v16i8, z_vicmphl, VMNLB, VMXLB>; defm : IntegerMinMaxVectorOps<v8i16, z_vicmphl, VMNLH, VMXLH>; defm : IntegerMinMaxVectorOps<v4i32, z_vicmphl, VMNLF, VMXLF>; defm : IntegerMinMaxVectorOps<v2i64, z_vicmphl, VMNLG, VMXLG>; +let Predicates = [FeatureVectorEnhancements3] in { + def : Pat<(i128 (or (and VR128:$x, (z_vicmphl VR128:$x, VR128:$y)), + (and VR128:$y, (not (z_vicmphl VR128:$x, VR128:$y))))), + (VMXLQ VR128:$x, VR128:$y)>; + def : Pat<(i128 (or (and VR128:$y, (z_vicmphl VR128:$x, VR128:$y)), + (and VR128:$x, (not (z_vicmphl VR128:$x, VR128:$y))))), + (VMNLQ VR128:$x, VR128:$y)>; +} + // Instantiate full-vector shifts. multiclass FullVectorShiftOps<SDPatternOperator shift, Instruction sbit, Instruction sbyte> { @@ -994,6 +1323,8 @@ let Predicates = [FeatureVector] in { def VECH : CompareVRRa<"vech", 0xE7DB, null_frag, v128h, 1>; def VECF : CompareVRRa<"vecf", 0xE7DB, null_frag, v128f, 2>; def VECG : CompareVRRa<"vecg", 0xE7DB, null_frag, v128g, 3>; + let Predicates = [FeatureVectorEnhancements3] in + def VECQ : CompareVRRa<"vecq", 0xE7DB, z_scmp, v128q, 4>; } // Element compare logical. @@ -1003,6 +1334,8 @@ let Predicates = [FeatureVector] in { def VECLH : CompareVRRa<"veclh", 0xE7D9, null_frag, v128h, 1>; def VECLF : CompareVRRa<"veclf", 0xE7D9, null_frag, v128f, 2>; def VECLG : CompareVRRa<"veclg", 0xE7D9, null_frag, v128g, 3>; + let Predicates = [FeatureVectorEnhancements3] in + def VECLQ : CompareVRRa<"veclq", 0xE7D9, z_ucmp, v128q, 4>; } // Compare equal. @@ -1015,6 +1348,9 @@ let Predicates = [FeatureVector] in { v128f, v128f, 2>; defm VCEQG : BinaryVRRbSPair<"vceqg", 0xE7F8, z_vicmpe, z_vicmpes, v128g, v128g, 3>; + let Predicates = [FeatureVectorEnhancements3] in + defm VCEQQ : BinaryVRRbSPair<"vceqq", 0xE7F8, z_vicmpe, z_vicmpes, + v128q, v128q, 4>; // Compare high. def VCH : BinaryVRRbSPairGeneric<"vch", 0xE7FB>; @@ -1026,6 +1362,9 @@ let Predicates = [FeatureVector] in { v128f, v128f, 2>; defm VCHG : BinaryVRRbSPair<"vchg", 0xE7FB, z_vicmph, z_vicmphs, v128g, v128g, 3>; + let Predicates = [FeatureVectorEnhancements3] in + defm VCHQ : BinaryVRRbSPair<"vchq", 0xE7FB, z_vicmph, z_vicmphs, + v128q, v128q, 4>; // Compare high logical. def VCHL : BinaryVRRbSPairGeneric<"vchl", 0xE7F9>; @@ -1037,6 +1376,9 @@ let Predicates = [FeatureVector] in { v128f, v128f, 2>; defm VCHLG : BinaryVRRbSPair<"vchlg", 0xE7F9, z_vicmphl, z_vicmphls, v128g, v128g, 3>; + let Predicates = [FeatureVectorEnhancements3] in + defm VCHLQ : BinaryVRRbSPair<"vchlq", 0xE7F9, z_vicmphl, z_vicmphls, + v128q, v128q, 4>; // Test under mask. let Defs = [CC] in @@ -1631,6 +1973,14 @@ let Predicates = [FeatureVector] in { (VLEG (VGBM 0), bdxaddr12only:$addr, 1)>; } +// In-register i128 sign-extensions on arch15. +let Predicates = [FeatureVectorEnhancements3] in { + def : Pat<(i128 (sext_inreg VR128:$x, i8)), (VUPLG (VSEGB VR128:$x))>; + def : Pat<(i128 (sext_inreg VR128:$x, i16)), (VUPLG (VSEGH VR128:$x))>; + def : Pat<(i128 (sext_inreg VR128:$x, i32)), (VUPLG (VSEGF VR128:$x))>; + def : Pat<(i128 (sext_inreg VR128:$x, i64)), (VUPLG VR128:$x)>; +} + // In-register i128 sign-extensions. let Predicates = [FeatureVector] in { def : Pat<(i128 (sext_inreg VR128:$x, i8)), @@ -1643,6 +1993,20 @@ let Predicates = [FeatureVector] in { (VSRAB (VREPG VR128:$x, 1), (VREPIB 64))>; } +// Sign-extensions from GPR to i128 on arch15. 
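+// The value is first sign-extended to 64 bits in a GPR (LGBR/LGHR/LGFR as
+// needed), copied into both halves of a vector register with VLVGP, and the
+// low half then widened with VUPLG -- e.g. (i128 (sext GR64:$x)) becomes
+// "vlvgp %v0, %r2, %r2; vuplg %v0, %v0" (illustrative registers).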
+let Predicates = [FeatureVectorEnhancements3] in { + def : Pat<(i128 (sext_inreg (anyext GR32:$x), i8)), + (VUPLG (VLVGP (LGBR (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$x, subreg_l32)), + (LGBR (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$x, subreg_l32))))>; + def : Pat<(i128 (sext_inreg (anyext GR32:$x), i16)), + (VUPLG (VLVGP (LGHR (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$x, subreg_l32)), + (LGHR (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$x, subreg_l32))))>; + def : Pat<(i128 (sext GR32:$x)), + (VUPLG (VLVGP (LGFR GR32:$x), (LGFR GR32:$x)))>; + def : Pat<(i128 (sext GR64:$x)), + (VUPLG (VLVGP GR64:$x, GR64:$x))>; +} + // Sign-extensions from GPR to i128. let Predicates = [FeatureVector] in { def : Pat<(i128 (sext_inreg (anyext GR32:$x), i8)), @@ -2025,3 +2389,14 @@ let Predicates = [FeatureVectorPackedDecimalEnhancement2] in { def VUPKZL : BinaryVRRk<"vupkzl", 0xE65C>; } } + +let Predicates = [FeatureVectorPackedDecimalEnhancement3] in { + def VCVBQ : BinaryVRRk<"vcvbq", 0xE64E>; + let Defs = [CC] in + def VCVDQ : TernaryVRIj<"vcvdq", 0xE64A>; + + let Defs = [CC] in { + def VTPOpt : TestExtraVRRg<"vtp", 0xE65F>; + def VTZ : TestExtraVRIl<"vtz", 0xE67F>; + } +} diff --git llvm/lib/Target/SystemZ/SystemZOperands.td llvm/lib/Target/SystemZ/SystemZOperands.td index e7b45a40a3cc..22dcc4a6d7cd 100644 --- llvm/lib/Target/SystemZ/SystemZOperands.td +++ llvm/lib/Target/SystemZ/SystemZOperands.td @@ -165,6 +165,13 @@ class BDVMode<string bitsize, string dispsize> !cast<Operand>("disp"#dispsize#"imm"#bitsize), !cast<RegisterOperand>("VR128"))>; +// An addressing mode with a base, 32-bit displacement and 32-bit index. +class LXAMode<string bitsize, string dispsize> + : AddressOperand<bitsize, dispsize, "", "LXAAddr", + (ops !cast<RegisterOperand>("ADDR"#bitsize), + !cast<Operand>("disp"#dispsize#"imm32"), + !cast<RegisterOperand>("ADDR32"))>; + //===----------------------------------------------------------------------===// // Extracting immediate operands from nodes // These all create MVT::i64 nodes to ensure the value is not sign-extended @@ -601,18 +608,20 @@ def pcrel32 : PCRelAddress<i64, "pcrel32", PCRel32> { // Addressing modes //===----------------------------------------------------------------------===// +class DispOp<ValueType vt, code pred> : Operand<vt>, PatLeaf<(vt imm), pred>; + // 12-bit displacement operands. let EncoderMethod = "getImmOpValue<SystemZ::FK_390_U12Imm>", DecoderMethod = "decodeU12ImmOperand" in { - def disp12imm32 : Operand<i32>; - def disp12imm64 : Operand<i64>; + def disp12imm32 : DispOp<i32, [{ return N->getAPIntValue().isIntN(12); }]>; + def disp12imm64 : DispOp<i64, [{ return N->getAPIntValue().isIntN(12); }]>; } // 20-bit displacement operands. let EncoderMethod = "getImmOpValue<SystemZ::FK_390_S20Imm>", DecoderMethod = "decodeS20ImmOperand" in { - def disp20imm32 : Operand<i32>; - def disp20imm64 : Operand<i64>; + def disp20imm32 : DispOp<i32, [{ return N->getAPIntValue().isSignedIntN(20); }]>; + def disp20imm64 : DispOp<i64, [{ return N->getAPIntValue().isSignedIntN(20); }]>; } def BDAddr32Disp12 : AddressAsmOperand<"BDAddr", "32", "12">; @@ -625,6 +634,7 @@ def BDLAddr64Disp12Len4 : AddressAsmOperand<"BDLAddr", "64", "12", "Len4">; def BDLAddr64Disp12Len8 : AddressAsmOperand<"BDLAddr", "64", "12", "Len8">; def BDRAddr64Disp12 : AddressAsmOperand<"BDRAddr", "64", "12">; def BDVAddr64Disp12 : AddressAsmOperand<"BDVAddr", "64", "12">; +def LXAAddr64Disp20 : AddressAsmOperand<"LXAAddr", "64", "20">; // DAG patterns and operands for addressing modes. 
Each mode has // the form <type><range><group>[<len>] where: @@ -635,6 +645,7 @@ def BDVAddr64Disp12 : AddressAsmOperand<"BDVAddr", "64", "12">; // mviaddr : like bdaddr, but reject cases with a natural index // bdxaddr : base + displacement + index // laaddr : like bdxaddr, but used for Load Address operations +// lxaaddr : like bdxaddr, but used for Load (Logical) Indexed Address // dynalloc : base + displacement + index + ADJDYNALLOC // bdladdr : base + displacement with a length field // bdvaddr : base + displacement with a vector index @@ -669,6 +680,7 @@ def bdxaddr20pair : BDXMode<"BDXAddr", "64", "20", "Pair">; def dynalloc12only : BDXMode<"DynAlloc", "64", "12", "Only">; def laaddr12pair : BDXMode<"LAAddr", "64", "12", "Pair">; def laaddr20pair : BDXMode<"LAAddr", "64", "20", "Pair">; +def lxaaddr20only : LXAMode< "64", "20">; def bdladdr12onlylen4 : BDLMode<"BDLAddr", "64", "12", "Only", "4">; def bdladdr12onlylen8 : BDLMode<"BDLAddr", "64", "12", "Only", "8">; def bdraddr12only : BDRMode<"BDRAddr", "64", "12", "Only">; diff --git llvm/lib/Target/SystemZ/SystemZOperators.td llvm/lib/Target/SystemZ/SystemZOperators.td index 15b334b042d2..39670adaa257 100644 --- llvm/lib/Target/SystemZ/SystemZOperators.td +++ llvm/lib/Target/SystemZ/SystemZOperators.td @@ -154,6 +154,8 @@ def SDT_ZExtractVectorElt : SDTypeProfile<1, 2, SDTCisVT<2, i32>]>; def SDT_ZReplicate : SDTypeProfile<1, 1, [SDTCisVec<0>]>; +def SDT_ZVecUnpack : SDTypeProfile<1, 1, + [SDTCisVec<1>]>; def SDT_ZVecUnaryConv : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>; @@ -164,6 +166,13 @@ def SDT_ZVecUnaryCC : SDTypeProfile<2, 1, [SDTCisVec<0>, SDTCisVT<1, i32>, SDTCisSameAs<0, 2>]>; +def SDT_ZVecCompare : SDTypeProfile<1, 2, + [SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>]>; +def SDT_ZVecCompareCC : SDTypeProfile<2, 2, + [SDTCisVT<1, i32>, + SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 2>]>; def SDT_ZVecBinary : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, @@ -345,10 +354,10 @@ def z_permute : SDNode<"SystemZISD::PERMUTE", SDT_ZVecTernary>; def z_pack : SDNode<"SystemZISD::PACK", SDT_ZVecBinaryConv>; def z_packs_cc : SDNode<"SystemZISD::PACKS_CC", SDT_ZVecBinaryConvCC>; def z_packls_cc : SDNode<"SystemZISD::PACKLS_CC", SDT_ZVecBinaryConvCC>; -def z_unpack_high : SDNode<"SystemZISD::UNPACK_HIGH", SDT_ZVecUnaryConv>; -def z_unpackl_high : SDNode<"SystemZISD::UNPACKL_HIGH", SDT_ZVecUnaryConv>; -def z_unpack_low : SDNode<"SystemZISD::UNPACK_LOW", SDT_ZVecUnaryConv>; -def z_unpackl_low : SDNode<"SystemZISD::UNPACKL_LOW", SDT_ZVecUnaryConv>; +def z_unpack_high : SDNode<"SystemZISD::UNPACK_HIGH", SDT_ZVecUnpack>; +def z_unpackl_high : SDNode<"SystemZISD::UNPACKL_HIGH", SDT_ZVecUnpack>; +def z_unpack_low : SDNode<"SystemZISD::UNPACK_LOW", SDT_ZVecUnpack>; +def z_unpackl_low : SDNode<"SystemZISD::UNPACKL_LOW", SDT_ZVecUnpack>; def z_vshl_by_scalar : SDNode<"SystemZISD::VSHL_BY_SCALAR", SDT_ZVecBinaryInt>; def z_vsrl_by_scalar : SDNode<"SystemZISD::VSRL_BY_SCALAR", @@ -358,12 +367,12 @@ def z_vsra_by_scalar : SDNode<"SystemZISD::VSRA_BY_SCALAR", def z_vrotl_by_scalar : SDNode<"SystemZISD::VROTL_BY_SCALAR", SDT_ZVecBinaryInt>; def z_vsum : SDNode<"SystemZISD::VSUM", SDT_ZBinaryConv>; -def z_vicmpe : SDNode<"SystemZISD::VICMPE", SDT_ZVecBinary>; -def z_vicmph : SDNode<"SystemZISD::VICMPH", SDT_ZVecBinary>; -def z_vicmphl : SDNode<"SystemZISD::VICMPHL", SDT_ZVecBinary>; -def z_vicmpes : SDNode<"SystemZISD::VICMPES", SDT_ZVecBinaryCC>; -def z_vicmphs : SDNode<"SystemZISD::VICMPHS", SDT_ZVecBinaryCC>; -def z_vicmphls : 
SDNode<"SystemZISD::VICMPHLS", SDT_ZVecBinaryCC>; +def z_vicmpe : SDNode<"SystemZISD::VICMPE", SDT_ZVecCompare>; +def z_vicmph : SDNode<"SystemZISD::VICMPH", SDT_ZVecCompare>; +def z_vicmphl : SDNode<"SystemZISD::VICMPHL", SDT_ZVecCompare>; +def z_vicmpes : SDNode<"SystemZISD::VICMPES", SDT_ZVecCompareCC>; +def z_vicmphs : SDNode<"SystemZISD::VICMPHS", SDT_ZVecCompareCC>; +def z_vicmphls : SDNode<"SystemZISD::VICMPHLS", SDT_ZVecCompareCC>; def z_vfcmpe : SDNode<"SystemZISD::VFCMPE", SDT_ZVecBinaryConv>; def z_strict_vfcmpe : SDNode<"SystemZISD::STRICT_VFCMPE", SDT_ZVecBinaryConv, [SDNPHasChain]>; @@ -535,6 +544,12 @@ def z_ucmp : PatFrag<(ops node:$a, node:$b), (z_icmp node:$a, node:$b, timm), [{ def z_tm_reg : PatFrag<(ops node:$a, node:$b), (z_tm node:$a, node:$b, timm)>; def z_tm_mem : PatFrag<(ops node:$a, node:$b), (z_tm node:$a, node:$b, 0)>; +// Shifts by small immediate amounts. +def shl1 : PatFrag<(ops node:$src), (shl node:$src, (i32 1))>; +def shl2 : PatFrag<(ops node:$src), (shl node:$src, (i32 2))>; +def shl3 : PatFrag<(ops node:$src), (shl node:$src, (i32 3))>; +def shl4 : PatFrag<(ops node:$src), (shl node:$src, (i32 4))>; + // Register sign-extend operations. Sub-32-bit values are represented as i32s. def sext8 : PatFrag<(ops node:$src), (sext_inreg node:$src, i8)>; def sext16 : PatFrag<(ops node:$src), (sext_inreg node:$src, i16)>; @@ -550,6 +565,15 @@ def zext8 : PatFrag<(ops node:$src), (and node:$src, 0xff)>; def zext16 : PatFrag<(ops node:$src), (and node:$src, 0xffff)>; def zext32 : PatFrag<(ops node:$src), (zext (i32 node:$src))>; +// Match a 64-bit value that is guaranteed to have been sign- +// or zero-extended from a 32-bit value. +def assertsext32 : PatFrag<(ops node:$src), (assertsext node:$src), [{ + return cast<VTSDNode>(N->getOperand(1))->getVT() == MVT::i32; +}]>; +def assertzext32 : PatFrag<(ops node:$src), (assertzext node:$src), [{ + return cast<VTSDNode>(N->getOperand(1))->getVT() == MVT::i32; +}]>; + // Match a load or a non-extending atomic load. 
def z_load : PatFrags<(ops node:$ptr), [(load node:$ptr), diff --git llvm/lib/Target/SystemZ/SystemZProcessors.td llvm/lib/Target/SystemZ/SystemZProcessors.td index d00b94d00242..75b6671dc772 100644 --- llvm/lib/Target/SystemZ/SystemZProcessors.td +++ llvm/lib/Target/SystemZ/SystemZProcessors.td @@ -41,3 +41,4 @@ def : ProcessorModel<"z15", Z15Model, Arch13SupportedFeatures.List>; def : ProcessorModel<"arch14", Z16Model, Arch14SupportedFeatures.List>; def : ProcessorModel<"z16", Z16Model, Arch14SupportedFeatures.List>; +def : ProcessorModel<"arch15", Z16Model, Arch15SupportedFeatures.List>; diff --git llvm/lib/Target/SystemZ/SystemZSubtarget.cpp llvm/lib/Target/SystemZ/SystemZSubtarget.cpp index e4e84460399d..6c376e4bf622 100644 --- llvm/lib/Target/SystemZ/SystemZSubtarget.cpp +++ llvm/lib/Target/SystemZ/SystemZSubtarget.cpp @@ -44,9 +44,11 @@ SystemZSubtarget &SystemZSubtarget::initializeSubtargetDependencies( if (!HasVector) { HasVectorEnhancements1 = false; HasVectorEnhancements2 = false; + HasVectorEnhancements3 = false; HasVectorPackedDecimal = false; HasVectorPackedDecimalEnhancement = false; HasVectorPackedDecimalEnhancement2 = false; + HasVectorPackedDecimalEnhancement3 = false; } return *this; diff --git llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp index 772efcdf8f9f..2b9483293941 100644 --- llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -648,12 +648,16 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost( return VF * DivMulSeqCost + BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind); } - if ((SignedDivRem || UnsignedDivRem) && VF > 4) - // Temporary hack: disable high vectorization factors with integer - // division/remainder, which will get scalarized and handled with - // GR128 registers. The mischeduler is not clever enough to avoid - // spilling yet. - return 1000; + if (SignedDivRem || UnsignedDivRem) { + if (ST->hasVectorEnhancements3() && ScalarBits >= 32) + return NumVectors * DivInstrCost; + else if (VF > 4) + // Temporary hack: disable high vectorization factors with integer + // division/remainder, which will get scalarized and handled with + // GR128 registers. The mischeduler is not clever enough to avoid + // spilling yet. + return 1000; + } // These FP operations are supported with a single vector instruction for // double (base implementation assumes float generally costs 2). For @@ -900,8 +904,11 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt)) { if (Src->isIntegerTy(1)) { - if (DstScalarBits == 128) + if (DstScalarBits == 128) { + if (Opcode == Instruction::SExt && ST->hasVectorEnhancements3()) + return 0;/*VCEQQ*/ return 5 /*branch seq.*/; + } if (ST->hasLoadStoreOnCond2()) return 2; // li 0; loc 1 @@ -1089,9 +1096,18 @@ InstructionCost SystemZTTIImpl::getCmpSelInstrCost( return Cost; } case Instruction::Select: - if (ValTy->isFloatingPointTy() || isInt128InVR(ValTy)) - return 4; // No LOC for FP / i128 - costs a conditional jump. - return 1; // Load On Condition / Select Register. + if (ValTy->isFloatingPointTy()) + return 4; // No LOC for FP - costs a conditional jump. + + // When selecting based on an i128 comparison, LOC / VSEL is possible + // if i128 comparisons are directly supported. 
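+      // (With vector enhancements 3 the i128 compare itself is legal, so
+      // the select then costs a single instruction; otherwise the compare
+      // must be expanded and the select branches, hence cost 4.)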
+ if (I != nullptr) + if (ICmpInst *CI = dyn_cast<ICmpInst>(I->getOperand(0))) + if (CI->getOperand(0)->getType()->isIntegerTy(128)) + return ST->hasVectorEnhancements3() ? 1 : 4; + + // Load On Condition / Select Register available, except for i128. + return !isInt128InVR(ValTy) ? 1 : 4; } } else if (ST->hasVector()) { diff --git llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp index f06a2ab71f24..2e1ab8d599ee 100644 --- llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp +++ llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp @@ -132,7 +132,7 @@ public: } bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, - const MCValue &Target, + const MCValue &Target, const uint64_t, const MCSubtargetInfo *STI) override { switch ((VE::Fixups)Fixup.getKind()) { default: diff --git llvm/lib/Target/VE/VEISelLowering.cpp llvm/lib/Target/VE/VEISelLowering.cpp index 87c1625c1145..aff058868f30 100644 --- llvm/lib/Target/VE/VEISelLowering.cpp +++ llvm/lib/Target/VE/VEISelLowering.cpp @@ -65,7 +65,8 @@ CCAssignFn *getParamCC(CallingConv::ID CallConv, bool IsVarArg) { bool VETargetLowering::CanLowerReturn( CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg, - const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { + const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context, + const Type *RetTy) const { CCAssignFn *RetCC = getReturnCC(CallConv); SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context); diff --git llvm/lib/Target/VE/VEISelLowering.h llvm/lib/Target/VE/VEISelLowering.h index 8b9412d78662..04274b14baa1 100644 --- llvm/lib/Target/VE/VEISelLowering.h +++ llvm/lib/Target/VE/VEISelLowering.h @@ -191,7 +191,8 @@ public: bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &ArgsFlags, - LLVMContext &Context) const override; + LLVMContext &Context, + const Type *RetTy) const override; SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, const SDLoc &dl, diff --git llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 084aed6eed46..02db1b142a22 100644 --- llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -1429,7 +1429,8 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI, bool WebAssemblyTargetLowering::CanLowerReturn( CallingConv::ID /*CallConv*/, MachineFunction & /*MF*/, bool /*IsVarArg*/, const SmallVectorImpl<ISD::OutputArg> &Outs, - LLVMContext & /*Context*/) const { + LLVMContext & /*Context*/, + const Type *RetTy) const { // WebAssembly can only handle returning tuples with multivalue enabled return WebAssembly::canLowerReturn(Outs.size(), Subtarget); } diff --git llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h index 454432728ca8..d9ced1a1a527 100644 --- llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h +++ llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h @@ -87,7 +87,8 @@ private: bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, - LLVMContext &Context) const override; + LLVMContext &Context, + const Type *RetTy) const override; SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 
const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, const SDLoc &dl, diff --git llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 144a0c99fdf4..e234d320b2a1 100644 --- llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -172,7 +172,7 @@ public: const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override; bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, - const MCValue &Target, + const MCValue &Target, const uint64_t Value, const MCSubtargetInfo *STI) override; void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, @@ -659,6 +659,7 @@ const MCFixupKindInfo &X86AsmBackend::getFixupKindInfo(MCFixupKind Kind) const { bool X86AsmBackend::shouldForceRelocation(const MCAssembler &, const MCFixup &Fixup, const MCValue &, + const uint64_t, const MCSubtargetInfo *STI) { return Fixup.getKind() >= FirstLiteralRelocationKind; } diff --git llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h index e166b68668d9..0e0e13e896ae 100644 --- llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h +++ llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h @@ -14,6 +14,7 @@ #define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCTARGETDESC_H #include "llvm/ADT/SmallVector.h" +#include <cstdint> #include <memory> #include <string> diff --git llvm/lib/Target/X86/X86ArgumentStackSlotRebase.cpp llvm/lib/Target/X86/X86ArgumentStackSlotRebase.cpp index bf9b8e573059..89a2146227bd 100644 --- llvm/lib/Target/X86/X86ArgumentStackSlotRebase.cpp +++ llvm/lib/Target/X86/X86ArgumentStackSlotRebase.cpp @@ -128,7 +128,7 @@ bool X86ArgumentStackSlotPass::runOnMachineFunction(MachineFunction &MF) { if (!MO.isReg()) continue; Register Reg = MO.getReg(); - if (!Register::isPhysicalRegister(Reg)) + if (!Reg.isPhysical()) continue; if (TRI->isSuperOrSubRegisterEq(BasePtr, Reg)) return true; diff --git llvm/lib/Target/X86/X86FixupVectorConstants.cpp llvm/lib/Target/X86/X86FixupVectorConstants.cpp index 7390cc580545..453898e132ca 100644 --- llvm/lib/Target/X86/X86FixupVectorConstants.cpp +++ llvm/lib/Target/X86/X86FixupVectorConstants.cpp @@ -649,41 +649,25 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, } } - auto ConvertToBroadcastAVX512 = [&](unsigned OpSrc32, unsigned OpSrc64) { - unsigned OpBcst32 = 0, OpBcst64 = 0; - unsigned OpNoBcst32 = 0, OpNoBcst64 = 0; - if (OpSrc32) { + auto ConvertToBroadcast = [&](unsigned OpSrc, int BW) { + if (OpSrc) { if (const X86FoldTableEntry *Mem2Bcst = - llvm::lookupBroadcastFoldTableBySize(OpSrc32, 32)) { - OpBcst32 = Mem2Bcst->DstOp; - OpNoBcst32 = Mem2Bcst->Flags & TB_INDEX_MASK; + llvm::lookupBroadcastFoldTableBySize(OpSrc, BW)) { + unsigned OpBcst = Mem2Bcst->DstOp; + unsigned OpNoBcst = Mem2Bcst->Flags & TB_INDEX_MASK; + FixupEntry Fixups[] = {{(int)OpBcst, 1, BW, rebuildSplatCst}}; + // TODO: Add support for RegBitWidth, but currently rebuildSplatCst + // doesn't require it (defaults to Constant::getPrimitiveSizeInBits). + return FixupConstant(Fixups, 0, OpNoBcst); } } - if (OpSrc64) { - if (const X86FoldTableEntry *Mem2Bcst = - llvm::lookupBroadcastFoldTableBySize(OpSrc64, 64)) { - OpBcst64 = Mem2Bcst->DstOp; - OpNoBcst64 = Mem2Bcst->Flags & TB_INDEX_MASK; - } - } - assert(((OpBcst32 == 0) || (OpBcst64 == 0) || (OpNoBcst32 == OpNoBcst64)) && - "OperandNo mismatch"); - - if (OpBcst32 || OpBcst64) { - unsigned OpNo = OpBcst32 == 0 ? 
OpNoBcst64 : OpNoBcst32; - FixupEntry Fixups[] = {{(int)OpBcst32, 32, 32, rebuildSplatCst}, - {(int)OpBcst64, 64, 64, rebuildSplatCst}}; - // TODO: Add support for RegBitWidth, but currently rebuildSplatCst - // doesn't require it (defaults to Constant::getPrimitiveSizeInBits). - return FixupConstant(Fixups, 0, OpNo); - } return false; }; // Attempt to find a AVX512 mapping from a full width memory-fold instruction // to a broadcast-fold instruction variant. if ((MI.getDesc().TSFlags & X86II::EncodingMask) == X86II::EVEX) - return ConvertToBroadcastAVX512(Opc, Opc); + return ConvertToBroadcast(Opc, 32) || ConvertToBroadcast(Opc, 64); // Reverse the X86InstrInfo::setExecutionDomainCustom EVEX->VEX logic // conversion to see if we can convert to a broadcasted (integer) logic op. @@ -740,7 +724,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, break; } if (OpSrc32 || OpSrc64) - return ConvertToBroadcastAVX512(OpSrc32, OpSrc64); + return ConvertToBroadcast(OpSrc32, 32) || ConvertToBroadcast(OpSrc64, 64); } return false; diff --git llvm/lib/Target/X86/X86FrameLowering.cpp llvm/lib/Target/X86/X86FrameLowering.cpp index 4d40c23eb561..f7398ac7aa13 100644 --- llvm/lib/Target/X86/X86FrameLowering.cpp +++ llvm/lib/Target/X86/X86FrameLowering.cpp @@ -174,7 +174,7 @@ static unsigned getPOP2Opcode(const X86Subtarget &ST) { static bool isEAXLiveIn(MachineBasicBlock &MBB) { for (MachineBasicBlock::RegisterMaskPair RegMask : MBB.liveins()) { - unsigned Reg = RegMask.PhysReg; + MCRegister Reg = RegMask.PhysReg; if (Reg == X86::RAX || Reg == X86::EAX || Reg == X86::AX || Reg == X86::AH || Reg == X86::AL) diff --git llvm/lib/Target/X86/X86ISelLowering.cpp llvm/lib/Target/X86/X86ISelLowering.cpp index 84736f18011a..a956074e50d8 100644 --- llvm/lib/Target/X86/X86ISelLowering.cpp +++ llvm/lib/Target/X86/X86ISelLowering.cpp @@ -2812,7 +2812,16 @@ bool X86::mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT, } bool X86::mayFoldIntoStore(SDValue Op) { - return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->user_begin()); + if (!Op.hasOneUse()) + return false; + // Peek through (oneuse) bitcast users + SDNode *User = *Op->user_begin(); + while (User->getOpcode() == ISD::BITCAST) { + if (!User->hasOneUse()) + return false; + User = *User->user_begin(); + } + return ISD::isNormalStore(User); } bool X86::mayFoldIntoZeroExtend(SDValue Op) { @@ -4079,7 +4088,7 @@ static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, isNullConstant(Vec.getOperand(2))) return DAG.getUNDEF(ResultVT); - SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl); + SDValue VecIdx = DAG.getVectorIdxConstant(IdxVal, dl); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx); } @@ -4123,7 +4132,7 @@ static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal, // we want. Since ElemsPerChunk is a power of 2 just need to clear bits. IdxVal &= ~(ElemsPerChunk - 1); - SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl); + SDValue VecIdx = DAG.getVectorIdxConstant(IdxVal, dl); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx); } @@ -4161,7 +4170,7 @@ static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements, SDValue Res = ZeroNewElements ? 
getZeroVector(VT, Subtarget, DAG, dl) : DAG.getUNDEF(VT); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec, - DAG.getIntPtrConstant(0, dl)); + DAG.getVectorIdxConstant(0, dl)); } /// Widen a vector to a larger size with the same scalar type, with the new @@ -4492,7 +4501,7 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, MVT OpVT = Op.getSimpleValueType(); unsigned NumElems = OpVT.getVectorNumElements(); - SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl); + SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl); // Extend to natively supported kshift. MVT WideOpVT = widenMaskVectorType(OpVT, Subtarget); @@ -6741,7 +6750,7 @@ static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL, } } V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, V, Op.getOperand(i), - DAG.getIntPtrConstant(i, DL)); + DAG.getVectorIdxConstant(i, DL)); } return V; @@ -6823,7 +6832,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL, } Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt); V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16, V, Elt, - DAG.getIntPtrConstant(i / 2, DL)); + DAG.getVectorIdxConstant(i / 2, DL)); } return DAG.getBitcast(MVT::v16i8, V); @@ -6964,8 +6973,9 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL, unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask; assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); - SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, - DAG.getIntPtrConstant(InsertPSMask, DL, true)); + SDValue Result = + DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, + DAG.getTargetConstant(InsertPSMask, DL, MVT::i8)); return DAG.getBitcast(VT, Result); } @@ -7285,7 +7295,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, DAG, Subtarget, IsAfterLegalize); if (HalfLD) return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), - HalfLD, DAG.getIntPtrConstant(0, DL)); + HalfLD, DAG.getVectorIdxConstant(0, DL)); } } @@ -7820,7 +7830,7 @@ static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL, for (unsigned Idx : InsertIndices) NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx), - DAG.getIntPtrConstant(Idx, DL)); + DAG.getVectorIdxConstant(Idx, DL)); return NV; } @@ -7899,7 +7909,7 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl, MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1; Select = DAG.getBitcast(VecVT, Select); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select, - DAG.getIntPtrConstant(0, dl)); + DAG.getVectorIdxConstant(0, dl)); } } @@ -7918,7 +7928,7 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl, MVT VecVT = VT.getSizeInBits() >= 8 ? 
VT : MVT::v8i1; DstVec = DAG.getBitcast(VecVT, Imm); DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec, - DAG.getIntPtrConstant(0, dl)); + DAG.getVectorIdxConstant(0, dl)); } } else DstVec = DAG.getUNDEF(VT); @@ -7926,7 +7936,7 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl, for (unsigned InsertIdx : NonConstIdx) { DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, Op.getOperand(InsertIdx), - DAG.getIntPtrConstant(InsertIdx, dl)); + DAG.getVectorIdxConstant(InsertIdx, dl)); } return DstVec; } @@ -9384,7 +9394,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { for (unsigned i = 1; i < NumElems; ++i) { if (Op.getOperand(i).isUndef()) continue; Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result, - Op.getOperand(i), DAG.getIntPtrConstant(i, dl)); + Op.getOperand(i), DAG.getVectorIdxConstant(i, dl)); } return Result; } @@ -9477,9 +9487,8 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, if ((NonZeros & (1 << i)) == 0) continue; - Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, - Op.getOperand(i), - DAG.getIntPtrConstant(i * NumSubElems, dl)); + Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(i), + DAG.getVectorIdxConstant(i * NumSubElems, dl)); } return Vec; @@ -9527,7 +9536,7 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, Op, DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8)); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op, - DAG.getIntPtrConstant(0, dl)); + DAG.getVectorIdxConstant(0, dl)); } // If there are zero or one non-zeros we can handle this very simply. @@ -9539,16 +9548,16 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, SDValue SubVec = Op.getOperand(Idx); unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements(); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec, - DAG.getIntPtrConstant(Idx * SubVecNumElts, dl)); + DAG.getVectorIdxConstant(Idx * SubVecNumElts, dl)); } if (NumOperands > 2) { MVT HalfVT = ResVT.getHalfNumVectorElementsVT(); ArrayRef<SDUse> Ops = Op->ops(); SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, - Ops.slice(0, NumOperands/2)); + Ops.slice(0, NumOperands / 2)); SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, - Ops.slice(NumOperands/2)); + Ops.slice(NumOperands / 2)); return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi); } @@ -9557,11 +9566,11 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, if (ResVT.getVectorNumElements() >= 16) return Op; // The operation is legal with KUNPCK - SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, - DAG.getUNDEF(ResVT), Op.getOperand(0), - DAG.getIntPtrConstant(0, dl)); + SDValue Vec = + DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, DAG.getUNDEF(ResVT), + Op.getOperand(0), DAG.getVectorIdxConstant(0, dl)); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1), - DAG.getIntPtrConstant(NumElems/2, dl)); + DAG.getVectorIdxConstant(NumElems / 2, dl)); } static SDValue LowerCONCAT_VECTORS(SDValue Op, @@ -12726,7 +12735,7 @@ static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0, NewMask); // This is free: ymm -> xmm. return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf, - DAG.getIntPtrConstant(0, DL)); + DAG.getVectorIdxConstant(0, DL)); } /// Try to lower broadcast of a single element. 
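/// On AVX2+ subtargets this generally selects a VBROADCAST/VPBROADCAST of the
/// splatted element (ideally with the source folded from memory); older
/// subtargets fall back to splat shuffle sequences.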
@@ -15256,10 +15265,10 @@ static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1, if (WidenedMask[0] == 0 && IsHighZero) { MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2); SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, - DAG.getIntPtrConstant(0, DL)); + DAG.getVectorIdxConstant(0, DL)); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, getZeroVector(VT, Subtarget, DAG, DL), LoV, - DAG.getIntPtrConstant(0, DL)); + DAG.getVectorIdxConstant(0, DL)); } // TODO: If minimizing size and one of the inputs is a zero vector and the @@ -15283,11 +15292,11 @@ static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1, // this will likely become vinsertf128 which can't fold a 256-bit memop. if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) { MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2); - SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, - OnlyUsesV1 ? V1 : V2, - DAG.getIntPtrConstant(0, DL)); + SDValue SubVec = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2, + DAG.getVectorIdxConstant(0, DL)); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec, - DAG.getIntPtrConstant(2, DL)); + DAG.getVectorIdxConstant(2, DL)); } } @@ -15580,7 +15589,7 @@ static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2, SDValue V = (HalfIdx < 2 ? V1 : V2); HalfIdx = (HalfIdx % 2) * HalfNumElts; return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V, - DAG.getIntPtrConstant(HalfIdx, DL)); + DAG.getVectorIdxConstant(HalfIdx, DL)); }; // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset @@ -15597,7 +15606,7 @@ static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2, unsigned Offset = UndefLower ? HalfNumElts : 0; return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, - DAG.getIntPtrConstant(Offset, DL)); + DAG.getVectorIdxConstant(Offset, DL)); } /// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF. @@ -15624,9 +15633,9 @@ static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1, if (!UndefLower && isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) { SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, - DAG.getIntPtrConstant(HalfNumElts, DL)); + DAG.getVectorIdxConstant(HalfNumElts, DL)); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi, - DAG.getIntPtrConstant(0, DL)); + DAG.getVectorIdxConstant(0, DL)); } // Lower half is undef and upper half is whole lower subvector. @@ -15634,9 +15643,9 @@ static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1, if (UndefLower && isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) { SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, - DAG.getIntPtrConstant(0, DL)); + DAG.getVectorIdxConstant(0, DL)); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi, - DAG.getIntPtrConstant(HalfNumElts, DL)); + DAG.getVectorIdxConstant(HalfNumElts, DL)); } int HalfIdx1, HalfIdx2; @@ -16019,7 +16028,7 @@ static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT, // Insert the unpckldq into a zero vector to widen to v32i8. 
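// Inserting at index 0 of an all-zero v32i8 zeroes the upper 128 bits
// explicitly instead of leaving them undef.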
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8, DAG.getConstant(0, DL, MVT::v32i8), Unpack, - DAG.getIntPtrConstant(0, DL)); + DAG.getVectorIdxConstant(0, DL)); } // a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2 @@ -16948,10 +16957,10 @@ static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask, unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4; MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts); SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, - DAG.getIntPtrConstant(0, DL)); + DAG.getVectorIdxConstant(0, DL)); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, getZeroVector(VT, Subtarget, DAG, DL), LoV, - DAG.getIntPtrConstant(0, DL)); + DAG.getVectorIdxConstant(0, DL)); } // Check for patterns which can be matched with a single insert of a 256-bit @@ -16962,9 +16971,9 @@ static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask, MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4); SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2, - DAG.getIntPtrConstant(0, DL)); + DAG.getVectorIdxConstant(0, DL)); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec, - DAG.getIntPtrConstant(4, DL)); + DAG.getVectorIdxConstant(4, DL)); } // See if this is an insertion of the lower 128-bits of V2 into V1. @@ -16993,7 +17002,7 @@ static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask, if (IsInsert && V2Index >= 0) { MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2); SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2, - DAG.getIntPtrConstant(0, DL)); + DAG.getVectorIdxConstant(0, DL)); return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL); } @@ -17600,7 +17609,7 @@ static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask, Res = DAG.getNode(X86ISD::KSHIFTR, DL, Res.getValueType(), Res, DAG.getTargetConstant(ShiftAmt, DL, MVT::i8)); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, - DAG.getIntPtrConstant(0, DL)); + DAG.getVectorIdxConstant(0, DL)); } // Determine if this shuffle can be implemented with a KSHIFT instruction. @@ -17676,22 +17685,22 @@ static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) { assert(Src >= 0 && "Expected a source!"); MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts); - SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, - Src == 0 ? V1 : V2, - DAG.getIntPtrConstant(0, DL)); + SDValue Extract = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, Src == 0 ? V1 : V2, + DAG.getVectorIdxConstant(0, DL)); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, - DAG.getConstant(0, DL, VT), - Extract, DAG.getIntPtrConstant(0, DL)); + DAG.getConstant(0, DL, VT), Extract, + DAG.getVectorIdxConstant(0, DL)); } // Try a simple shift right with undef elements. Later we'll try with zeros. - if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget, - DAG)) + if (SDValue Shift = + lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget, DAG)) return Shift; // Try to match KSHIFTs. 
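// Each source is tried in turn; when the mask type is narrower than a legal
// kshift width, the vector is widened first and the shift amount is rebased
// to match (the WideElts adjustment below).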
unsigned Offset = 0; - for (SDValue V : { V1, V2 }) { + for (SDValue V : {V1, V2}) { unsigned Opcode; int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable); if (ShiftAmt >= 0) { @@ -17701,8 +17710,9 @@ static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, if (Opcode == X86ISD::KSHIFTR && WideVT != VT) { int WideElts = WideVT.getVectorNumElements(); // Shift left to put the original vector in the MSBs of the new size. - Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res, - DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8)); + Res = + DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res, + DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8)); // Increase the shift amount to account for the left shift. ShiftAmt += WideElts - NumElts; } @@ -17710,7 +17720,7 @@ static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, Res = DAG.getNode(Opcode, DL, WideVT, Res, DAG.getTargetConstant(ShiftAmt, DL, MVT::i8)); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, - DAG.getIntPtrConstant(0, DL)); + DAG.getVectorIdxConstant(0, DL)); } Offset += NumElts; // Increment for next iteration. } @@ -18336,7 +18346,7 @@ static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG, DAG.getTargetConstant(IdxVal, dl, MVT::i8)); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, - DAG.getIntPtrConstant(0, dl)); + DAG.getVectorIdxConstant(0, dl)); } // Helper to find all the extracted elements from a vector. @@ -18435,7 +18445,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, // this can be done with a mask. IdxVal &= ElemsPerChunk - 1; return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, - DAG.getIntPtrConstant(IdxVal, dl)); + DAG.getVectorIdxConstant(IdxVal, dl)); } assert(VecVT.is128BitVector() && "Unexpected vector length"); @@ -18476,7 +18486,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, if (DWordIdx == 0 && DemandedElts == (DemandedElts & 15)) { SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, DAG.getBitcast(MVT::v4i32, Vec), - DAG.getIntPtrConstant(DWordIdx, dl)); + DAG.getVectorIdxConstant(DWordIdx, dl)); int ShiftVal = (IdxVal % 4) * 8; if (ShiftVal != 0) Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res, @@ -18488,7 +18498,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, if (DemandedElts == (DemandedElts & (3 << (WordIdx * 2)))) { SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, DAG.getBitcast(MVT::v8i16, Vec), - DAG.getIntPtrConstant(WordIdx, dl)); + DAG.getVectorIdxConstant(WordIdx, dl)); int ShiftVal = (IdxVal % 2) * 8; if (ShiftVal != 0) Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res, @@ -18506,7 +18516,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, Mask[0] = static_cast<int>(IdxVal); Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, - DAG.getIntPtrConstant(0, dl)); + DAG.getVectorIdxConstant(0, dl)); } if (VT.getSizeInBits() == 64) { @@ -18522,7 +18532,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, int Mask[2] = { 1, -1 }; Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, - DAG.getIntPtrConstant(0, dl)); + DAG.getVectorIdxConstant(0, dl)); } return SDValue(); @@ -18683,7 +18693,7 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1); V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1, - 
DAG.getIntPtrConstant(IdxIn128, dl)); + DAG.getVectorIdxConstant(IdxIn128, dl)); // Insert the changed part back into the bigger vector return insert128BitVector(N0, V, IdxVal, DAG, dl); @@ -18832,7 +18842,7 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, DAG.getTargetConstant(IdxVal, dl, MVT::i8)); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec, - DAG.getIntPtrConstant(0, dl)); + DAG.getVectorIdxConstant(0, dl)); } // Returns the appropriate wrapper opcode for a global reference. @@ -19414,14 +19424,14 @@ static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl, {Op.getOperand(0), InVec}); SDValue Chain = CvtVec.getValue(1); SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec, - DAG.getIntPtrConstant(0, dl)); + DAG.getVectorIdxConstant(0, dl)); return DAG.getMergeValues({Value, Chain}, dl); } SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec, - DAG.getIntPtrConstant(0, dl)); + DAG.getVectorIdxConstant(0, dl)); } // Try to use a packed vector operation to handle i64 on 32-bit targets. @@ -19450,14 +19460,14 @@ static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, {Op.getOperand(0), InVec}); SDValue Chain = CvtVec.getValue(1); SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec, - DAG.getIntPtrConstant(0, dl)); + DAG.getVectorIdxConstant(0, dl)); return DAG.getMergeValues({Value, Chain}, dl); } SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec, - DAG.getIntPtrConstant(0, dl)); + DAG.getVectorIdxConstant(0, dl)); } static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT, @@ -19521,7 +19531,7 @@ static SDValue vectorizeExtractedCast(SDValue Cast, const SDLoc &DL, // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast, - DAG.getIntPtrConstant(0, DL)); + DAG.getVectorIdxConstant(0, DL)); } /// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc), @@ -19567,7 +19577,7 @@ static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL, // that could nullify any performance advantage that we hoped to gain from // this vector op hack. We do not expect any adverse effects (like denorm // penalties) with cast ops. - SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL); + SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL); SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X); SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX); SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt); @@ -19598,7 +19608,7 @@ static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL, SDValue Tmp = IsStrict ? 
DAG.getConstant(0, DL, MVT::v8i64) : DAG.getUNDEF(MVT::v8i64); Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src, - DAG.getIntPtrConstant(0, DL)); + DAG.getVectorIdxConstant(0, DL)); SDValue Res, Chain; if (IsStrict) { Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other}, @@ -19609,7 +19619,7 @@ static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL, } Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, - DAG.getIntPtrConstant(0, DL)); + DAG.getVectorIdxConstant(0, DL)); if (IsStrict) return DAG.getMergeValues({Res, Chain}, DL); @@ -19632,7 +19642,7 @@ static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL, SmallVector<SDValue, 4> Chains(4); for (int i = 0; i != 4; ++i) { SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc, - DAG.getIntPtrConstant(i, DL)); + DAG.getVectorIdxConstant(i, DL)); if (IsStrict) { SignCvts[i] = DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other}, @@ -19915,7 +19925,7 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl, Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub); } Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result, - DAG.getIntPtrConstant(0, dl)); + DAG.getVectorIdxConstant(0, dl)); return Result; } @@ -19941,9 +19951,9 @@ static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl, DAG.getBitcast(MVT::v2i64, Load), DAG.getBitcast(MVT::v2i64, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias))); - Or = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, - DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl)); + Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, + DAG.getBitcast(MVT::v2f64, Or), + DAG.getVectorIdxConstant(0, dl)); if (Op.getNode()->isStrictFPOpcode()) { // Subtract the bias. @@ -19993,7 +20003,7 @@ static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL, {Op.getOperand(0), N0}); SDValue Chain = Res.getValue(1); Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res, - DAG.getIntPtrConstant(0, DL)); + DAG.getVectorIdxConstant(0, DL)); return DAG.getMergeValues({Res, Chain}, DL); } @@ -20050,7 +20060,7 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL, SDValue Tmp = IsStrict ? 
DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT); V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V, - DAG.getIntPtrConstant(0, DL)); + DAG.getVectorIdxConstant(0, DL)); SDValue Res, Chain; if (IsStrict) { Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other}, @@ -20061,7 +20071,7 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL, } Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, - DAG.getIntPtrConstant(0, DL)); + DAG.getVectorIdxConstant(0, DL)); if (IsStrict) return DAG.getMergeValues({Res, Chain}, DL); @@ -20581,9 +20591,9 @@ static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In, const SDLoc &dl, SelectionDAG &DAG) { assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT."); SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In, - DAG.getIntPtrConstant(0, dl)); + DAG.getVectorIdxConstant(0, dl)); SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In, - DAG.getIntPtrConstant(8, dl)); + DAG.getVectorIdxConstant(8, dl)); Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo); Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi); SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi); @@ -20622,10 +20632,9 @@ static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const SDLoc &DL, if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) { NumElts *= 512 / ExtVT.getSizeInBits(); InVT = MVT::getVectorVT(MVT::i1, NumElts); - In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT), - In, DAG.getIntPtrConstant(0, DL)); - WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), - NumElts); + In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT), In, + DAG.getVectorIdxConstant(0, DL)); + WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts); } SDValue One = DAG.getConstant(1, DL, WideVT); @@ -20642,7 +20651,7 @@ static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const SDLoc &DL, // Extract back to 128/256-bit if we widened. if (WideVT != VT) SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal, - DAG.getIntPtrConstant(0, DL)); + DAG.getVectorIdxConstant(0, DL)); return SelectedVal; } @@ -21141,13 +21150,13 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { In = DAG.getBitcast(MVT::v8i32, In); In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In, - DAG.getIntPtrConstant(0, DL)); + DAG.getVectorIdxConstant(0, DL)); } SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, - DAG.getIntPtrConstant(0, DL)); + DAG.getVectorIdxConstant(0, DL)); SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, - DAG.getIntPtrConstant(2, DL)); + DAG.getVectorIdxConstant(2, DL)); static const int ShufMask[] = {0, 2, 4, 6}; return DAG.getVectorShuffle(VT, DL, DAG.getBitcast(MVT::v4i32, OpLo), DAG.getBitcast(MVT::v4i32, OpHi), ShufMask); @@ -21168,7 +21177,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { static const int ShufMask2[] = {0, 2, -1, -1}; In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2); In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, - DAG.getIntPtrConstant(0, DL)); + DAG.getVectorIdxConstant(0, DL)); return DAG.getBitcast(MVT::v8i16, In); } @@ -21266,7 +21275,7 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { SDValue Tmp = IsStrict ? 
DAG.getConstantFP(0.0, dl, MVT::v8f64) : DAG.getUNDEF(MVT::v8f64); Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src, - DAG.getIntPtrConstant(0, dl)); + DAG.getVectorIdxConstant(0, dl)); } if (IsStrict) { Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src}); @@ -21277,7 +21286,7 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res); Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res, - DAG.getIntPtrConstant(0, dl)); + DAG.getVectorIdxConstant(0, dl)); if (IsStrict) return DAG.getMergeValues({Res, Chain}, dl); return Res; @@ -21318,7 +21327,7 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { if (ResVT != VT) Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res, - DAG.getIntPtrConstant(0, dl)); + DAG.getVectorIdxConstant(0, dl)); if (IsStrict) return DAG.getMergeValues({Res, Chain}, dl); @@ -21370,7 +21379,7 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT); Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src, - DAG.getIntPtrConstant(0, dl)); + DAG.getVectorIdxConstant(0, dl)); if (IsStrict) { Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other}, @@ -21381,7 +21390,7 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { } Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res, - DAG.getIntPtrConstant(0, dl)); + DAG.getVectorIdxConstant(0, dl)); if (IsStrict) return DAG.getMergeValues({Res, Chain}, dl); @@ -21400,7 +21409,7 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT); Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src, - DAG.getIntPtrConstant(0, dl)); + DAG.getVectorIdxConstant(0, dl)); if (IsStrict) { Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other}, @@ -21411,7 +21420,7 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { } Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res, - DAG.getIntPtrConstant(0, dl)); + DAG.getVectorIdxConstant(0, dl)); if (IsStrict) return DAG.getMergeValues({Res, Chain}, dl); @@ -21432,7 +21441,7 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { {Chain, Tmp}); SDValue Chain = Tmp.getValue(1); Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp, - DAG.getIntPtrConstant(0, dl)); + DAG.getVectorIdxConstant(0, dl)); return DAG.getMergeValues({Tmp, Chain}, dl); } @@ -21867,7 +21876,7 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { In = DAG.getBitcast(MVT::i16, In); In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16, getZeroVector(MVT::v8i16, Subtarget, DAG, DL), In, - DAG.getIntPtrConstant(0, DL)); + DAG.getVectorIdxConstant(0, DL)); SDValue Res; if (IsStrict) { Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, DL, {MVT::v4f32, MVT::Other}, @@ -21878,7 +21887,7 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { DAG.getTargetConstant(4, DL, MVT::i32)); } Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Res, - DAG.getIntPtrConstant(0, DL)); + DAG.getVectorIdxConstant(0, DL)); if (IsStrict) return DAG.getMergeValues({Res, Chain}, DL); return Res; @@ -21983,7 +21992,7 @@ SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { if (IsStrict) { 
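// Strict-FP path: keep the conversion on the chain by emitting
// STRICT_CVTPS2PH and threading Chain through to the merged result.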
Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32, DAG.getConstantFP(0, DL, MVT::v4f32), In, - DAG.getIntPtrConstant(0, DL)); + DAG.getVectorIdxConstant(0, DL)); Res = DAG.getNode(X86ISD::STRICT_CVTPS2PH, DL, {MVT::v8i16, MVT::Other}, {Chain, Res, Rnd}); Chain = Res.getValue(1); @@ -21994,7 +22003,7 @@ SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { } Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res, - DAG.getIntPtrConstant(0, DL)); + DAG.getVectorIdxConstant(0, DL)); Res = DAG.getBitcast(MVT::f16, Res); if (IsStrict) @@ -22015,7 +22024,7 @@ static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) { SDLoc dl(Op); SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, DAG.getConstant(0, dl, MVT::v8i16), Src, - DAG.getIntPtrConstant(0, dl)); + DAG.getVectorIdxConstant(0, dl)); SDValue Chain; if (IsStrict) { @@ -22027,7 +22036,7 @@ static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) { } Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res, - DAG.getIntPtrConstant(0, dl)); + DAG.getVectorIdxConstant(0, dl)); if (IsStrict) return DAG.getMergeValues({Res, Chain}, dl); @@ -22046,7 +22055,7 @@ static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) { if (IsStrict) { Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32, DAG.getConstantFP(0, dl, MVT::v4f32), Src, - DAG.getIntPtrConstant(0, dl)); + DAG.getVectorIdxConstant(0, dl)); Res = DAG.getNode( X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other}, {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)}); @@ -22059,7 +22068,7 @@ static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) { } Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res, - DAG.getIntPtrConstant(0, dl)); + DAG.getVectorIdxConstant(0, dl)); if (IsStrict) return DAG.getMergeValues({Res, Chain}, dl); @@ -22079,7 +22088,7 @@ SDValue X86TargetLowering::LowerFP_TO_BF16(SDValue Op, Res = DAG.getNode(X86ISD::CVTNEPS2BF16, DL, MVT::v8bf16, Res); Res = DAG.getBitcast(MVT::v8i16, Res); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res, - DAG.getIntPtrConstant(0, DL)); + DAG.getVectorIdxConstant(0, DL)); } MakeLibCallOptions CallOptions; @@ -22160,7 +22169,7 @@ static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL, // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp, - DAG.getIntPtrConstant(LExtIndex / 2, DL)); + DAG.getVectorIdxConstant(LExtIndex / 2, DL)); } /// Depending on uarch and/or optimizing for size, we might prefer to use a @@ -22255,7 +22264,7 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand); SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode, - DAG.getIntPtrConstant(0, dl)); + DAG.getVectorIdxConstant(0, dl)); } static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { @@ -22323,8 +22332,9 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { // OR the magnitude value with the sign bit. SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit); - return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or, - DAG.getIntPtrConstant(0, dl)); + return !IsFakeVector ? 
Or + : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or, + DAG.getVectorIdxConstant(0, dl)); } static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { @@ -23231,7 +23241,7 @@ SDValue X86TargetLowering::getSqrtEstimate(SDValue Op, RefinementSteps = 0; if (VT == MVT::f16) { - SDValue Zero = DAG.getIntPtrConstant(0, DL); + SDValue Zero = DAG.getVectorIdxConstant(0, DL); SDValue Undef = DAG.getUNDEF(MVT::v8f16); Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op); Op = DAG.getNode(X86ISD::RSQRT14S, DL, MVT::v8f16, Undef, Op); @@ -23282,7 +23292,7 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG, RefinementSteps = 0; if (VT == MVT::f16) { - SDValue Zero = DAG.getIntPtrConstant(0, DL); + SDValue Zero = DAG.getVectorIdxConstant(0, DL); SDValue Undef = DAG.getUNDEF(MVT::v8f16); Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op); Op = DAG.getNode(X86ISD::RCP14S, DL, MVT::v8f16, Undef, Op); @@ -24635,8 +24645,8 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, - VSel, DAG.getIntPtrConstant(0, DL)); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VSel, + DAG.getVectorIdxConstant(0, DL)); } SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2); SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1); @@ -24857,8 +24867,8 @@ static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl, if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) { NumElts *= 512 / ExtVT.getSizeInBits(); InVT = MVT::getVectorVT(MVT::i1, NumElts); - In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT), - In, DAG.getIntPtrConstant(0, dl)); + In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT), In, + DAG.getVectorIdxConstant(0, dl)); WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts); } @@ -24882,7 +24892,7 @@ static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl, // Extract back to 128/256-bit if we widened. if (WideVT != VT) V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V, - DAG.getIntPtrConstant(0, dl)); + DAG.getVectorIdxConstant(0, dl)); return V; } @@ -25138,7 +25148,7 @@ static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT, SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(), TypeSize::getFixed(Offset), DL); SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal, - DAG.getIntPtrConstant(i, DL)); + DAG.getVectorIdxConstant(i, DL)); SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr, Store->getPointerInfo().getWithOffset(Offset), Store->getOriginalAlign(), @@ -25166,7 +25176,7 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, // We must pad with zeros to ensure we store zeroes to any unused bits. StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1, DAG.getUNDEF(MVT::v16i1), StoredVal, - DAG.getIntPtrConstant(0, dl)); + DAG.getVectorIdxConstant(0, dl)); StoredVal = DAG.getBitcast(MVT::i16, StoredVal); StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal); // Make sure we store zeros in the extra bits. 
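Note: the bulk of the churn in this file is the mechanical switch from DAG.getIntPtrConstant to DAG.getVectorIdxConstant for INSERT/EXTRACT_SUBVECTOR and INSERT/EXTRACT_VECTOR_ELT index operands. Both helpers exist on SelectionDAG; the latter types the constant with the target's vector-index type (getVectorIdxTy), which on x86 is pointer-sized, so the generated code should be unchanged. A minimal sketch of the before/after pattern, where Vec, HalfVT, DAG and dl are stand-ins for the usual locals:

  // Before: index operand built as a pointer-typed constant.
  SDValue OldIdx = DAG.getIntPtrConstant(0, dl);
  // After: index operand built with the canonical vector-index type.
  SDValue NewIdx = DAG.getVectorIdxConstant(0, dl);
  SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Vec, NewIdx);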
@@ -25215,7 +25225,7 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, MVT CastVT = MVT::getVectorVT(StVT, 2); StoredVal = DAG.getBitcast(CastVT, StoredVal); StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal, - DAG.getIntPtrConstant(0, dl)); + DAG.getVectorIdxConstant(0, dl)); return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), St->getPointerInfo(), St->getOriginalAlign(), @@ -25262,7 +25272,7 @@ static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd); Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT, DAG.getBitcast(MVT::v16i1, Val), - DAG.getIntPtrConstant(0, dl)); + DAG.getVectorIdxConstant(0, dl)); return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl); } @@ -25830,7 +25840,7 @@ static SDValue getMaskNode(SDValue Mask, MVT MaskVT, // are extracted by EXTRACT_SUBVECTOR. return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, DAG.getBitcast(BitcastVT, Mask), - DAG.getIntPtrConstant(0, dl)); + DAG.getVectorIdxConstant(0, dl)); } } @@ -25878,7 +25888,7 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, assert(Mask.getValueType() == MVT::i8 && "Unexpect type"); SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1, DAG.getBitcast(MVT::v8i1, Mask), - DAG.getIntPtrConstant(0, dl)); + DAG.getVectorIdxConstant(0, dl)); if (Op.getOpcode() == X86ISD::FSETCCM || Op.getOpcode() == X86ISD::FSETCCM_SAE || Op.getOpcode() == X86ISD::VFPCLASSS) @@ -26359,8 +26369,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, // Need to fill with zeros to ensure the bitcast will produce zeroes // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that. SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1, - DAG.getConstant(0, dl, MVT::v8i1), - FPclassMask, DAG.getIntPtrConstant(0, dl)); + DAG.getConstant(0, dl, MVT::v8i1), FPclassMask, + DAG.getVectorIdxConstant(0, dl)); return DAG.getBitcast(MVT::i8, Ins); } @@ -26406,8 +26416,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, // Need to fill with zeros to ensure the bitcast will produce zeroes // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that. SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1, - DAG.getConstant(0, dl, MVT::v8i1), - CmpMask, DAG.getIntPtrConstant(0, dl)); + DAG.getConstant(0, dl, MVT::v8i1), CmpMask, + DAG.getVectorIdxConstant(0, dl)); return DAG.getBitcast(MVT::i8, Ins); } case COMI: { // Comparison intrinsics @@ -26486,8 +26496,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, // Need to fill with zeros to ensure the bitcast will produce zeroes // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that. 
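// Inserting FCmp into an all-zero v16i1 makes bits 15:1 of the i16 bitcast
// known zero, so the zero_extend below is safe.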
SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1, - DAG.getConstant(0, dl, MVT::v16i1), - FCmp, DAG.getIntPtrConstant(0, dl)); + DAG.getConstant(0, dl, MVT::v16i1), FCmp, + DAG.getVectorIdxConstant(0, dl)); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, DAG.getBitcast(MVT::i16, Ins)); } @@ -28997,7 +29007,7 @@ static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget, SDValue IsNanZero = DAG.getNode(X86ISD::VFPCLASSS, DL, MVT::v1i1, VX, Imm); SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1, DAG.getConstant(0, DL, MVT::v8i1), IsNanZero, - DAG.getIntPtrConstant(0, DL)); + DAG.getVectorIdxConstant(0, DL)); SDValue NeedSwap = DAG.getBitcast(MVT::i8, Ins); NewX = DAG.getSelect(DL, VT, NeedSwap, Y, X); NewY = DAG.getSelect(DL, VT, NeedSwap, X, Y); @@ -29012,10 +29022,10 @@ static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget, assert(VT == MVT::f64); SDValue Ins = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2f64, DAG.getConstantFP(0, DL, MVT::v2f64), X, - DAG.getIntPtrConstant(0, DL)); + DAG.getVectorIdxConstant(0, DL)); SDValue VX = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, Ins); SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VX, - DAG.getIntPtrConstant(1, DL)); + DAG.getVectorIdxConstant(1, DL)); Hi = DAG.getBitcast(MVT::i32, Hi); SDValue ZeroCst = DAG.getConstant(0, DL, MVT::i32); EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), @@ -29205,7 +29215,7 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget, "Should not custom lower when pmulld is available!"); // Extract the odd parts. - static const int UnpackMask[] = { 1, -1, 3, -1 }; + static const int UnpackMask[] = {1, 1, 3, 3}; SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask); SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask); @@ -29633,8 +29643,7 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons Entry.Node = StackPtr; InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16)); - Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - Entry.Ty = PointerType::get(ArgTy,0); + Entry.Ty = PointerType::get(*DAG.getContext(), 0); Entry.IsSExt = false; Entry.IsZExt = false; Args.push_back(Entry); @@ -31244,7 +31253,7 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, // to v2i64 results at a time. The upper 32-bits contain the wrapped bits // that can then be OR'd with the lower 32-bits. assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected"); - static const int OddMask[] = {1, -1, 3, -1}; + static const int OddMask[] = {1, 1, 3, 3}; SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask); SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask); @@ -32001,7 +32010,7 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget, return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src, - DAG.getIntPtrConstant(0, dl)); + DAG.getVectorIdxConstant(0, dl)); } /// Compute the horizontal sum of bytes in V for the elements of VT. 
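/// This is typically lowered with PSADBW against an all-zero vector, which
/// sums each group of eight bytes into the low bits of the matching i64 lane.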
@@ -32266,7 +32275,7 @@ static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) { SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In); Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res, - DAG.getIntPtrConstant(0, DL)); + DAG.getVectorIdxConstant(0, DL)); } int NumElts = VT.getVectorNumElements(); @@ -32328,8 +32337,9 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In); Res = DAG.getNode(ISD::BITREVERSE, DL, MVT::v16i8, DAG.getBitcast(MVT::v16i8, Res)); - Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, - DAG.getBitcast(VecVT, Res), DAG.getIntPtrConstant(0, DL)); + Res = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, DAG.getBitcast(VecVT, Res), + DAG.getVectorIdxConstant(0, DL)); return (VT == MVT::i8) ? Res : DAG.getNode(ISD::BSWAP, DL, VT, Res); } @@ -32702,10 +32712,12 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, return CallResult.first; // Returned in bits 0:31 and 32:64 xmm0. - SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, - CallResult.first, DAG.getIntPtrConstant(0, dl)); - SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, - CallResult.first, DAG.getIntPtrConstant(1, dl)); + SDValue SinVal = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, + DAG.getVectorIdxConstant(0, dl)); + SDValue CosVal = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, + DAG.getVectorIdxConstant(1, dl)); SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal); } @@ -32752,7 +32764,7 @@ static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) : DAG.getUNDEF(NVT); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal, InOp, - DAG.getIntPtrConstant(0, dl)); + DAG.getVectorIdxConstant(0, dl)); } static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, @@ -32879,7 +32891,7 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget, SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0), - DAG.getIntPtrConstant(0, dl)); + DAG.getVectorIdxConstant(0, dl)); SDValue RetOps[] = {Extract, NewLoad.getValue(1)}; return DAG.getMergeValues(RetOps, dl); } @@ -32974,8 +32986,8 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, SDValue NewGather = DAG.getMemIntrinsicNode( X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(), N->getMemOperand()); - SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT, - NewGather, DAG.getIntPtrConstant(0, dl)); + SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT, NewGather, + DAG.getVectorIdxConstant(0, dl)); return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl); } @@ -33402,7 +33414,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, // extend to i64. Otherwise we end up extracting bits 63:32 separately. 
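// i.e. bitcast to v4i32, extract only lane 0 as i32, then zero-extend to
// i64, so a single 32-bit extract suffices.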
Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide); Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide, - DAG.getIntPtrConstant(0, dl)); + DAG.getVectorIdxConstant(0, dl)); Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide); Results.push_back(Wide); } @@ -33441,7 +33453,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, SDValue Hi = DAG.getBitcast(MVT::v4i32, Res); Hi = DAG.getVectorShuffle(MVT::v4i32, dl, Hi, Hi, {1, 3, -1, -1}); Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Hi, - DAG.getIntPtrConstant(0, dl)); + DAG.getVectorIdxConstant(0, dl)); // Truncate the low bits of the result. This will become PSHUFD. Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res); @@ -33844,7 +33856,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, if (PromoteVT == MVT::v2i32) Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, - DAG.getIntPtrConstant(0, dl)); + DAG.getVectorIdxConstant(0, dl)); // Truncate back to the original width. Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res); @@ -33946,7 +33958,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; } - SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl); + SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl); SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT, DAG.getConstantFP(0.0, dl, VecInVT), Src, ZeroIdx); @@ -34046,7 +34058,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32)); for (int i = 0; i != 2; ++i) { SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, - SignSrc, DAG.getIntPtrConstant(i, dl)); + SignSrc, DAG.getVectorIdxConstant(i, dl)); if (IsStrict) SignCvts[i] = DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other}, @@ -34289,9 +34301,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, SDValue Ld = DAG.getLoad(MVT::v2i64, dl, Node->getChain(), Node->getBasePtr(), Node->getMemOperand()); SDValue ResL = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld, - DAG.getIntPtrConstant(0, dl)); + DAG.getVectorIdxConstant(0, dl)); SDValue ResH = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld, - DAG.getIntPtrConstant(1, dl)); + DAG.getVectorIdxConstant(1, dl)); Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, N->getValueType(0), {ResL, ResH})); Results.push_back(Ld.getValue(1)); @@ -34309,7 +34321,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, MVT::i64, Node->getMemOperand()); if (Subtarget.hasSSE2()) { SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld, - DAG.getIntPtrConstant(0, dl)); + DAG.getVectorIdxConstant(0, dl)); Results.push_back(Res); Results.push_back(Ld.getValue(1)); return; @@ -34318,7 +34330,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, // then casts to i64. This avoids a 128-bit stack temporary being // created by type legalization if we were to cast v4f32->v2i64. SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld, - DAG.getIntPtrConstant(0, dl)); + DAG.getVectorIdxConstant(0, dl)); Res = DAG.getBitcast(MVT::i64, Res); Results.push_back(Res); Results.push_back(Ld.getValue(1)); @@ -41845,7 +41857,7 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, DCI.recursivelyDeleteUnusedNodes(LN); } else { SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd, - DAG.getIntPtrConstant(0, DL)); + DAG.getVectorIdxConstant(0, DL)); DCI.CombineTo(LN, Scl, BcastLd.getValue(1)); } return N; // Return N so it doesn't get rechecked! 
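Note: in the @@ -41845 hunk above, the combine keeps a single memory access either way: if the original load LN has no remaining users it is deleted outright via recursivelyDeleteUnusedNodes, otherwise its scalar users are rewired to lane 0 of the broadcasted load and its chain is preserved via DCI.CombineTo(LN, Scl, BcastLd.getValue(1)).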
@@ -42294,7 +42306,7 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, if (N10 != N0) std::swap(N10, N11); MVT SVT = VT.getVectorElementType(); - SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL); + SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL); N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx); N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx); SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11); @@ -42425,7 +42437,7 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, DL, WideVT.getSizeInBits()); SDValue Perm = DAG.getNode(X86ISD::VPERMV, DL, WideVT, Mask, ConcatSrc); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm, - DAG.getIntPtrConstant(0, DL)); + DAG.getVectorIdxConstant(0, DL)); } } SmallVector<SDValue, 2> Ops; @@ -44939,7 +44951,7 @@ static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL, if (SDValue N0 = combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget, Depth + 1)) return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0, - DAG.getIntPtrConstant(0, DL)); + DAG.getVectorIdxConstant(0, DL)); break; } case ISD::ANY_EXTEND: @@ -44954,7 +44966,7 @@ static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL, return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT) : DAG.getConstant(0, DL, VT), - N0, DAG.getIntPtrConstant(0, DL)); + N0, DAG.getVectorIdxConstant(0, DL)); break; } case ISD::OR: @@ -45019,7 +45031,7 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0); N0 = DAG.getBitcast(MVT::v8i1, N0); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0, - DAG.getIntPtrConstant(0, dl)); + DAG.getVectorIdxConstant(0, dl)); } // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer @@ -45053,7 +45065,7 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, N0 = DAG.getBitcast(MVT::i8, N0); return DAG.getNode(ISD::TRUNCATE, dl, VT, N0); } - } else { + } else if (DCI.isAfterLegalizeDAG()) { // If we're bitcasting from iX to vXi1, see if the integer originally // began as a vXi1 and whether we can remove the bitcast entirely. if (VT.isVector() && VT.getScalarType() == MVT::i1 && @@ -45481,7 +45493,7 @@ static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG, MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos, - DAG.getIntPtrConstant(0, DL)); + DAG.getVectorIdxConstant(0, DL)); } // Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK. 
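// Typical shapes: all_of -> (movmsk(M) == (1u << NumElts) - 1),
// any_of -> (movmsk(M) != 0), parity -> (ctpop(movmsk(M)) & 1).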
@@ -45929,7 +45941,7 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) { return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(), DAG.getBitcast(VecVT, Vec), - DAG.getIntPtrConstant(Idx, dl)); + DAG.getVectorIdxConstant(Idx, dl)); } if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) || (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) { @@ -46157,7 +46169,7 @@ static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG, V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32, DAG.getConstant(0, DL, MVT::v4i32), DAG.getBitcast(MVT::i32, V), - DAG.getIntPtrConstant(0, DL)); + DAG.getVectorIdxConstant(0, DL)); return DAG.getBitcast(MVT::v16i8, V); } V = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, V, @@ -46358,8 +46370,9 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, if (Src.getValueType().getScalarType() == MVT::i1 && TLI.isTypeLegal(Src.getValueType())) { MVT SubVT = MVT::getVectorVT(MVT::i1, NumEltBits); - SDValue Sub = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Src, - DAG.getIntPtrConstant(CIdx->getZExtValue() * NumEltBits, dl)); + SDValue Sub = DAG.getNode( + ISD::EXTRACT_SUBVECTOR, dl, SubVT, Src, + DAG.getVectorIdxConstant(CIdx->getZExtValue() * NumEltBits, dl)); return DAG.getBitcast(VT, Sub); } } @@ -47290,7 +47303,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, VT.getSizeInBits()); Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT, DAG.getUNDEF(SrcCondVT), Cond, - DAG.getIntPtrConstant(0, DL)); + DAG.getVectorIdxConstant(0, DL)); SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS); return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits()); } @@ -48040,6 +48053,18 @@ static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC, DAG.getAllOnesConstant(DL, NotOp1.getValueType()))); } } + // PTESTC(PCMPEQ(X,0),-1) == PTESTZ(X,X) + if (EFLAGS.getOpcode() == X86ISD::PTEST && + ISD::isBuildVectorAllOnes(Op1.getNode())) { + SDValue BC0 = peekThroughBitcasts(Op0); + if (BC0.getOpcode() == X86ISD::PCMPEQ && + ISD::isBuildVectorAllZeros(BC0.getOperand(1).getNode())) { + SDLoc DL(EFLAGS); + CC = (CC == X86::COND_B ? X86::COND_E : X86::COND_NE); + SDValue X = DAG.getBitcast(OpVT, BC0.getOperand(0)); + return DAG.getNode(EFLAGS.getOpcode(), DL, VT, X, X); + } + } } if (CC == X86::COND_E || CC == X86::COND_NE) { @@ -50097,7 +50122,7 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that. SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1, DAG.getConstant(0, DL, MVT::v16i1), - FSetCC, DAG.getIntPtrConstant(0, DL)); + FSetCC, DAG.getVectorIdxConstant(0, DL)); return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL, N->getSimpleValueType(0)); } @@ -50114,11 +50139,12 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, // OnesOrZeroesF is all ones or all zeroes, we don't need all the // bits, but can do this little dance to extract the lowest 32 bits // and work with those going forward. 
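// (scalar_to_vector + v4f32 bitcast + lane-0 extract yields those low bits
// without materializing the full 64-bit integer.)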
- SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, - OnesOrZeroesF); + SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, + MVT::v2f64, OnesOrZeroesF); SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64); - OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, - Vector32, DAG.getIntPtrConstant(0, DL)); + OnesOrZeroesF = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Vector32, + DAG.getVectorIdxConstant(0, DL)); IntVT = MVT::i32; } @@ -52257,7 +52283,7 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts); SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, - DAG.getIntPtrConstant(0, DL)); + DAG.getVectorIdxConstant(0, DL)); } } @@ -53785,7 +53811,7 @@ static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG, if (ExtVT.getVectorNumElements() != NumElems * 2) { MVT NVT = MVT::getVectorVT(MVT::i8, NumElems * 2); Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT, Ext, - DAG.getIntPtrConstant(0, DL)); + DAG.getVectorIdxConstant(0, DL)); } }; ExtractVec(ZExtIn); @@ -56878,11 +56904,11 @@ static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDNode *N, VT.getVectorNumElements() * 2); if (OutVT16.bitsLT(In0.getValueType())) { In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0, - DAG.getIntPtrConstant(0, DL)); + DAG.getVectorIdxConstant(0, DL)); } if (OutVT16.bitsLT(In1.getValueType())) { In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1, - DAG.getIntPtrConstant(0, DL)); + DAG.getVectorIdxConstant(0, DL)); } return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 }, PMADDBuilder); @@ -57360,7 +57386,7 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, return DAG.getNode(X86ISD::VBROADCAST, DL, VT, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64, Op0.getOperand(0), - DAG.getIntPtrConstant(0, DL))); + DAG.getVectorIdxConstant(0, DL))); // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x) if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR && @@ -58115,7 +58141,7 @@ static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, getZeroVector(OpVT, Subtarget, DAG, dl), SubVec.getOperand(1), - DAG.getIntPtrConstant(IdxVal + Idx2Val, dl)); + DAG.getVectorIdxConstant(IdxVal + Idx2Val, dl)); } // If we're inserting into a zero vector and our input was extracted from an @@ -58188,7 +58214,7 @@ static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode())) return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, getZeroVector(OpVT, Subtarget, DAG, dl), - SubVectorOps[0], DAG.getIntPtrConstant(0, dl)); + SubVectorOps[0], DAG.getVectorIdxConstant(0, dl)); // Attempt to recursively combine to a shuffle. 
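// (The chain of insert_subvector ops is treated as a concatenation of its
// sources and handed to the recursive x86 shuffle combiner.)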
if (all_of(SubVectorOps, [](SDValue SubOp) { @@ -58898,7 +58924,7 @@ static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG, SDValue Shift = DAG.getNode(X86ISD::KSHIFTR, DL, SrcVT, Src, DAG.getTargetConstant(Amt, DL, MVT::i8)); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shift, - DAG.getIntPtrConstant(0, DL)); + DAG.getVectorIdxConstant(0, DL)); } } } @@ -58932,7 +58958,7 @@ static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG, DAG.getTargetConstant(4, dl, MVT::i32)); Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res, - DAG.getIntPtrConstant(0, dl)); + DAG.getVectorIdxConstant(0, dl)); } static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG, @@ -59012,7 +59038,7 @@ static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG, if (NumElts < 4) { assert(NumElts == 2 && "Unexpected size"); Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt, - DAG.getIntPtrConstant(0, dl)); + DAG.getVectorIdxConstant(0, dl)); } if (IsStrict) { @@ -59148,7 +59174,7 @@ static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG, if (NumElts < 8) { EVT IntVT = VT.changeVectorElementTypeToInteger(); Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt, - DAG.getIntPtrConstant(0, dl)); + DAG.getVectorIdxConstant(0, dl)); } Cvt = DAG.getBitcast(VT, Cvt); diff --git llvm/lib/Target/X86/X86ISelLowering.h llvm/lib/Target/X86/X86ISelLowering.h index eaedaa0b88d2..03f10a3c83e3 100644 --- llvm/lib/Target/X86/X86ISelLowering.h +++ llvm/lib/Target/X86/X86ISelLowering.h @@ -1803,7 +1803,8 @@ namespace llvm { bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, - LLVMContext &Context) const override; + LLVMContext &Context, + const Type *RetTy) const override; const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override; ArrayRef<MCPhysReg> getRoundingControlRegisters() const override; diff --git llvm/lib/Target/X86/X86ISelLoweringCall.cpp llvm/lib/Target/X86/X86ISelLoweringCall.cpp index b1c1ab4aa855..10aa2a5e5dac 100644 --- llvm/lib/Target/X86/X86ISelLoweringCall.cpp +++ llvm/lib/Target/X86/X86ISelLoweringCall.cpp @@ -659,7 +659,8 @@ X86TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const { bool X86TargetLowering::CanLowerReturn( CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, - const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { + const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context, + const Type *RetTy) const { SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); return CCInfo.CheckReturn(Outs, RetCC_X86); diff --git llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp index 7c9738bf0821..a44c583a1ca5 100644 --- llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp +++ llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp @@ -67,15 +67,9 @@ static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) { // The mask is constant or extended from a bool vector. Convert this x86 // intrinsic to the LLVM intrinsic to allow target-independent optimizations. if (Value *BoolMask = getBoolVecFromMask(Mask, IC.getDataLayout())) { - // First, cast the x86 intrinsic scalar pointer to a vector pointer to match - // the LLVM intrinsic definition for the pointer argument. 
- unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace(); - PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace); - Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec"); - // The pass-through vector for an x86 masked load is a zero vector. CallInst *NewMaskedLoad = IC.Builder.CreateMaskedLoad( - II.getType(), PtrCast, Align(1), BoolMask, ZeroVec); + II.getType(), Ptr, Align(1), BoolMask, ZeroVec); return IC.replaceInstUsesWith(II, NewMaskedLoad); } @@ -105,7 +99,7 @@ static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) { // intrinsic to the LLVM intrinsic to allow target-independent optimizations. if (Value *BoolMask = getBoolVecFromMask(Mask, IC.getDataLayout())) { unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace(); - PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace); + PointerType *VecPtrTy = PointerType::get(Vec->getContext(), AddrSpace); Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec"); IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask); diff --git llvm/lib/Target/X86/X86InstrAVX10.td llvm/lib/Target/X86/X86InstrAVX10.td index edbcb1729760..557169b4aa67 100644 --- llvm/lib/Target/X86/X86InstrAVX10.td +++ llvm/lib/Target/X86/X86InstrAVX10.td @@ -447,8 +447,8 @@ multiclass avx10_minmax_scalar<string OpStr, X86VectorVTInfo _, SDNode OpNode, let mayRaiseFPException = 0 in -defm VMINMAXNEPBF16 : avx10_minmax_packed<"vminmaxnepbf16", avx512vl_bf16_info, X86vminmax>, - AVX512XDIi8Base, EVEX_CD8<16, CD8VF>, TA; +defm VMINMAXBF16 : avx10_minmax_packed<"vminmaxbf16", avx512vl_bf16_info, X86vminmax>, + AVX512XDIi8Base, EVEX_CD8<16, CD8VF>, TA; defm VMINMAXPD : avx10_minmax_packed<"vminmaxpd", avx512vl_f64_info, X86vminmax>, avx10_minmax_packed_sae<"vminmaxpd", avx512vl_f64_info, X86vminmaxSae>, diff --git llvm/lib/Target/X86/X86IntrinsicsInfo.h llvm/lib/Target/X86/X86IntrinsicsInfo.h index 86fd04046d16..863cb668431c 100644 --- llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -848,12 +848,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::FMAX_SAE), X86_INTRINSIC_DATA(avx10_vmaxps256, INTR_TYPE_2OP_SAE, X86ISD::FMAX, X86ISD::FMAX_SAE), - X86_INTRINSIC_DATA(avx10_vminmaxnepbf16128, INTR_TYPE_3OP, X86ISD::VMINMAX, - 0), - X86_INTRINSIC_DATA(avx10_vminmaxnepbf16256, INTR_TYPE_3OP, X86ISD::VMINMAX, - 0), - X86_INTRINSIC_DATA(avx10_vminmaxnepbf16512, INTR_TYPE_3OP, X86ISD::VMINMAX, - 0), + X86_INTRINSIC_DATA(avx10_vminmaxbf16128, INTR_TYPE_3OP, X86ISD::VMINMAX, 0), + X86_INTRINSIC_DATA(avx10_vminmaxbf16256, INTR_TYPE_3OP, X86ISD::VMINMAX, 0), + X86_INTRINSIC_DATA(avx10_vminmaxbf16512, INTR_TYPE_3OP, X86ISD::VMINMAX, 0), X86_INTRINSIC_DATA(avx10_vminpd256, INTR_TYPE_2OP_SAE, X86ISD::FMIN, X86ISD::FMIN_SAE), X86_INTRINSIC_DATA(avx10_vminph256, INTR_TYPE_2OP_SAE, X86ISD::FMIN, diff --git llvm/lib/Target/X86/X86LowerAMXType.cpp llvm/lib/Target/X86/X86LowerAMXType.cpp index cd5813a5338e..41cf0fc2cef4 100644 --- llvm/lib/Target/X86/X86LowerAMXType.cpp +++ llvm/lib/Target/X86/X86LowerAMXType.cpp @@ -248,7 +248,11 @@ std::pair<Value *, Value *> ShapeCalculator::getShape(IntrinsicInst *II, case Intrinsic::x86_tdpbuud_internal: case Intrinsic::x86_tdpbf16ps_internal: case Intrinsic::x86_tdpfp16ps_internal: - case Intrinsic::x86_tmmultf32ps_internal: { + case Intrinsic::x86_tmmultf32ps_internal: + case Intrinsic::x86_tdpbf8ps_internal: + case Intrinsic::x86_tdpbhf8ps_internal: + case Intrinsic::x86_tdphbf8ps_internal: + 
case Intrinsic::x86_tdphf8ps_internal: { switch (OpNo) { case 3: Row = II->getArgOperand(0); diff --git llvm/lib/Target/X86/X86RegisterInfo.cpp llvm/lib/Target/X86/X86RegisterInfo.cpp index 164d42059551..4faf8bca4f9e 100644 --- llvm/lib/Target/X86/X86RegisterInfo.cpp +++ llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -1183,8 +1183,7 @@ bool X86RegisterInfo::getRegAllocationHints(Register VirtReg, auto TryAddNDDHint = [&](const MachineOperand &MO) { Register Reg = MO.getReg(); - Register PhysReg = - Register::isPhysicalRegister(Reg) ? Reg : Register(VRM->getPhys(Reg)); + Register PhysReg = Reg.isPhysical() ? Reg : Register(VRM->getPhys(Reg)); if (PhysReg && !MRI->isReserved(PhysReg) && !is_contained(Hints, PhysReg)) TwoAddrHints.insert(PhysReg); }; diff --git llvm/lib/Target/XCore/XCoreISelLowering.cpp llvm/lib/Target/XCore/XCoreISelLowering.cpp index 9a9acaca3188..ac199230b2c0 100644 --- llvm/lib/Target/XCore/XCoreISelLowering.cpp +++ llvm/lib/Target/XCore/XCoreISelLowering.cpp @@ -1325,7 +1325,7 @@ bool XCoreTargetLowering:: CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, - LLVMContext &Context) const { + LLVMContext &Context, const Type *RetTy) const { SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); if (!CCInfo.CheckReturn(Outs, RetCC_XCore)) diff --git llvm/lib/Target/XCore/XCoreISelLowering.h llvm/lib/Target/XCore/XCoreISelLowering.h index eaa36d40cba9..1e036ea31697 100644 --- llvm/lib/Target/XCore/XCoreISelLowering.h +++ llvm/lib/Target/XCore/XCoreISelLowering.h @@ -217,7 +217,7 @@ namespace llvm { bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &ArgsFlags, - LLVMContext &Context) const override; + LLVMContext &Context, const Type *RetTy) const override; }; } diff --git llvm/lib/Target/Xtensa/XtensaISelLowering.cpp llvm/lib/Target/Xtensa/XtensaISelLowering.cpp index e8ede330bbac..cdf38a066947 100644 --- llvm/lib/Target/Xtensa/XtensaISelLowering.cpp +++ llvm/lib/Target/Xtensa/XtensaISelLowering.cpp @@ -621,7 +621,8 @@ XtensaTargetLowering::LowerCall(CallLoweringInfo &CLI, bool XtensaTargetLowering::CanLowerReturn( CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg, - const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { + const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context, + const Type *RetTy) const { SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context); return CCInfo.CheckReturn(Outs, RetCC_Xtensa); diff --git llvm/lib/Target/Xtensa/XtensaISelLowering.h llvm/lib/Target/Xtensa/XtensaISelLowering.h index cebd7d2016c8..a959299d8ca6 100644 --- llvm/lib/Target/Xtensa/XtensaISelLowering.h +++ llvm/lib/Target/Xtensa/XtensaISelLowering.h @@ -105,7 +105,7 @@ public: bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, - LLVMContext &Context) const override; + LLVMContext &Context, const Type *RetTy) const override; SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, diff --git llvm/lib/TargetParser/Host.cpp llvm/lib/TargetParser/Host.cpp index 979b44b22338..ba7032025150 100644 --- llvm/lib/TargetParser/Host.cpp +++ llvm/lib/TargetParser/Host.cpp @@ -424,8 +424,11 @@ StringRef getCPUNameFromS390Model(unsigned int Id, bool HaveVectorSupport) { return HaveVectorSupport? 
"z15" : "zEC12"; case 3931: case 3932: - default: return HaveVectorSupport? "z16" : "zEC12"; + case 9175: + case 9176: + default: + return HaveVectorSupport? "arch15" : "zEC12"; } } } // end anonymous namespace diff --git llvm/lib/TargetParser/TargetParser.cpp llvm/lib/TargetParser/TargetParser.cpp index 02295fdb0ecd..0a605dfd017c 100644 --- llvm/lib/TargetParser/TargetParser.cpp +++ llvm/lib/TargetParser/TargetParser.cpp @@ -323,43 +323,59 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, StringMap<bool> &Features) { // XXX - What does the member GPU mean if device name string passed here? if (T.isSPIRV() && T.getOS() == Triple::OSType::AMDHSA) { - // AMDGCN SPIRV must support the union of all AMDGCN features. + // AMDGCN SPIRV must support the union of all AMDGCN features. This list + // should be kept in sorted order and updated whenever new features are + // added. + Features["16-bit-insts"] = true; + Features["ashr-pk-insts"] = true; + Features["atomic-buffer-pk-add-bf16-inst"] = true; + Features["atomic-buffer-global-pk-add-f16-insts"] = true; Features["atomic-ds-pk-add-16-insts"] = true; + Features["atomic-fadd-rtn-insts"] = true; Features["atomic-flat-pk-add-16-insts"] = true; - Features["atomic-buffer-global-pk-add-f16-insts"] = true; Features["atomic-global-pk-add-bf16-inst"] = true; - Features["atomic-fadd-rtn-insts"] = true; + Features["bf8-cvt-scale-insts"] = true; + Features["bitop3-insts"] = true; Features["ci-insts"] = true; + Features["dl-insts"] = true; Features["dot1-insts"] = true; Features["dot2-insts"] = true; Features["dot3-insts"] = true; Features["dot4-insts"] = true; Features["dot5-insts"] = true; + Features["dot6-insts"] = true; Features["dot7-insts"] = true; Features["dot8-insts"] = true; Features["dot9-insts"] = true; Features["dot10-insts"] = true; Features["dot11-insts"] = true; - Features["dl-insts"] = true; - Features["16-bit-insts"] = true; + Features["dot12-insts"] = true; + Features["dot13-insts"] = true; Features["dpp"] = true; + Features["f16bf16-to-fp6bf6-cvt-scale-insts"] = true; + Features["f32-to-f16bf16-cvt-sr-insts"] = true; + Features["fp4-cvt-scale-insts"] = true; + Features["fp6bf6-cvt-scale-insts"] = true; + Features["fp8-insts"] = true; + Features["fp8-conversion-insts"] = true; + Features["fp8-cvt-scale-insts"] = true; Features["gfx8-insts"] = true; Features["gfx9-insts"] = true; Features["gfx90a-insts"] = true; Features["gfx940-insts"] = true; + Features["gfx950-insts"] = true; Features["gfx10-insts"] = true; Features["gfx10-3-insts"] = true; Features["gfx11-insts"] = true; Features["gfx12-insts"] = true; + Features["gws"] = true; Features["image-insts"] = true; - Features["fp8-conversion-insts"] = true; Features["s-memrealtime"] = true; Features["s-memtime-inst"] = true; - Features["gws"] = true; - Features["fp8-insts"] = true; - Features["fp8-conversion-insts"] = true; - Features["atomic-ds-pk-add-16-insts"] = true; Features["mai-insts"] = true; + Features["permlane16-swap"] = true; + Features["permlane32-swap"] = true; + Features["prng-inst"] = true; Features["wavefrontsize32"] = true; Features["wavefrontsize64"] = true; } else if (T.isAMDGCN()) { diff --git llvm/lib/Transforms/CFGuard/CFGuard.cpp llvm/lib/Transforms/CFGuard/CFGuard.cpp index 0e1a0a6ed947..41d68b62eb8d 100644 --- llvm/lib/Transforms/CFGuard/CFGuard.cpp +++ llvm/lib/Transforms/CFGuard/CFGuard.cpp @@ -247,7 +247,7 @@ bool CFGuardImpl::doInitialization(Module &M) { GuardFnType = FunctionType::get(Type::getVoidTy(M.getContext()), 
{PointerType::getUnqual(M.getContext())}, false); - GuardFnPtrType = PointerType::get(GuardFnType, 0); + GuardFnPtrType = PointerType::get(M.getContext(), 0); GuardFnGlobal = M.getOrInsertGlobal(GuardFnName, GuardFnPtrType, [&] { auto *Var = new GlobalVariable(M, GuardFnPtrType, false, diff --git llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp index 6327cea64c0d..cc462011a624 100644 --- llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp +++ llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp @@ -81,9 +81,9 @@ struct RematGraph { !Checker.isDefinitionAcrossSuspend(*D, FirstUse)) continue; - if (Remats.count(D)) { + if (auto It = Remats.find(D); It != Remats.end()) { // Already have this in the graph - N->Operands.push_back(Remats[D].get()); + N->Operands.push_back(It->second.get()); continue; } diff --git llvm/lib/Transforms/HipStdPar/HipStdPar.cpp llvm/lib/Transforms/HipStdPar/HipStdPar.cpp index 92042ddab38d..895c8c9d4868 100644 --- llvm/lib/Transforms/HipStdPar/HipStdPar.cpp +++ llvm/lib/Transforms/HipStdPar/HipStdPar.cpp @@ -279,10 +279,11 @@ HipStdParAllocationInterpositionPass::run(Module &M, ModuleAnalysisManager&) { for (auto &&F : M) { if (!F.hasName()) continue; - if (!AllocReplacements.contains(F.getName())) + auto It = AllocReplacements.find(F.getName()); + if (It == AllocReplacements.end()) continue; - if (auto R = M.getFunction(AllocReplacements[F.getName()])) { + if (auto R = M.getFunction(It->second)) { F.replaceAllUsesWith(R); } else { std::string W; diff --git llvm/lib/Transforms/IPO/AttributorAttributes.cpp llvm/lib/Transforms/IPO/AttributorAttributes.cpp index 46d6d66593b4..e897632489bb 100644 --- llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -12344,7 +12344,7 @@ struct AAIndirectCallInfoCallSite : public AAIndirectCallInfo { ChangeStatus Changed = ChangeStatus::UNCHANGED; Value *FP = CB->getCalledOperand(); if (FP->getType()->getPointerAddressSpace()) - FP = new AddrSpaceCastInst(FP, PointerType::get(FP->getType(), 0), + FP = new AddrSpaceCastInst(FP, PointerType::get(FP->getContext(), 0), FP->getName() + ".as0", CB->getIterator()); bool CBIsVoid = CB->getType()->isVoidTy(); diff --git llvm/lib/Transforms/IPO/FunctionAttrs.cpp llvm/lib/Transforms/IPO/FunctionAttrs.cpp index 56bfc8432cbb..17f946e5acdf 100644 --- llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -633,10 +633,12 @@ ArgumentAccessInfo getArgmentAccessInfo(const Instruction *I, [](Value *Length, std::optional<int64_t> Offset) -> std::optional<ConstantRange> { auto *ConstantLength = dyn_cast<ConstantInt>(Length); - if (ConstantLength && Offset && !ConstantLength->isNegative()) + if (ConstantLength && Offset && + ConstantLength->getValue().isStrictlyPositive()) { return ConstantRange( APInt(64, *Offset, true), APInt(64, *Offset + ConstantLength->getSExtValue(), true)); + } return std::nullopt; }; if (auto *SI = dyn_cast<StoreInst>(I)) { diff --git llvm/lib/Transforms/IPO/GlobalOpt.cpp llvm/lib/Transforms/IPO/GlobalOpt.cpp index bf0cacc6224b..eb97d8b4a74f 100644 --- llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -2785,8 +2785,16 @@ static bool OptimizeNonTrivialIFuncs( } else { // We can't reason much about non-FMV callers. Just pick the highest // priority callee if it matches, otherwise bail. 
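Several hunks above (MaterializationUtils, HipStdPar) share one micro-optimization: a membership test (`count`/`contains`) followed by `operator[]` performs two hash lookups, while a single `find` performs one and reuses the iterator. A generic sketch of the pattern (all names illustrative):

```cpp
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
using namespace llvm;

// One lookup instead of two: keep the iterator from find() rather than
// re-hashing the key with operator[] after a contains() check.
static const char *lookupReplacement(const StringMap<const char *> &Map,
                                     StringRef Name) {
  auto It = Map.find(Name);
  if (It == Map.end())
    return nullptr; // no replacement registered for this symbol
  return It->second;
}
```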
- if (I > 0 || !implies(CallerBits, CalleeBits)) - continue; + // if (I > 0 || !implies(CallerBits, CalleeBits)) + // + // FIXME: This is causing a regression in the llvm test suite, + // specifically a 'predres' version is unexpectedly trapping on + // GravitonG4. My explanation is that when the caller is not a + // versioned function, the compiler exclusively relies on the + // command line option or target attribute to deduce whether a + // feature is available. However, there is no guarantee that in + // reality the host supports those implied features. + continue; } auto &Calls = CallSites[Caller]; for (CallBase *CS : Calls) diff --git llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index 61a8f4a448bb..988e912b2de8 100644 --- llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -821,19 +821,31 @@ struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> { IndexCall *operator->() { return this; } - PointerUnion<CallsiteInfo *, AllocInfo *> getBase() const { return *this; } - void print(raw_ostream &OS) const { - if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(getBase())) { + PointerUnion<CallsiteInfo *, AllocInfo *> Base = *this; + if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(Base)) { OS << *AI; } else { - auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(getBase()); + auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(Base); assert(CI); OS << *CI; } } }; +} // namespace + +namespace llvm { +template <> struct simplify_type<IndexCall> { + using SimpleType = PointerUnion<CallsiteInfo *, AllocInfo *>; + static SimpleType getSimplifiedValue(IndexCall &Val) { return Val; } +}; +template <> struct simplify_type<const IndexCall> { + using SimpleType = const PointerUnion<CallsiteInfo *, AllocInfo *>; + static SimpleType getSimplifiedValue(const IndexCall &Val) { return Val; } +}; +} // namespace llvm +namespace { /// CRTP derived class for graphs built from summary index (ThinLTO). class IndexCallsiteContextGraph : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary, @@ -1877,9 +1889,9 @@ uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) { } uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) { - assert(isa<CallsiteInfo *>(Call.getBase())); + assert(isa<CallsiteInfo *>(Call)); CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator> - CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase())); + CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call)); // Need to convert index into stack id.
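The `simplify_type` specializations added above are what allow the follow-on hunks to call `isa<CallsiteInfo *>(Call)` and `dyn_cast_if_present<...>(Call)` on an `IndexCall` directly, dropping the `getBase()` hop. A minimal sketch of the mechanism with stand-in types (`Foo`, `Bar`, `TaggedPtr` are illustrative, not LLVM types):

```cpp
#include "llvm/ADT/PointerUnion.h"
#include "llvm/Support/Casting.h"
using namespace llvm;

struct Foo { int X = 0; };
struct Bar { int Y = 0; };

// A wrapper that is-a PointerUnion, mirroring IndexCall in the patch.
struct TaggedPtr : public PointerUnion<Foo *, Bar *> {
  using PointerUnion::PointerUnion;
};

namespace llvm {
// Teach the cast machinery to see through the wrapper, so isa<Foo *>(T)
// and dyn_cast_if_present<Bar *>(T) work on a TaggedPtr directly.
template <> struct simplify_type<TaggedPtr> {
  using SimpleType = PointerUnion<Foo *, Bar *>;
  static SimpleType getSimplifiedValue(TaggedPtr &Val) { return Val; }
};
} // namespace llvm
```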
return Index.getStackIdAtIndex(CallsiteContext.back()); } @@ -1911,10 +1923,10 @@ std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func, unsigned CloneNo) const { auto VI = FSToVIMap.find(Func); assert(VI != FSToVIMap.end()); - if (isa<AllocInfo *>(Call.getBase())) + if (isa<AllocInfo *>(Call)) return (VI->second.name() + " -> alloc").str(); else { - auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call.getBase()); + auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call); return (VI->second.name() + " -> " + getMemProfFuncName(Callsite->Callee.name(), Callsite->Clones[CloneNo])) @@ -1933,9 +1945,9 @@ ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall( std::vector<uint64_t> IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) { - assert(isa<CallsiteInfo *>(Call.getBase())); + assert(isa<CallsiteInfo *>(Call)); CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator> - CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase())); + CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call)); return getStackIdsWithContextNodes<CallsiteInfo, SmallVector<unsigned>::const_iterator>( CallsiteContext); @@ -2696,8 +2708,7 @@ bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls( const FunctionSummary * IndexCallsiteContextGraph::getCalleeFunc(IndexCall &Call) { - ValueInfo Callee = - dyn_cast_if_present<CallsiteInfo *>(Call.getBase())->Callee; + ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee; if (Callee.getSummaryList().empty()) return nullptr; return dyn_cast<FunctionSummary>(Callee.getSummaryList()[0]->getBaseObject()); @@ -2707,8 +2718,7 @@ bool IndexCallsiteContextGraph::calleeMatchesFunc( IndexCall &Call, const FunctionSummary *Func, const FunctionSummary *CallerFunc, std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) { - ValueInfo Callee = - dyn_cast_if_present<CallsiteInfo *>(Call.getBase())->Callee; + ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee; // If there is no summary list then this is a call to an externally defined // symbol. AliasSummary *Alias = @@ -2751,10 +2761,8 @@ bool IndexCallsiteContextGraph::calleeMatchesFunc( } bool IndexCallsiteContextGraph::sameCallee(IndexCall &Call1, IndexCall &Call2) { - ValueInfo Callee1 = - dyn_cast_if_present<CallsiteInfo *>(Call1.getBase())->Callee; - ValueInfo Callee2 = - dyn_cast_if_present<CallsiteInfo *>(Call2.getBase())->Callee; + ValueInfo Callee1 = dyn_cast_if_present<CallsiteInfo *>(Call1)->Callee; + ValueInfo Callee2 = dyn_cast_if_present<CallsiteInfo *>(Call2)->Callee; return Callee1 == Callee2; } @@ -3610,7 +3618,7 @@ IndexCallsiteContextGraph::cloneFunctionForCallsite( // Confirm this matches the CloneNo provided by the caller, which is based on // the number of function clones we have. assert(CloneNo == - (isa<AllocInfo *>(Call.call().getBase()) + (isa<AllocInfo *>(Call.call()) ? Call.call().dyn_cast<AllocInfo *>()->Versions.size() : Call.call().dyn_cast<CallsiteInfo *>()->Clones.size())); // Walk all the instructions in this function. 
Create a new version for diff --git llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 842881156dc6..e2b81ba864c3 100644 --- llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1860,6 +1860,33 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { return CastInst::Create(Instruction::ZExt, NarrowMaxMin, II->getType()); } } + // If C is not 0: + // umax(nuw_shl(x, C), x + 1) -> x == 0 ? 1 : nuw_shl(x, C) + // If C is not 0 or 1: + // umax(nuw_mul(x, C), x + 1) -> x == 0 ? 1 : nuw_mul(x, C) + auto foldMaxMulShift = [&](Value *A, Value *B) -> Instruction * { + const APInt *C; + Value *X; + if (!match(A, m_NUWShl(m_Value(X), m_APInt(C))) && + !(match(A, m_NUWMul(m_Value(X), m_APInt(C))) && !C->isOne())) + return nullptr; + if (C->isZero()) + return nullptr; + if (!match(B, m_OneUse(m_Add(m_Specific(X), m_One())))) + return nullptr; + + Value *Cmp = Builder.CreateICmpEQ(X, ConstantInt::get(X->getType(), 0)); + Value *NewSelect = + Builder.CreateSelect(Cmp, ConstantInt::get(X->getType(), 1), A); + return replaceInstUsesWith(*II, NewSelect); + }; + + if (IID == Intrinsic::umax) { + if (Instruction *I = foldMaxMulShift(I0, I1)) + return I; + if (Instruction *I = foldMaxMulShift(I1, I0)) + return I; + } // If both operands of unsigned min/max are sign-extended, it is still ok // to narrow the operation. [[fallthrough]]; @@ -4179,13 +4206,14 @@ Instruction *InstCombinerImpl::visitCallBase(CallBase &Call) { DenseMap<Value *, unsigned> Val2Idx; std::vector<Value *> NewLiveGc; for (Value *V : Bundle->Inputs) { - if (Val2Idx.count(V)) + auto [It, Inserted] = Val2Idx.try_emplace(V); + if (!Inserted) continue; if (LiveGcValues.count(V)) { - Val2Idx[V] = NewLiveGc.size(); + It->second = NewLiveGc.size(); NewLiveGc.push_back(V); } else - Val2Idx[V] = NumOfGCLives; + It->second = NumOfGCLives; } // Update all gc.relocates for (const GCRelocateInst *Reloc : GCSP.getGCRelocates()) { diff --git llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 2e4572575994..5a4791870ac7 100644 --- llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -2674,10 +2674,41 @@ Instruction *InstCombinerImpl::foldICmpShrConstant(ICmpInst &Cmp, Instruction *InstCombinerImpl::foldICmpSRemConstant(ICmpInst &Cmp, BinaryOperator *SRem, const APInt &C) { + const ICmpInst::Predicate Pred = Cmp.getPredicate(); + if (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_ULT) { + // Canonicalize unsigned predicates to signed: + // (X s% DivisorC) u> C -> (X s% DivisorC) s< 0 + // iff (C s< 0 ? ~C : C) u>= abs(DivisorC)-1 + // (X s% DivisorC) u< C+1 -> (X s% DivisorC) s> -1 + // iff (C+1 s< 0 ? 
~C : C) u>= abs(DivisorC)-1 + + const APInt *DivisorC; + if (!match(SRem->getOperand(1), m_APInt(DivisorC))) + return nullptr; + + APInt NormalizedC = C; + if (Pred == ICmpInst::ICMP_ULT) { + assert(!NormalizedC.isZero() && + "ult X, 0 should have been simplified already."); + --NormalizedC; + } + if (C.isNegative()) + NormalizedC.flipAllBits(); + assert(!DivisorC->isZero() && + "srem X, 0 should have been simplified already."); + if (!NormalizedC.uge(DivisorC->abs() - 1)) + return nullptr; + + Type *Ty = SRem->getType(); + if (Pred == ICmpInst::ICMP_UGT) + return new ICmpInst(ICmpInst::ICMP_SLT, SRem, + ConstantInt::getNullValue(Ty)); + return new ICmpInst(ICmpInst::ICMP_SGT, SRem, + ConstantInt::getAllOnesValue(Ty)); + } // Match an 'is positive' or 'is negative' comparison of remainder by a // constant power-of-2 value: // (X % pow2C) sgt/slt 0 - const ICmpInst::Predicate Pred = Cmp.getPredicate(); if (Pred != ICmpInst::ICMP_SGT && Pred != ICmpInst::ICMP_SLT && Pred != ICmpInst::ICMP_EQ && Pred != ICmpInst::ICMP_NE) return nullptr; diff --git llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index cb8458831849..cca6f78084b4 100644 --- llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -1035,7 +1035,8 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { RuntimeCallInserter &RTCI) : F(F), ASan(ASan), RTCI(RTCI), DIB(*F.getParent(), /*AllowUnresolved*/ false), C(ASan.C), - IntptrTy(ASan.IntptrTy), IntptrPtrTy(PointerType::get(IntptrTy, 0)), + IntptrTy(ASan.IntptrTy), + IntptrPtrTy(PointerType::get(IntptrTy->getContext(), 0)), Mapping(ASan.Mapping), PoisonStack(ClStack && !Triple(F.getParent()->getTargetTriple()).isAMDGPU()) {} @@ -1882,7 +1883,7 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns, Type *ShadowTy = IntegerType::get(*C, std::max(8U, TypeStoreSize >> Mapping.Scale)); - Type *ShadowPtrTy = PointerType::get(ShadowTy, 0); + Type *ShadowPtrTy = PointerType::get(*C, 0); Value *ShadowPtr = memToShadow(AddrLong, IRB); const uint64_t ShadowAlign = std::max<uint64_t>(Alignment.valueOrOne().value() >> Mapping.Scale, 1); diff --git llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp index e226727e64d3..fd69b3f244ec 100644 --- llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp +++ llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp @@ -1790,13 +1790,12 @@ Value *DFSanFunction::getArgTLS(Type *T, unsigned ArgOffset, IRBuilder<> &IRB) { Value *Base = IRB.CreatePointerCast(DFS.ArgTLS, DFS.IntptrTy); if (ArgOffset) Base = IRB.CreateAdd(Base, ConstantInt::get(DFS.IntptrTy, ArgOffset)); - return IRB.CreateIntToPtr(Base, PointerType::get(DFS.getShadowTy(T), 0), - "_dfsarg"); + return IRB.CreateIntToPtr(Base, PointerType::get(*DFS.Ctx, 0), "_dfsarg"); } Value *DFSanFunction::getRetvalTLS(Type *T, IRBuilder<> &IRB) { - return IRB.CreatePointerCast( - DFS.RetvalTLS, PointerType::get(DFS.getShadowTy(T), 0), "_dfsret"); + return IRB.CreatePointerCast(DFS.RetvalTLS, PointerType::get(*DFS.Ctx, 0), + "_dfsret"); } Value *DFSanFunction::getRetvalOriginTLS() { return DFS.RetvalOriginTLS; } @@ -1925,9 +1924,7 @@ DataFlowSanitizer::getShadowOriginAddress(Value *Addr, Align InstAlignment, ShadowLong = IRB.CreateAdd(ShadowLong, ConstantInt::get(IntptrTy, ShadowBase)); } - IntegerType *ShadowTy = IntegerType::get(*Ctx, ShadowWidthBits); - Value *ShadowPtr = - 
IRB.CreateIntToPtr(ShadowLong, PointerType::get(ShadowTy, 0)); + Value *ShadowPtr = IRB.CreateIntToPtr(ShadowLong, PointerType::get(*Ctx, 0)); Value *OriginPtr = nullptr; if (shouldTrackOrigins()) { Value *OriginLong = ShadowOffset; @@ -2491,8 +2488,8 @@ void DFSanFunction::paintOrigin(IRBuilder<> &IRB, Value *Origin, Align CurrentAlignment = Alignment; if (Alignment >= IntptrAlignment && IntptrSize > OriginSize) { Value *IntptrOrigin = originToIntptr(IRB, Origin); - Value *IntptrStoreOriginPtr = IRB.CreatePointerCast( - StoreOriginAddr, PointerType::get(DFS.IntptrTy, 0)); + Value *IntptrStoreOriginPtr = + IRB.CreatePointerCast(StoreOriginAddr, PointerType::get(*DFS.Ctx, 0)); for (unsigned I = 0; I < StoreOriginSize / IntptrSize; ++I) { Value *Ptr = I ? IRB.CreateConstGEP1_32(DFS.IntptrTy, IntptrStoreOriginPtr, I) diff --git llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp index 6e86ffdc8027..5e204d736237 100644 --- llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp +++ llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp @@ -1012,7 +1012,7 @@ void GCOVProfiler::emitGlobalConstructor( IRBuilder<> Builder(BB); FTy = FunctionType::get(Type::getVoidTy(*Ctx), false); - auto *PFTy = PointerType::get(FTy, 0); + auto *PFTy = PointerType::get(*Ctx, 0); FTy = FunctionType::get(Builder.getVoidTy(), {PFTy, PFTy}, false); // Initialize the environment and register the local writeout, flush and diff --git llvm/lib/Transforms/Instrumentation/MemProfiler.cpp llvm/lib/Transforms/Instrumentation/MemProfiler.cpp index f1580b025efc..9a7eec76cc1d 100644 --- llvm/lib/Transforms/Instrumentation/MemProfiler.cpp +++ llvm/lib/Transforms/Instrumentation/MemProfiler.cpp @@ -511,7 +511,7 @@ void MemProfiler::instrumentAddress(Instruction *OrigIns, } Type *ShadowTy = ClHistogram ? 
Type::getInt8Ty(*C) : Type::getInt64Ty(*C); - Type *ShadowPtrTy = PointerType::get(ShadowTy, 0); + Type *ShadowPtrTy = PointerType::get(*C, 0); Value *ShadowPtr = memToShadow(AddrLong, IRB); Value *ShadowAddr = IRB.CreateIntToPtr(ShadowPtr, ShadowPtrTy); diff --git llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 6daee7a3b6e8..587e5c1cc842 100644 --- llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -6188,7 +6188,7 @@ struct VarArgI386Helper : public VarArgHelperBase { Type *RegSaveAreaPtrTy = PointerType::getUnqual(*MS.C); Value *RegSaveAreaPtrPtr = IRB.CreateIntToPtr(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy), - PointerType::get(RegSaveAreaPtrTy, 0)); + PointerType::get(*MS.C, 0)); Value *RegSaveAreaPtr = IRB.CreateLoad(RegSaveAreaPtrTy, RegSaveAreaPtrPtr); Value *RegSaveAreaShadowPtr, *RegSaveAreaOriginPtr; @@ -6273,7 +6273,7 @@ struct VarArgGenericHelper : public VarArgHelperBase { Type *RegSaveAreaPtrTy = PointerType::getUnqual(*MS.C); Value *RegSaveAreaPtrPtr = IRB.CreateIntToPtr(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy), - PointerType::get(RegSaveAreaPtrTy, 0)); + PointerType::get(*MS.C, 0)); Value *RegSaveAreaPtr = IRB.CreateLoad(RegSaveAreaPtrTy, RegSaveAreaPtrPtr); Value *RegSaveAreaShadowPtr, *RegSaveAreaOriginPtr; diff --git llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp index 2ae810621866..94101f9663a8 100644 --- llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp +++ llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp @@ -598,7 +598,7 @@ bool TypeSanitizer::instrumentWithShadowUpdate( Value *ShadowDataInt = convertToShadowDataInt(IRB, Ptr, IntptrTy, PtrShift, ShadowBase, AppMemMask); - Type *Int8PtrPtrTy = PointerType::get(IRB.getPtrTy(), 0); + Type *Int8PtrPtrTy = PointerType::get(IRB.getContext(), 0); Value *ShadowData = IRB.CreateIntToPtr(ShadowDataInt, Int8PtrPtrTy, "shadow.ptr"); diff --git llvm/lib/Transforms/Scalar/LoopInterchange.cpp llvm/lib/Transforms/Scalar/LoopInterchange.cpp index a0c0080c0bda..5bcc5e41a0e8 100644 --- llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -57,6 +57,14 @@ static cl::opt<int> LoopInterchangeCostThreshold( "loop-interchange-threshold", cl::init(0), cl::Hidden, cl::desc("Interchange if you gain more than this number")); +// Maximum number of load-stores that can be handled in the dependency matrix. +static cl::opt<unsigned int> MaxMemInstrCount( + "loop-interchange-max-meminstr-count", cl::init(64), cl::Hidden, + cl::desc( + "Maximum number of load-store instructions that should be handled " + "in the dependency matrix. Higher value may lead to more interchanges " + "at the cost of compile-time")); + namespace { using LoopVector = SmallVector<Loop *, 8>; @@ -66,9 +74,6 @@ using CharMatrix = std::vector<std::vector<char>>; } // end anonymous namespace -// Maximum number of dependencies that can be handled in the dependency matrix. -static const unsigned MaxMemInstrCount = 100; - // Maximum loop depth supported. 
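The `PointerType::get` hunks running through this part of the patch (CFGuard, Attributor, ASan, DFSan, GCOVProfiler, MemProfiler, and below MSan, TypeSanitizer, CodeExtractor, LowerGlobalDtors, ModuleUtils) are a single opaque-pointer cleanup: every pointer in a given address space is now the same `ptr` type, so only a context and address space are needed, and bitcasts between pointer types disappear. A minimal sketch, assuming a standalone helper (`intToShadowPtr` is illustrative):

```cpp
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// With opaque pointers the pointee type no longer participates in the
// pointer type: PointerType::get(Ctx, AS) replaces the pointee-typed
// PointerType::get(PointeeTy, AS), and no CreateBitCast is needed to
// "retype" the pointer afterwards.
static Value *intToShadowPtr(IRBuilder<> &IRB, Value *ShadowLong) {
  PointerType *PtrTy = PointerType::get(IRB.getContext(), /*AddrSpace=*/0);
  return IRB.CreateIntToPtr(ShadowLong, PtrTy, "shadow.ptr");
}
```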
static const unsigned MaxLoopNestDepth = 10; @@ -84,7 +89,8 @@ static void printDepMatrix(CharMatrix &DepMatrix) { static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, Loop *L, DependenceInfo *DI, - ScalarEvolution *SE) { + ScalarEvolution *SE, + OptimizationRemarkEmitter *ORE) { using ValueVector = SmallVector<Value *, 16>; ValueVector MemInstr; @@ -109,7 +115,18 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, LLVM_DEBUG(dbgs() << "Found " << MemInstr.size() << " Loads and Stores to analyze\n"); - + if (MemInstr.size() > MaxMemInstrCount) { + LLVM_DEBUG(dbgs() << "The transform doesn't support more than " + << MaxMemInstrCount << " load/stores in a loop\n"); + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLoop", + L->getStartLoc(), L->getHeader()) + << "Number of loads/stores exceeded, the supported maximum " + "can be increased with option " + "-loop-interchange-max-meminstr-count."; + }); + return false; + } ValueVector::iterator I, IE, J, JE; StringSet<> Seen; @@ -136,23 +153,17 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, unsigned Levels = D->getLevels(); char Direction; for (unsigned II = 1; II <= Levels; ++II) { - if (D->isScalar(II)) { - Direction = 'S'; - Dep.push_back(Direction); - } else { - unsigned Dir = D->getDirection(II); - if (Dir == Dependence::DVEntry::LT || - Dir == Dependence::DVEntry::LE) - Direction = '<'; - else if (Dir == Dependence::DVEntry::GT || - Dir == Dependence::DVEntry::GE) - Direction = '>'; - else if (Dir == Dependence::DVEntry::EQ) - Direction = '='; - else - Direction = '*'; - Dep.push_back(Direction); - } + unsigned Dir = D->getDirection(II); + if (Dir == Dependence::DVEntry::LT || Dir == Dependence::DVEntry::LE) + Direction = '<'; + else if (Dir == Dependence::DVEntry::GT || + Dir == Dependence::DVEntry::GE) + Direction = '>'; + else if (Dir == Dependence::DVEntry::EQ) + Direction = '='; + else + Direction = '*'; + Dep.push_back(Direction); } while (Dep.size() != Level) { Dep.push_back('I'); @@ -161,12 +172,6 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, // Make sure we only add unique entries to the dependency matrix. if (Seen.insert(StringRef(Dep.data(), Dep.size())).second) DepMatrix.push_back(Dep); - - if (DepMatrix.size() > MaxMemInstrCount) { - LLVM_DEBUG(dbgs() << "Cannot handle more than " << MaxMemInstrCount - << " dependencies inside loop\n"); - return false; - } } } } @@ -450,7 +455,7 @@ struct LoopInterchange { CharMatrix DependencyMatrix; Loop *OuterMostLoop = *(LoopList.begin()); if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth, - OuterMostLoop, DI, SE)) { + OuterMostLoop, DI, SE, ORE)) { LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n"); return false; } @@ -1725,10 +1730,15 @@ PreservedAnalyses LoopInterchangePass::run(LoopNest &LN, LPMUpdater &U) { Function &F = *LN.getParent(); SmallVector<Loop *, 8> LoopList(LN.getLoops()); + + if (MaxMemInstrCount < 1) { + LLVM_DEBUG(dbgs() << "MaxMemInstrCount should be at least 1"); + return PreservedAnalyses::all(); + } + // Ensure minimum depth of the loop nest to do the interchange.
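The LoopInterchange hunks above replace a hard-coded dependency-matrix cap with a hidden `cl::opt` and report a remark when the cap is hit. A condensed sketch of the pattern (the option name matches the patch; `withinMemInstrBudget` is an illustrative wrapper):

```cpp
#include "llvm/Support/CommandLine.h"
using namespace llvm;

// A hidden tuning knob replacing 'static const unsigned MaxMemInstrCount'.
static cl::opt<unsigned> MaxMemInstrCount(
    "loop-interchange-max-meminstr-count", cl::init(64), cl::Hidden,
    cl::desc("Maximum number of load/store instructions handled in the "
             "dependency matrix"));

// Bail out before building an oversized dependency matrix; the caller
// emits an OptimizationRemarkMissed naming the flag, as in the patch.
static bool withinMemInstrBudget(unsigned NumMemInstrs) {
  return NumMemInstrs <= MaxMemInstrCount;
}
```

Note the separate guard in `run()`: a user-supplied value below 1 makes the pass skip the loop nest entirely rather than proceed with a meaningless cap.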
if (!hasMinimumLoopDepth(LoopList)) return PreservedAnalyses::all(); - DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI); std::unique_ptr<CacheCost> CC = CacheCost::getCacheCost(LN.getOutermostLoop(), AR, DI); diff --git llvm/lib/Transforms/Scalar/Reassociate.cpp llvm/lib/Transforms/Scalar/Reassociate.cpp index bc50f23d8eb2..9361ea063c1d 100644 --- llvm/lib/Transforms/Scalar/Reassociate.cpp +++ llvm/lib/Transforms/Scalar/Reassociate.cpp @@ -2174,13 +2174,14 @@ void ReassociatePass::OptimizeInst(Instruction *I) { if (isa<FPMathOperator>(I) && !hasFPAssociativeFlags(I)) return; - // Do not reassociate boolean (i1) expressions. We want to preserve the + // Do not reassociate boolean (i1/vXi1) expressions. We want to preserve the // original order of evaluation for short-circuited comparisons that // SimplifyCFG has folded to AND/OR expressions. If the expression // is not further optimized, it is likely to be transformed back to a // short-circuited form for code gen, and the source order may have been - // optimized for the most likely conditions. - if (I->getType()->isIntegerTy(1)) + // optimized for the most likely conditions. For vector boolean expressions, + // we should be optimizing for ILP and not serializing the logical operations. + if (I->getType()->isIntOrIntVectorTy(1)) return; // If this is a bitwise or instruction of operands diff --git llvm/lib/Transforms/Utils/CodeExtractor.cpp llvm/lib/Transforms/Utils/CodeExtractor.cpp index af9813775f24..ecc91739a796 100644 --- llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -849,7 +849,7 @@ Function *CodeExtractor::constructFunctionDeclaration( StructValues.insert(output); } else ParamTy.push_back( - PointerType::get(output->getType(), DL.getAllocaAddrSpace())); + PointerType::get(output->getContext(), DL.getAllocaAddrSpace())); } assert( @@ -863,7 +863,7 @@ Function *CodeExtractor::constructFunctionDeclaration( if (!AggParamTy.empty()) { StructTy = StructType::get(M->getContext(), AggParamTy); ParamTy.push_back(PointerType::get( - StructTy, ArgsInZeroAddressSpace ? 0 : DL.getAllocaAddrSpace())); + M->getContext(), ArgsInZeroAddressSpace ? 0 : DL.getAllocaAddrSpace())); } Type *RetTy = getSwitchType(); diff --git llvm/lib/Transforms/Utils/LowerGlobalDtors.cpp llvm/lib/Transforms/Utils/LowerGlobalDtors.cpp index cd7960065703..ff72ba073ad0 100644 --- llvm/lib/Transforms/Utils/LowerGlobalDtors.cpp +++ llvm/lib/Transforms/Utils/LowerGlobalDtors.cpp @@ -138,7 +138,7 @@ static bool runImpl(Module &M) { FunctionCallee AtExit = M.getOrInsertFunction( "__cxa_atexit", FunctionType::get(Type::getInt32Ty(C), - {PointerType::get(AtExitFuncTy, 0), VoidStar, VoidStar}, + {PointerType::get(C, 0), VoidStar, VoidStar}, /*isVarArg=*/false)); // If __cxa_atexit is defined (e.g. in the case of LTO) and arg0 is not diff --git llvm/lib/Transforms/Utils/ModuleUtils.cpp llvm/lib/Transforms/Utils/ModuleUtils.cpp index 7249571f3449..1c31e851ef4b 100644 --- llvm/lib/Transforms/Utils/ModuleUtils.cpp +++ llvm/lib/Transforms/Utils/ModuleUtils.cpp @@ -29,7 +29,6 @@ using namespace llvm; static void appendToGlobalArray(StringRef ArrayName, Module &M, Function *F, int Priority, Constant *Data) { IRBuilder<> IRB(M.getContext()); - FunctionType *FnTy = FunctionType::get(IRB.getVoidTy(), false); // Get the current set of static global constructors and add the new ctor // to the list. 
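The Reassociate hunk above widens the bail-out from scalar `i1` to any boolean type: `isIntegerTy(1)` matches only `i1`, while `isIntOrIntVectorTy(1)` also matches `<N x i1>`. A tiny sketch of the predicate (helper name illustrative):

```cpp
#include "llvm/IR/Type.h"
using namespace llvm;

// True for i1 and for vector-of-i1 types such as <4 x i1>; reassociation
// is skipped for both so the source evaluation order is preserved.
static bool isBooleanLike(Type *Ty) { return Ty->isIntOrIntVectorTy(1); }
```

For vector booleans the motivation differs, as the updated comment notes: the concern is ILP rather than short-circuit order, but the conservative treatment is the same.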
@@ -45,9 +44,9 @@ static void appendToGlobalArray(StringRef ArrayName, Module &M, Function *F, } GVCtor->eraseFromParent(); } else { - EltTy = StructType::get(IRB.getInt32Ty(), - PointerType::get(FnTy, F->getAddressSpace()), - IRB.getPtrTy()); + EltTy = StructType::get( + IRB.getInt32Ty(), + PointerType::get(M.getContext(), F->getAddressSpace()), IRB.getPtrTy()); } // Build a 3 field global_ctor entry. We don't take a comdat key. @@ -268,7 +267,7 @@ std::pair<Function *, FunctionCallee> llvm::createSanitizerCtorAndInitFunctions( BasicBlock::Create(M.getContext(), "callfunc", Ctor, RetBB); auto *InitFn = cast<Function>(InitFunction.getCallee()); auto *InitFnPtr = - PointerType::get(InitFn->getType(), InitFn->getAddressSpace()); + PointerType::get(M.getContext(), InitFn->getAddressSpace()); IRB.SetInsertPoint(EntryBB); Value *InitNotNull = IRB.CreateICmpNE(InitFn, ConstantPointerNull::get(InitFnPtr)); diff --git llvm/lib/Transforms/Vectorize/CMakeLists.txt llvm/lib/Transforms/Vectorize/CMakeLists.txt index 6a025652f92f..e5fabd318b82 100644 --- llvm/lib/Transforms/Vectorize/CMakeLists.txt +++ llvm/lib/Transforms/Vectorize/CMakeLists.txt @@ -13,6 +13,7 @@ add_llvm_component_library(LLVMVectorize SandboxVectorizer/SandboxVectorizerPassBuilder.cpp SandboxVectorizer/Scheduler.cpp SandboxVectorizer/SeedCollector.cpp + SandboxVectorizer/VecUtils.cpp SLPVectorizer.cpp Vectorize.cpp VectorCombine.cpp diff --git llvm/lib/Transforms/Vectorize/LoopVectorize.cpp llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index d79d9e8445b3..29f3940ed6fa 100644 --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -543,11 +543,6 @@ public: protected: friend class LoopVectorizationPlanner; - /// Set up the values of the IVs correctly when exiting the vector loop. - virtual void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, - Value *VectorTripCount, BasicBlock *MiddleBlock, - VPTransformState &State); - /// Iteratively sink the scalarized operands of a predicated instruction into /// the block that was created for it. void sinkScalarOperands(Instruction *PredInst); @@ -785,10 +780,6 @@ protected: BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue); void printDebugTracesAtStart() override; void printDebugTracesAtEnd() override; - - void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, - Value *VectorTripCount, BasicBlock *MiddleBlock, - VPTransformState &State) override {}; }; // A specialized derived class of inner loop vectorizer that performs @@ -1447,11 +1438,11 @@ public: // Override forced styles if needed. // FIXME: use actual opcode/data type for analysis here. // FIXME: Investigate opportunity for fixed vector factor. + // FIXME: support fixed-order recurrences by fixing splice of non VFxUF + // penultimate EVL. bool EVLIsLegal = UserIC <= 1 && TTI.hasActiveVectorLength(0, nullptr, Align()) && - !EnableVPlanNativePath && - // FIXME: remove this once fixed-ordered recurrence is supported. - Legal->getFixedOrderRecurrences().empty(); + !EnableVPlanNativePath && Legal->getFixedOrderRecurrences().empty(); if (!EVLIsLegal) { // If for some reason EVL mode is unsupported, fallback to // DataWithoutLaneMask to try to vectorize the loop with folded tail @@ -2782,97 +2773,6 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton( return LoopVectorPreHeader; } -// Fix up external users of the induction variable. 
At this point, we are -// in LCSSA form, with all external PHIs that use the IV having one input value, -// coming from the remainder loop. We need those PHIs to also have a correct -// value for the IV when arriving directly from the middle block. -void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, - const InductionDescriptor &II, - Value *VectorTripCount, - BasicBlock *MiddleBlock, - VPTransformState &State) { - // There are two kinds of external IV usages - those that use the value - // computed in the last iteration (the PHI) and those that use the penultimate - // value (the value that feeds into the phi from the loop latch). - // We allow both, but they, obviously, have different values. - - DenseMap<Value *, Value *> MissingVals; - - Value *EndValue = cast<PHINode>(OrigPhi->getIncomingValueForBlock( - OrigLoop->getLoopPreheader())) - ->getIncomingValueForBlock(MiddleBlock); - - // An external user of the last iteration's value should see the value that - // the remainder loop uses to initialize its own IV. - Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); - for (User *U : PostInc->users()) { - Instruction *UI = cast<Instruction>(U); - if (!OrigLoop->contains(UI)) { - assert(isa<PHINode>(UI) && "Expected LCSSA form"); - MissingVals[UI] = EndValue; - } - } - - // An external user of the penultimate value need to see EndValue - Step. - // The simplest way to get this is to recompute it from the constituent SCEVs, - // that is Start + (Step * (CRD - 1)). - for (User *U : OrigPhi->users()) { - auto *UI = cast<Instruction>(U); - if (!OrigLoop->contains(UI)) { - assert(isa<PHINode>(UI) && "Expected LCSSA form"); - IRBuilder<> B(MiddleBlock->getTerminator()); - - // Fast-math-flags propagate from the original induction instruction. - if (isa_and_nonnull<FPMathOperator>(II.getInductionBinOp())) - B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); - - VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep()); - assert(StepVPV && "step must have been expanded during VPlan execution"); - Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue() - : State.get(StepVPV, VPLane(0)); - Value *Escape = nullptr; - if (EndValue->getType()->isIntegerTy()) - Escape = B.CreateSub(EndValue, Step); - else if (EndValue->getType()->isPointerTy()) - Escape = B.CreatePtrAdd(EndValue, B.CreateNeg(Step)); - else { - assert(EndValue->getType()->isFloatingPointTy() && - "Unexpected induction type"); - Escape = B.CreateBinOp(II.getInductionBinOp()->getOpcode() == - Instruction::FAdd - ? Instruction::FSub - : Instruction::FAdd, - EndValue, Step); - } - Escape->setName("ind.escape"); - MissingVals[UI] = Escape; - } - } - - assert((MissingVals.empty() || - all_of(MissingVals, - [MiddleBlock, this](const std::pair<Value *, Value *> &P) { - return all_of( - predecessors(cast<Instruction>(P.first)->getParent()), - [MiddleBlock, this](BasicBlock *Pred) { - return Pred == MiddleBlock || - Pred == OrigLoop->getLoopLatch(); - }); - })) && - "Expected escaping values from latch/middle.block only"); - - for (auto &I : MissingVals) { - PHINode *PHI = cast<PHINode>(I.first); - // One corner case we have to handle is two IVs "chasing" each-other, - // that is %IV2 = phi [...], [ %IV1, %latch ] - // In this case, if IV1 has an external use, we need to avoid adding both - // "last value of IV1" and "penultimate value of IV2". So, verify that we - // don't already have an incoming value for the middle block. 
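For readers tracking this removal: `fixupIVUsers` served external users of an induction variable after vectorization, distinguishing users of the final value (`EndValue`) from users of the penultimate value, which it rebuilt as `EndValue - Step`, i.e. `Start + Step * (TripCount - 1)`. A hedged sketch of that arithmetic for an integer IV (`emitPenultimateIVValue` is an illustrative name, not from the patch):

```cpp
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Given the IV's resume value after the vector loop
// (EndValue = Start + Step * VectorTripCount), a user of the phi itself,
// rather than of the latch increment, sees the value one step earlier.
static Value *emitPenultimateIVValue(IRBuilder<> &B, Value *EndValue,
                                     Value *Step) {
  return B.CreateSub(EndValue, Step, "ind.escape");
}
```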
- if (PHI->getBasicBlockIndex(MiddleBlock) == -1) - PHI->addIncoming(I.second, MiddleBlock); - } -} - namespace { struct CSEDenseMapInfo { @@ -2999,24 +2899,6 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { for (PHINode &PN : Exit->phis()) PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN); - if (Cost->requiresScalarEpilogue(VF.isVector())) { - // No edge from the middle block to the unique exit block has been inserted - // and there is nothing to fix from vector loop; phis should have incoming - // from scalar loop only. - } else { - // TODO: Check in VPlan to see if IV users need fixing instead of checking - // the cost model. - - // If we inserted an edge from the middle block to the unique exit block, - // update uses outside the loop (phis) to account for the newly inserted - // edge. - - // Fix-up external users of the induction variables. - for (const auto &Entry : Legal->getInductionVars()) - fixupIVUsers(Entry.first, Entry.second, - getOrCreateVectorTripCount(nullptr), LoopMiddleBlock, State); - } - // Don't apply optimizations below when no vector region remains, as they all // require a vector loop at the moment. if (!State.Plan->getVectorLoopRegion()) @@ -8829,7 +8711,7 @@ void VPRecipeBuilder::collectScaledReductions(VFRange &Range) { PartialReductionChain Chain = Pair.first; if (ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) && ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB)) - ScaledReductionExitInstrs.insert(std::make_pair(Chain.Reduction, Pair)); + ScaledReductionMap.insert(std::make_pair(Chain.Reduction, Pair.second)); } } @@ -8921,9 +8803,8 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); // If the PHI is used by a partial reduction, set the scale factor. - std::optional<std::pair<PartialReductionChain, unsigned>> Pair = - getScaledReductionForInstr(RdxDesc.getLoopExitInstr()); - unsigned ScaleFactor = Pair ? Pair->second : 1; + unsigned ScaleFactor = + getScalingForReduction(RdxDesc.getLoopExitInstr()).value_or(1); PhiRecipe = new VPReductionPHIRecipe( Phi, RdxDesc, *StartV, CM.isInLoopReduction(Phi), CM.useOrderedReductions(RdxDesc), ScaleFactor); @@ -8958,7 +8839,7 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) return tryToWidenMemory(Instr, Operands, Range); - if (getScaledReductionForInstr(Instr)) + if (getScalingForReduction(Instr)) return tryToCreatePartialReduction(Instr, Operands); if (!shouldWiden(Instr, Range)) @@ -9049,11 +8930,9 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, /// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the /// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute /// the end value of the induction. -static VPValue *addResumePhiRecipeForInduction(VPWidenInductionRecipe *WideIV, - VPBuilder &VectorPHBuilder, - VPBuilder &ScalarPHBuilder, - VPTypeAnalysis &TypeInfo, - VPValue *VectorTC) { +static VPInstruction *addResumePhiRecipeForInduction( + VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, + VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC) { auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV); // Truncated wide inductions resume from the last lane of their vector value // in the last vector iteration which is handled elsewhere. 
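The replacement for the deleted IR-level fix-up appears in the hunks that follow: `addScalarResumePhis` now records each wide induction's end value in an `IVEndValues` map, and a VPlan transform rewrites induction exit users from it. Condensed into an illustrative wrapper (`buildAndOptimizeExitValues` is not a real function; the calls inside are the patch's own names):

```cpp
// Sketch of the new VPlan-side flow replacing fixupIVUsers.
static void buildAndOptimizeExitValues(VPlan &Plan,
                                       VPRecipeBuilder &RecipeBuilder) {
  DenseMap<VPValue *, VPValue *> IVEndValues;
  // Create resume phis in the scalar preheader and remember each wide
  // IV's end value for later optimization of exit users.
  addScalarResumePhis(RecipeBuilder, Plan, IVEndValues);
  // ... recipe construction and other VPlan transforms ...
  VPlanTransforms::optimizeInductionExitUsers(Plan, IVEndValues);
}
```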
@@ -9087,8 +8966,10 @@ static VPValue *addResumePhiRecipeForInduction(VPWidenInductionRecipe *WideIV, /// Create resume phis in the scalar preheader for first-order recurrences, /// reductions and inductions, and update the VPIRInstructions wrapping the -/// original phis in the scalar header. -static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) { +/// original phis in the scalar header. End values for inductions are added to +/// \p IVEndValues. +static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan, + DenseMap<VPValue *, VPValue *> &IVEndValues) { VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType()); auto *ScalarPH = Plan.getScalarPreheader(); auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getSinglePredecessor()); @@ -9105,11 +8986,16 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) { if (!ScalarPhiI) break; + // TODO: Extract final value from induction recipe initially, optimize to + // pre-computed end value together in optimizeInductionExitUsers. auto *VectorPhiR = cast<VPHeaderPHIRecipe>(Builder.getRecipe(ScalarPhiI)); if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) { - if (VPValue *ResumePhi = addResumePhiRecipeForInduction( + if (VPInstruction *ResumePhi = addResumePhiRecipeForInduction( WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo, &Plan.getVectorTripCount())) { + assert(ResumePhi->getOpcode() == VPInstruction::ResumePhi && + "Expected a ResumePhi"); + IVEndValues[WideIVR] = ResumePhi->getOperand(0); ScalarPhiIRI->addOperand(ResumePhi); continue; } @@ -9140,65 +9026,6 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) { } } -/// Return true if \p VPV is an optimizable IV or IV use. That is, if \p VPV is -/// either an untruncated wide induction, or if it increments a wide induction -/// by its step. -static bool isOptimizableIVOrUse(VPValue *VPV) { - VPRecipeBase *Def = VPV->getDefiningRecipe(); - if (!Def) - return false; - auto *WideIV = dyn_cast<VPWidenInductionRecipe>(Def); - if (WideIV) { - // VPV itself is a wide induction, separately compute the end value for exit - // users if it is not a truncated IV. - return isa<VPWidenPointerInductionRecipe>(WideIV) || - !cast<VPWidenIntOrFpInductionRecipe>(WideIV)->getTruncInst(); - } - - // Check if VPV is an optimizable induction increment. - if (Def->getNumOperands() != 2) - return false; - WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0)); - if (!WideIV) - WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1)); - if (!WideIV) - return false; - - using namespace VPlanPatternMatch; - auto &ID = WideIV->getInductionDescriptor(); - - // Check if VPV increments the induction by the induction step. - VPValue *IVStep = WideIV->getStepValue(); - switch (ID.getInductionOpcode()) { - case Instruction::Add: - return match(VPV, m_c_Binary<Instruction::Add>(m_Specific(WideIV), - m_Specific(IVStep))); - case Instruction::FAdd: - return match(VPV, m_c_Binary<Instruction::FAdd>(m_Specific(WideIV), - m_Specific(IVStep))); - case Instruction::FSub: - return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV), - m_Specific(IVStep))); - case Instruction::Sub: { - // IVStep will be the negated step of the subtraction. Check if Step == -1 * - // IVStep. 
- VPValue *Step; - if (!match(VPV, m_Binary<Instruction::Sub>(m_VPValue(), m_VPValue(Step))) || - !Step->isLiveIn() || !IVStep->isLiveIn()) - return false; - auto *StepCI = dyn_cast<ConstantInt>(Step->getLiveInIRValue()); - auto *IVStepCI = dyn_cast<ConstantInt>(IVStep->getLiveInIRValue()); - return StepCI && IVStepCI && - StepCI->getValue() == (-1 * IVStepCI->getValue()); - } - default: - return ID.getKind() == InductionDescriptor::IK_PtrInduction && - match(VPV, m_GetElementPtr(m_Specific(WideIV), - m_Specific(WideIV->getStepValue()))); - } - llvm_unreachable("should have been covered by switch above"); -} - // Collect VPIRInstructions for phis in the exit blocks that are modeled // in VPlan and add the exiting VPValue as operand. Some exiting values are not // modeled explicitly yet and won't be included. Those are un-truncated @@ -9228,12 +9055,6 @@ collectUsersInExitBlocks(Loop *OrigLoop, VPRecipeBuilder &Builder, } Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB); VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue); - // Exit values for inductions are computed and updated outside of VPlan - // and independent of induction recipes. - // TODO: Compute induction exit values in VPlan. - if (isOptimizableIVOrUse(V) && - ExitVPBB->getSinglePredecessor() == MiddleVPBB) - continue; ExitUsersToFix.insert(ExitIRI); ExitIRI->addOperand(V); } @@ -9253,6 +9074,7 @@ addUsersInExitBlocks(VPlan &Plan, auto *MiddleVPBB = Plan.getMiddleBlock(); VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi()); + VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType()); // Introduce extract for exiting values and update the VPIRInstructions // modeling the corresponding LCSSA phis. @@ -9574,7 +9396,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { VPlanTransforms::handleUncountableEarlyExit( *Plan, *PSE.getSE(), OrigLoop, UncountableExitingBlock, RecipeBuilder); } - addScalarResumePhis(RecipeBuilder, *Plan); + DenseMap<VPValue *, VPValue *> IVEndValues; + addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues); SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlocks(OrigLoop, RecipeBuilder, *Plan); addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix); @@ -9657,6 +9480,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow, WithoutRuntimeCheck); } + VPlanTransforms::optimizeInductionExitUsers(*Plan, IVEndValues); assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); return Plan; @@ -9708,7 +9532,10 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { auto *HeaderR = cast<VPHeaderPHIRecipe>(&R); RecipeBuilder.setRecipe(HeaderR->getUnderlyingInstr(), HeaderR); } - addScalarResumePhis(RecipeBuilder, *Plan); + DenseMap<VPValue *, VPValue *> IVEndValues; + // TODO: IVEndValues are not used yet in the native path, to optimize exit + // values. + addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues); assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); return Plan; diff --git llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 245b07bcaba5..4c14e4808c17 100644 --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2414,15 +2414,17 @@ public: } /// Go through the instructions in VL and append their operands. 
- void appendOperandsOfVL(ArrayRef<Value *> VL, Instruction *VL0) { + void appendOperandsOfVL(ArrayRef<Value *> VL, const InstructionsState &S) { assert(!VL.empty() && "Bad VL"); assert((empty() || VL.size() == getNumLanes()) && "Expected same number of lanes"); + assert(S.valid() && "InstructionsState is invalid."); // IntrinsicInst::isCommutative returns true if swapping the first "two" // arguments to the intrinsic produces the same result. constexpr unsigned IntrinsicNumOperands = 2; - unsigned NumOperands = VL0->getNumOperands(); - ArgSize = isa<IntrinsicInst>(VL0) ? IntrinsicNumOperands : NumOperands; + Instruction *MainOp = S.getMainOp(); + unsigned NumOperands = MainOp->getNumOperands(); + ArgSize = isa<IntrinsicInst>(MainOp) ? IntrinsicNumOperands : NumOperands; OpsVec.resize(NumOperands); unsigned NumLanes = VL.size(); for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { @@ -2441,19 +2443,19 @@ public: // operations or alternating sequences (e.g., +, -), we can safely // tell the inverse operations by checking commutativity. if (isa<PoisonValue>(VL[Lane])) { - if (auto *EI = dyn_cast<ExtractElementInst>(VL0)) { + if (auto *EI = dyn_cast<ExtractElementInst>(MainOp)) { if (OpIdx == 0) { OpsVec[OpIdx][Lane] = {EI->getVectorOperand(), true, false}; continue; } - } else if (auto *EV = dyn_cast<ExtractValueInst>(VL0)) { + } else if (auto *EV = dyn_cast<ExtractValueInst>(MainOp)) { if (OpIdx == 0) { OpsVec[OpIdx][Lane] = {EV->getAggregateOperand(), true, false}; continue; } } OpsVec[OpIdx][Lane] = { - PoisonValue::get(VL0->getOperand(OpIdx)->getType()), true, + PoisonValue::get(MainOp->getOperand(OpIdx)->getType()), true, false}; continue; } @@ -2566,11 +2568,12 @@ public: public: /// Initialize with all the operands of the instruction vector \p RootVL. - VLOperands(ArrayRef<Value *> RootVL, Instruction *VL0, const BoUpSLP &R) + VLOperands(ArrayRef<Value *> RootVL, const InstructionsState &S, + const BoUpSLP &R) : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R), - L(R.LI->getLoopFor((VL0->getParent()))) { + L(R.LI->getLoopFor(S.getMainOp()->getParent())) { // Append all the operands of RootVL. - appendOperandsOfVL(RootVL, VL0); + appendOperandsOfVL(RootVL, S); } /// \Returns a value vector with the operands across all lanes for the @@ -2653,7 +2656,9 @@ public: } // TODO: Check if we can remove a check for non-power-2 number of // scalars after full support of non-power-2 vectorization. - return UniqueValues.size() != 2 && has_single_bit(UniqueValues.size()); + return UniqueValues.size() != 2 && + hasFullVectorsOrPowerOf2(*R.TTI, Op0.front().V->getType(), + UniqueValues.size()); }; // If the initial strategy fails for any of the operand indexes, then we @@ -3043,7 +3048,7 @@ private: /// non-identity permutation that allows to reuse extract instructions. /// \param ResizeAllowed indicates whether it is allowed to handle subvector /// extract order. - bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, + bool canReuseExtract(ArrayRef<Value *> VL, SmallVectorImpl<unsigned> &CurrentOrder, bool ResizeAllowed = false) const; @@ -3270,7 +3275,7 @@ private: }; /// Checks if the current node is a gather node. - bool isGather() const {return State == NeedToGather; } + bool isGather() const { return State == NeedToGather; } /// A vector of scalars. ValueList Scalars; @@ -3334,9 +3339,9 @@ private: /// reordering of operands during buildTree_rec() and vectorizeTree(). SmallVector<ValueList, 2> Operands; - /// The main/alternate instruction. 
- Instruction *MainOp = nullptr; - Instruction *AltOp = nullptr; + /// MainOp and AltOp are recorded inside. S should be obtained from + /// newTreeEntry. + InstructionsState S = InstructionsState::invalid(); /// Interleaving factor for interleaved loads Vectorize nodes. unsigned InterleaveFactor = 0; @@ -3360,10 +3365,10 @@ private: /// Set this bundle's operand from Scalars. void setOperand(const BoUpSLP &R, bool RequireReorder = false) { - VLOperands Ops(Scalars, MainOp, R); + VLOperands Ops(Scalars, S, R); if (RequireReorder) Ops.reorder(); - for (unsigned I : seq<unsigned>(MainOp->getNumOperands())) + for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) setOperand(I, Ops.getVL(I)); } @@ -3396,13 +3401,9 @@ private: } /// Some of the instructions in the list have alternate opcodes. - bool isAltShuffle() const { return MainOp != AltOp; } + bool isAltShuffle() const { return S.isAltShuffle(); } - bool isOpcodeOrAlt(Instruction *I) const { - unsigned CheckedOpcode = I->getOpcode(); - return (getOpcode() == CheckedOpcode || - getAltOpcode() == CheckedOpcode); - } + bool isOpcodeOrAlt(Instruction *I) const { return S.isOpcodeOrAlt(I); } /// Chooses the correct key for scheduling data. If \p Op has the same (or /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is @@ -3411,31 +3412,24 @@ private: auto *I = dyn_cast<Instruction>(Op); if (I && isOpcodeOrAlt(I)) return Op; - return MainOp; + return S.getMainOp(); } void setOperations(const InstructionsState &S) { assert(S && "InstructionsState is invalid."); - MainOp = S.getMainOp(); - AltOp = S.getAltOp(); + this->S = S; } - Instruction *getMainOp() const { - return MainOp; - } + Instruction *getMainOp() const { return S.getMainOp(); } - Instruction *getAltOp() const { - return AltOp; - } + Instruction *getAltOp() const { return S.getAltOp(); } /// The main/alternate opcodes for the list of instructions. - unsigned getOpcode() const { - return MainOp ? MainOp->getOpcode() : 0; - } + unsigned getOpcode() const { return S.getOpcode(); } - unsigned getAltOpcode() const { - return AltOp ? AltOp->getOpcode() : 0; - } + unsigned getAltOpcode() const { return S.getAltOpcode(); } + + bool hasState() const { return S.valid(); } /// When ReuseReorderShuffleIndices is empty it just returns position of \p /// V within vector of Scalars. Otherwise, try to remap on its reuse index. @@ -3531,16 +3525,13 @@ private: dbgs() << "CombinedVectorize\n"; break; } - dbgs() << "MainOp: "; - if (MainOp) - dbgs() << *MainOp << "\n"; - else - dbgs() << "NULL\n"; - dbgs() << "AltOp: "; - if (AltOp) - dbgs() << *AltOp << "\n"; - else - dbgs() << "NULL\n"; + if (S) { + dbgs() << "MainOp: " << *S.getMainOp() << "\n"; + dbgs() << "AltOp: " << *S.getAltOp() << "\n"; + } else { + dbgs() << "MainOp: NULL\n"; + dbgs() << "AltOp: NULL\n"; + } dbgs() << "VectorizedValue: "; if (VectorizedValue) dbgs() << *VectorizedValue << "\n"; @@ -3715,9 +3706,13 @@ private: } #endif - TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); } + TreeEntry *getTreeEntry(Value *V) { + assert(V && "V cannot be nullptr."); + return ScalarToTreeEntry.lookup(V); + } const TreeEntry *getTreeEntry(Value *V) const { + assert(V && "V cannot be nullptr."); return ScalarToTreeEntry.lookup(V); } @@ -4979,7 +4974,7 @@ static Value *createInsertVector( // the subvector length. 
const unsigned VecVF = getNumElements(Vec->getType()); SmallVector<int> Mask(VecVF, PoisonMaskElem); - std::iota(Mask.begin(), std::next(Mask.begin(), Index), 0); + std::iota(Mask.begin(), Mask.end(), 0); for (unsigned I : seq<unsigned>(SubVecVF)) Mask[I + Index] = I + VecVF; if (Generator) { @@ -5108,12 +5103,13 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0, }); }); const unsigned AbsoluteDiff = std::abs(*Diff); - if (IsPossibleStrided && (IsAnyPointerUsedOutGraph || - ((Sz > MinProfitableStridedLoads || - (AbsoluteDiff <= MaxProfitableLoadStride * Sz && - has_single_bit(AbsoluteDiff))) && - AbsoluteDiff > Sz) || - *Diff == -(static_cast<int>(Sz) - 1))) { + if (IsPossibleStrided && + (IsAnyPointerUsedOutGraph || + (AbsoluteDiff > Sz && + (Sz > MinProfitableStridedLoads || + (AbsoluteDiff <= MaxProfitableLoadStride * Sz && + AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)))) || + *Diff == -(static_cast<int>(Sz) - 1))) { int Stride = *Diff / static_cast<int>(Sz - 1); if (*Diff == Stride * static_cast<int>(Sz - 1)) { Align Alignment = @@ -5199,9 +5195,9 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0, return MaskedGatherCost - GatherCost >= -SLPCostThreshold; // FIXME: The following code has not been updated for non-power-of-2 - // vectors. The splitting logic here does not cover the original - // vector if the vector factor is not a power of two. FIXME - if (!has_single_bit(VL.size())) + // vectors (and not whole registers). The splitting logic here does not + // cover the original vector if the vector factor is not a power of two. + if (!hasFullVectorsOrPowerOf2(TTI, ScalarTy, VL.size())) return false; unsigned Sz = DL->getTypeSizeInBits(ScalarTy); @@ -5209,7 +5205,10 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0, DemandedElts.clearAllBits(); // Iterate through possible vectorization factors and check if vectorized + // shuffles is better than just gather. - for (unsigned VF = VL.size() / 2; VF >= MinVF; VF /= 2) { + for (unsigned VF = + getFloorFullVectorNumberOfElements(TTI, ScalarTy, VL.size() - 1); + VF >= MinVF; + VF = getFloorFullVectorNumberOfElements(TTI, ScalarTy, VF - 1)) { SmallVector<LoadsState> States; for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) { ArrayRef<Value *> Slice = VL.slice(Cnt, VF); @@ -5615,7 +5614,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { // Try build correct order for extractelement instructions. SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(), TE.ReuseShuffleIndices.end()); - if (TE.getOpcode() == Instruction::ExtractElement && + if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement && all_of(TE.Scalars, [Sz](Value *V) { if (isa<PoisonValue>(V)) return true; @@ -5777,10 +5776,11 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { return std::nullopt; // No need to reorder. return std::move(Phis); } - if (TE.isGather() && !TE.isAltShuffle() && allSameType(TE.Scalars)) { + if (TE.isGather() && (!TE.hasState() || !TE.isAltShuffle()) && + allSameType(TE.Scalars)) { // TODO: add analysis of other gather nodes with extractelement // instructions and other values/instructions, not only undefs. 
- if ((TE.getOpcode() == Instruction::ExtractElement || + if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) || (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) && any_of(TE.Scalars, IsaPred<ExtractElementInst>))) && all_of(TE.Scalars, [](Value *V) { @@ -5790,8 +5790,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { // Check that gather of extractelements can be represented as // just a shuffle of a single vector. OrdersType CurrentOrder; - bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder, - /*ResizeAllowed=*/true); + bool Reuse = + canReuseExtract(TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true); if (Reuse || !CurrentOrder.empty()) return std::move(CurrentOrder); } @@ -5840,7 +5840,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { return Order; // Check if can include the order of vectorized loads. For masked gathers do // extra analysis later, so include such nodes into a special list. - if (TE.isGather() && TE.getOpcode() == Instruction::Load) { + if (TE.hasState() && TE.getOpcode() == Instruction::Load) { SmallVector<Value *> PointerOps; OrdersType CurrentOrder; LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(), @@ -5955,7 +5955,7 @@ void BoUpSLP::reorderTopToBottom() { // Patterns like [fadd,fsub] can be combined into a single instruction in // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need // to take into account their order when looking for the most used order. - if (TE->isAltShuffle()) { + if (TE->hasState() && TE->isAltShuffle()) { VectorType *VecTy = getWidenedType(TE->Scalars[0]->getType(), TE->Scalars.size()); unsigned Opcode0 = TE->getOpcode(); @@ -6034,7 +6034,7 @@ void BoUpSLP::reorderTopToBottom() { if (It != GathersToOrders.end()) return It->second; } - if (OpTE->isAltShuffle()) { + if (OpTE->hasState() && OpTE->isAltShuffle()) { auto It = AltShufflesToOrders.find(OpTE); if (It != AltShufflesToOrders.end()) return It->second; @@ -7637,9 +7637,10 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( } case Instruction::ExtractValue: case Instruction::ExtractElement: { - bool Reuse = canReuseExtract(VL, VL0, CurrentOrder); - // FIXME: Vectorizing is not supported yet for non-power-of-2 ops. - if (!has_single_bit(VL.size())) + bool Reuse = canReuseExtract(VL, CurrentOrder); + // FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and + // non-full registers). + if (!hasFullVectorsOrPowerOf2(*TTI, VL0->getType(), VL.size())) return TreeEntry::NeedToGather; if (Reuse || !CurrentOrder.empty()) return TreeEntry::Vectorize; @@ -8095,7 +8096,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // FIXME: Reshuffing scalars is not supported yet for non-power-of-2 ops. if ((UserTreeIdx.UserTE && UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI)) || - !has_single_bit(VL.size())) { + !hasFullVectorsOrPowerOf2(*TTI, VL.front()->getType(), VL.size())) { LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported " "for nodes with padding.\n"); newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); @@ -8657,7 +8658,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, TE->dump()); ValueList Left, Right; - VLOperands Ops(VL, VL0, *this); + VLOperands Ops(VL, S, *this); if (cast<CmpInst>(VL0)->isCommutative()) { // Commutative predicate - collect + sort operands of the instructions // so that each side is more likely to have the same opcode. 
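The SLPVectorizer hunks above and below replace TreeEntry's raw MainOp/AltOp pointers with a single InstructionsState and gate every opcode query behind hasState()/valid(). A minimal standalone sketch of that invariant follows; Inst, State and the opcode numbers are this sketch's simplifications, not the patch's actual API:

#include <cassert>
#include <cstdio>

// Stand-in for llvm::Instruction, reduced to the one query the sketch needs.
struct Inst {
  unsigned Opcode;
  unsigned getOpcode() const { return Opcode; }
};

// A bundle is either invalid (no common opcode, e.g. a mixed gather) or is
// described by a main/alternate instruction pair; for non-alternating
// bundles both members point at the same instruction.
class State {
  Inst *MainOp = nullptr;
  Inst *AltOp = nullptr;

public:
  State() = default; // invalid state, the analogue of InstructionsState::invalid()
  State(Inst *Main, Inst *Alt) : MainOp(Main), AltOp(Alt) {}

  bool valid() const { return MainOp != nullptr; }
  bool isAltShuffle() const { return MainOp != AltOp; }
  unsigned getOpcode() const {
    assert(valid() && "opcode query must be guarded by a validity check");
    return MainOp->getOpcode();
  }
  unsigned getAltOpcode() const {
    assert(valid() && "opcode query must be guarded by a validity check");
    return AltOp->getOpcode();
  }
  // True if I matches either the main or the alternate opcode.
  bool isOpcodeOrAlt(const Inst &I) const {
    return I.getOpcode() == getOpcode() || I.getOpcode() == getAltOpcode();
  }
};

int main() {
  Inst Add{13}, Sub{15};   // opcode numbers are arbitrary in this sketch
  State S(&Add, &Sub);     // an add/sub alternating bundle
  assert(S.valid() && S.isAltShuffle());
  assert(S.isOpcodeOrAlt(Add) && S.isOpcodeOrAlt(Sub));
  State Gather;            // gather node without a common opcode
  assert(!Gather.valid()); // callers must check hasState() first
  std::puts("ok");
}

Centralizing the pair in one object is what lets the patch turn the scattered "MainOp ? MainOp->getOpcode() : 0" patterns into explicit hasState() guards.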
@@ -8925,7 +8926,7 @@ unsigned BoUpSLP::canMapToVector(Type *T) const {
   return N;
 }
 
-bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
+bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
                               SmallVectorImpl<unsigned> &CurrentOrder,
                               bool ResizeAllowed) const {
   const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
@@ -9579,7 +9580,7 @@ void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
   // Do not reorder nodes if it is small (just 2 elements), all-constant or all
   // instructions already have the same opcode.
-  if (TE.Scalars.size() == 2 || (TE.getOpcode() && !TE.isAltShuffle()) ||
+  if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
       all_of(TE.Scalars, isConstant))
     return;
 
@@ -9798,7 +9799,7 @@ void BoUpSLP::transformNodes() {
     // Do not try partial vectorization for small nodes (<= 2), nodes with the
     // same opcode and same parent block or all constants.
     if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
-        !(!E.getOpcode() || E.getOpcode() == Instruction::Load ||
+        !(!E.hasState() || E.getOpcode() == Instruction::Load ||
          E.isAltShuffle() || !allSameBlock(VL)) ||
        allConstant(VL) || isSplat(VL))
      continue;
@@ -9846,7 +9847,8 @@ void BoUpSLP::transformNodes() {
        if (!S || S.isAltShuffle() || !allSameBlock(Slice) ||
            (S.getOpcode() == Instruction::Load &&
             areKnownNonVectorizableLoads(Slice)) ||
-            (S.getOpcode() != Instruction::Load && !has_single_bit(VF)))
+            (S.getOpcode() != Instruction::Load &&
+             !hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(), VF)))
          continue;
        if (VF == 2) {
          // Try to vectorize reduced values or if all users are vectorized.
@@ -9921,6 +9923,7 @@ void BoUpSLP::transformNodes() {
          buildTree_rec(Slice, 0, EdgeInfo(&E, UINT_MAX));
          if (PrevSize + 1 == VectorizableTree.size() &&
              VectorizableTree[PrevSize]->isGather() &&
+              VectorizableTree[PrevSize]->hasState() &&
              VectorizableTree[PrevSize]->getOpcode() !=
                  Instruction::ExtractElement &&
              !isSplat(Slice)) {
@@ -9941,6 +9944,8 @@ void BoUpSLP::transformNodes() {
        E.ReorderIndices.clear();
      }
    }
+    if (!E.hasState())
+      continue;
    switch (E.getOpcode()) {
    case Instruction::Load: {
      // No need to reorder masked gather loads, just reorder the scalar
@@ -10044,7 +10049,7 @@ void BoUpSLP::transformNodes() {
 
   if (LoadEntriesToVectorize.empty()) {
     // Single load node - exit.
-    if (VectorizableTree.size() <= 1 &&
+    if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
         VectorizableTree.front()->getOpcode() == Instruction::Load)
       return;
     // Small graph with small VF - exit.
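Several hunks in this file also swap the old power-of-two stepping (has_single_bit checks and VF /= 2 descent) for hasFullVectorsOrPowerOf2 and getFloorFullVectorNumberOfElements, so that vector factors which fill whole registers are tried even when they are not powers of two. The following self-contained sketch illustrates the assumed semantics; RegVF, the helper names and the acceptance rule are simplifications for illustration, not the exact TTI-based implementation:

#include <bit>
#include <cstdio>

// Assumed rule: a vectorization factor is acceptable if it is a power of two,
// or if it fills whole vector registers, i.e. is a multiple of RegVF, the
// number of elements one full register holds for the scalar type.
static bool acceptableVF(unsigned RegVF, unsigned VF) {
  return std::has_single_bit(VF) || (VF >= RegVF && VF % RegVF == 0);
}

// Largest acceptable factor <= N, or 0 if none; the analogue of
// getFloorFullVectorNumberOfElements in this sketch.
static unsigned floorAcceptableVF(unsigned RegVF, unsigned N) {
  for (unsigned VF = N; VF >= 2; --VF)
    if (acceptableVF(RegVF, VF))
      return VF;
  return 0;
}

int main() {
  const unsigned RegVF = 4, MinVF = 2;
  // The old descent 12 -> 6 -> 3 (VF /= 2) visits odd, partial-register
  // factors; this descent visits 12, 8, 4, 2 instead.
  for (unsigned VF = floorAcceptableVF(RegVF, 12); VF >= MinVF;
       VF = floorAcceptableVF(RegVF, VF - 1))
    std::printf("try VF=%u\n", VF);
}

(Requires C++20 for std::has_single_bit.)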
@@ -10060,7 +10065,7 @@ void BoUpSLP::transformNodes() { getCanonicalGraphSize() <= SmallTree && count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()), [](const std::unique_ptr<TreeEntry> &TE) { - return TE->isGather() && + return TE->isGather() && TE->hasState() && TE->getOpcode() == Instruction::Load && !allSameBlock(TE->Scalars); }) == 1) @@ -10076,13 +10081,13 @@ void BoUpSLP::transformNodes() { for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) { TreeEntry &E = *TE; if (E.isGather() && - (E.getOpcode() == Instruction::Load || - (!E.getOpcode() && any_of(E.Scalars, - [&](Value *V) { - return isa<LoadInst>(V) && - !isVectorized(V) && - !isDeleted(cast<Instruction>(V)); - }))) && + ((E.hasState() && E.getOpcode() == Instruction::Load) || + (!E.hasState() && any_of(E.Scalars, + [&](Value *V) { + return isa<LoadInst>(V) && + !isVectorized(V) && + !isDeleted(cast<Instruction>(V)); + }))) && !isSplat(E.Scalars)) { for (Value *V : E.Scalars) { auto *LI = dyn_cast<LoadInst>(V); @@ -10676,7 +10681,7 @@ public: bool PrevNodeFound = any_of( ArrayRef(R.VectorizableTree).take_front(E->Idx), [&](const std::unique_ptr<TreeEntry> &TE) { - return ((!TE->isAltShuffle() && + return ((TE->hasState() && !TE->isAltShuffle() && TE->getOpcode() == Instruction::ExtractElement) || TE->isGather()) && all_of(enumerate(TE->Scalars), [&](auto &&Data) { @@ -11801,7 +11806,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) { if (TE.get() == E) break; - if (TE->isAltShuffle() && + if (TE->hasState() && TE->isAltShuffle() && ((TE->getOpcode() == E->getOpcode() && TE->getAltOpcode() == E->getAltOpcode()) || (TE->getOpcode() == E->getAltOpcode() && @@ -11963,10 +11968,12 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const { [this](Value *V) { return EphValues.contains(V); }) && (allConstant(TE->Scalars) || isSplat(TE->Scalars) || TE->Scalars.size() < Limit || - ((TE->getOpcode() == Instruction::ExtractElement || + (((TE->hasState() && + TE->getOpcode() == Instruction::ExtractElement) || all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) && isFixedVectorShuffle(TE->Scalars, Mask, AC)) || - (TE->getOpcode() == Instruction::Load && !TE->isAltShuffle()) || + (TE->hasState() && TE->getOpcode() == Instruction::Load && + !TE->isAltShuffle()) || any_of(TE->Scalars, IsaPred<LoadInst>)); }; @@ -12095,9 +12102,10 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { !VectorizableTree.empty() && all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) { return (TE->isGather() && - TE->getOpcode() != Instruction::ExtractElement && + (!TE->hasState() || + TE->getOpcode() != Instruction::ExtractElement) && count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) || - TE->getOpcode() == Instruction::PHI; + (TE->hasState() && TE->getOpcode() == Instruction::PHI); })) return true; @@ -12115,7 +12123,7 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { // somewhere. 
bool IsAllowedSingleBVNode = VectorizableTree.size() > 1 || - (VectorizableTree.size() == 1 && VectorizableTree.front()->getOpcode() && + (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() && !VectorizableTree.front()->isAltShuffle() && VectorizableTree.front()->getOpcode() != Instruction::PHI && VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr && @@ -12131,6 +12139,7 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { return false; if (VectorizableTree.back()->isGather() && + VectorizableTree.back()->hasState() && VectorizableTree.back()->isAltShuffle() && VectorizableTree.back()->getVectorFactor() > 2 && allSameBlock(VectorizableTree.back()->Scalars) && @@ -12155,7 +12164,7 @@ bool BoUpSLP::isTreeNotExtendable() const { getCanonicalGraphSize() <= SmallTree && count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()), [](const std::unique_ptr<TreeEntry> &TE) { - return TE->isGather() && + return TE->isGather() && TE->hasState() && TE->getOpcode() == Instruction::Load && !allSameBlock(TE->Scalars); }) == 1) @@ -12167,8 +12176,8 @@ bool BoUpSLP::isTreeNotExtendable() const { TreeEntry &E = *VectorizableTree[Idx]; if (!E.isGather()) continue; - if ((E.getOpcode() && E.getOpcode() != Instruction::Load) || - (!E.getOpcode() && + if ((E.hasState() && E.getOpcode() != Instruction::Load) || + (!E.hasState() && all_of(E.Scalars, IsaPred<ExtractElementInst, LoadInst>)) || (isa<ExtractElementInst>(E.Scalars.front()) && getSameOpcode(ArrayRef(E.Scalars).drop_front(), *TLI).getOpcode())) @@ -12481,7 +12490,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n"); continue; } - if (TE.isGather()) { + if (TE.isGather() && TE.hasState()) { if (const TreeEntry *E = getTreeEntry(TE.getMainOp()); E && E->getVectorFactor() == TE.getVectorFactor() && E->isSame(TE.Scalars)) { @@ -13208,9 +13217,12 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( VTE = *MIt; } } - Instruction &LastBundleInst = getLastInstructionInBundle(VTE); - if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst)) - continue; + if (none_of(TE->CombinedEntriesWithIndices, + [&](const auto &P) { return P.first == VTE->Idx; })) { + Instruction &LastBundleInst = getLastInstructionInBundle(VTE); + if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst)) + continue; + } VToTEs.insert(VTE); } if (VToTEs.empty()) @@ -13618,8 +13630,9 @@ BoUpSLP::isGatherShuffledEntry( return !TE->isGather(); }))) return {}; - // FIXME: Gathering for non-power-of-2 nodes not implemented yet. - if (TE->isNonPowOf2Vec()) + // FIXME: Gathering for non-power-of-2 (non whole registers) nodes not + // implemented yet. 
+ if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI)) return {}; Mask.assign(VL.size(), PoisonMaskElem); assert((TE->UserTreeIndices.size() == 1 || @@ -13630,9 +13643,11 @@ BoUpSLP::isGatherShuffledEntry( if (!TE->UserTreeIndices.empty() && TE->UserTreeIndices.front().UserTE->isGather() && TE->UserTreeIndices.front().EdgeIdx == UINT_MAX) { - assert((TE->Idx == 0 || TE->getOpcode() == Instruction::ExtractElement || - isSplat(TE->Scalars)) && - "Expected splat or extractelements only node."); + assert( + (TE->Idx == 0 || + (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) || + isSplat(TE->Scalars)) && + "Expected splat or extractelements only node."); return {}; } unsigned SliceSize = getPartNumElems(VL.size(), NumParts); @@ -13977,11 +13992,12 @@ Value *BoUpSLP::gather( Instruction *InsElt; if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) { assert(SLPReVec && "FixedVectorType is not expected."); - Vec = InsElt = cast<Instruction>(createInsertVector( - Builder, Vec, Scalar, Pos * getNumElements(VecTy))); - auto *II = dyn_cast<IntrinsicInst>(InsElt); + Vec = + createInsertVector(Builder, Vec, Scalar, Pos * getNumElements(VecTy)); + auto *II = dyn_cast<IntrinsicInst>(Vec); if (!II || II->getIntrinsicID() != Intrinsic::vector_insert) return Vec; + InsElt = II; } else { Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos)); InsElt = dyn_cast<InsertElementInst>(Vec); @@ -14497,7 +14513,9 @@ public: break; } } - int VF = getVF(V1); + unsigned VF = 0; + for (Value *V : InVectors) + VF = std::max(VF, getVF(V)); for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF); @@ -14925,14 +14943,15 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, } } // Gather extracts after we check for full matched gathers only. - if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load || - ((E->getOpcode() == Instruction::Load || + if (!ExtractShuffles.empty() || !E->hasState() || + E->getOpcode() != Instruction::Load || + (((E->hasState() && E->getOpcode() == Instruction::Load) || any_of(E->Scalars, IsaPred<LoadInst>)) && any_of(E->Scalars, [this](Value *V) { return isa<LoadInst>(V) && getTreeEntry(V); })) || - E->isAltShuffle() || + (E->hasState() && E->isAltShuffle()) || all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) || isSplat(E->Scalars) || (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) { @@ -15312,7 +15331,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size()); if (E->isGather()) { // Set insert point for non-reduction initial nodes. - if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList) + if (E->hasState() && E->Idx == 0 && !UserIgnoreList) setInsertPointAfterBundle(E); Value *Vec = createBuildVector(E, ScalarTy, PostponedPHIs); E->VectorizedValue = Vec; @@ -18157,8 +18176,9 @@ static RecurKind getRdxKind(Value *V); void BoUpSLP::computeMinimumValueSizes() { // We only attempt to truncate integer expressions. 
bool IsStoreOrInsertElt = - VectorizableTree.front()->getOpcode() == Instruction::Store || - VectorizableTree.front()->getOpcode() == Instruction::InsertElement; + VectorizableTree.front()->hasState() && + (VectorizableTree.front()->getOpcode() == Instruction::Store || + VectorizableTree.front()->getOpcode() == Instruction::InsertElement); if ((IsStoreOrInsertElt || UserIgnoreList) && ExtraBitWidthNodes.size() <= 1 && (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 || @@ -18199,10 +18219,9 @@ void BoUpSLP::computeMinimumValueSizes() { return; SmallVector<unsigned> ToDemote; - auto ComputeMaxBitWidth = [&](const TreeEntry &E, bool IsTopRoot, - bool IsProfitableToDemoteRoot, unsigned Opcode, - unsigned Limit, bool IsTruncRoot, - bool IsSignedCmp) -> unsigned { + auto ComputeMaxBitWidth = + [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot, + unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned { ToDemote.clear(); // Check if the root is trunc and the next node is gather/buildvector, then // keep trunc in scalars, which is free in most cases. @@ -18243,11 +18262,14 @@ void BoUpSLP::computeMinimumValueSizes() { return MaxBitWidth; } + if (!E.hasState()) + return 0u; + unsigned VF = E.getVectorFactor(); Type *ScalarTy = E.Scalars.front()->getType(); unsigned ScalarTyNumElements = getNumElements(ScalarTy); auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType()); - if (!TreeRootIT || !Opcode) + if (!TreeRootIT) return 0u; if (any_of(E.Scalars, @@ -18319,6 +18341,7 @@ void BoUpSLP::computeMinimumValueSizes() { IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF))) return 0u; + unsigned Opcode = E.getOpcode(); bool IsProfitableToDemote = Opcode == Instruction::Trunc || Opcode == Instruction::SExt || Opcode == Instruction::ZExt || NumParts > 1; @@ -18399,15 +18422,14 @@ void BoUpSLP::computeMinimumValueSizes() { while (NodeIdx < VectorizableTree.size()) { ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars; unsigned Limit = 2; - unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode(); if (IsTopRoot && ReductionBitWidth == DL->getTypeSizeInBits( VectorizableTree.front()->Scalars.front()->getType())) Limit = 3; unsigned MaxBitWidth = ComputeMaxBitWidth( - *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Opcode, - Limit, IsTruncRoot, IsSignedCmp); + *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit, + IsTruncRoot, IsSignedCmp); if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) { if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth) ReductionBitWidth = bit_ceil(MaxBitWidth); @@ -18450,19 +18472,21 @@ void BoUpSLP::computeMinimumValueSizes() { }); IsSignedCmp = NodeIdx < VectorizableTree.size() && - any_of(VectorizableTree[NodeIdx]->UserTreeIndices, - [&](const EdgeInfo &EI) { - return EI.UserTE->getOpcode() == Instruction::ICmp && - any_of(EI.UserTE->Scalars, [&](Value *V) { - auto *IC = dyn_cast<ICmpInst>(V); - return IC && - (IC->isSigned() || - !isKnownNonNegative(IC->getOperand(0), - SimplifyQuery(*DL)) || - !isKnownNonNegative(IC->getOperand(1), - SimplifyQuery(*DL))); - }); - }); + any_of( + VectorizableTree[NodeIdx]->UserTreeIndices, + [&](const EdgeInfo &EI) { + return (EI.UserTE->hasState() && + EI.UserTE->getOpcode() == Instruction::ICmp) && + any_of(EI.UserTE->Scalars, [&](Value *V) { + auto *IC = dyn_cast<ICmpInst>(V); + return IC && + (IC->isSigned() || + !isKnownNonNegative(IC->getOperand(0), + SimplifyQuery(*DL)) || + !isKnownNonNegative(IC->getOperand(1), + 
SimplifyQuery(*DL))); + }); + }); } // If the maximum bit width we compute is less than the width of the roots' @@ -19189,9 +19213,11 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, } } + Type *ScalarTy = getValueType(VL[0]); unsigned Sz = R.getVectorElementSize(I0); unsigned MinVF = R.getMinVF(Sz); - unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF); + unsigned MaxVF = std::max<unsigned>( + getFloorFullVectorNumberOfElements(*TTI, ScalarTy, VL.size()), MinVF); MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF); if (MaxVF < 2) { R.getORE()->emit([&]() { @@ -19205,10 +19231,10 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, bool Changed = false; bool CandidateFound = false; InstructionCost MinCost = SLPCostThreshold.getValue(); - Type *ScalarTy = getValueType(VL[0]); unsigned NextInst = 0, MaxInst = VL.size(); - for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) { + for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; + VF = getFloorFullVectorNumberOfElements(*TTI, I0->getType(), VF - 1)) { // No actual vectorization should happen, if number of parts is the same as // provided vectorization factor (i.e. the scalar type is used for vector // code during codegen). @@ -19223,7 +19249,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, if (MaxVFOnly && ActualVF < MaxVF) break; - if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2)) + if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2)) break; SmallVector<Value *> Ops(ActualVF, nullptr); @@ -20106,6 +20132,7 @@ public: NumRegs = TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp)); while (NumParts > NumRegs) { + assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0."); ReduxWidth = bit_floor(ReduxWidth - 1); VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth); NumParts = TTI.getNumberOfParts(Tp); diff --git llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp index f8149c5bc663..ad3e38e2f1d9 100644 --- llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp +++ llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp @@ -20,6 +20,11 @@ namespace llvm::sandboxir { #define DEBUG_TYPE "SBVec:Legality" #ifndef NDEBUG +void ShuffleMask::dump() const { + print(dbgs()); + dbgs() << "\n"; +} + void LegalityResult::dump() const { print(dbgs()); dbgs() << "\n"; @@ -213,13 +218,12 @@ const LegalityResult &LegalityAnalysis::canVectorize(ArrayRef<Value *> Bndl, auto CollectDescrs = getHowToCollectValues(Bndl); if (CollectDescrs.hasVectorInputs()) { if (auto ValueShuffleOpt = CollectDescrs.getSingleInput()) { - auto [Vec, NeedsShuffle] = *ValueShuffleOpt; - if (!NeedsShuffle) + auto [Vec, Mask] = *ValueShuffleOpt; + if (Mask.isIdentity()) return createLegalityResult<DiamondReuse>(Vec); - llvm_unreachable("TODO: Unimplemented"); - } else { - llvm_unreachable("TODO: Unimplemented"); + return createLegalityResult<DiamondReuseWithShuffle>(Vec, Mask); } + llvm_unreachable("TODO: Unimplemented"); } if (auto ReasonOpt = notVectorizableBasedOnOpcodesAndTypes(Bndl)) diff --git llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp index 6b2032be5356..d62023ea0188 100644 --- llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp +++ 
llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp @@ -161,7 +161,7 @@ Value *BottomUpVec::createVectorInstr(ArrayRef<Value *> Bndl, auto *VecI = CreateVectorInstr(Bndl, Operands); if (VecI != nullptr) { Change = true; - IMaps.registerVector(Bndl, VecI); + IMaps->registerVector(Bndl, VecI); } return VecI; } @@ -179,6 +179,12 @@ void BottomUpVec::tryEraseDeadInstrs() { DeadInstrCandidates.clear(); } +Value *BottomUpVec::createShuffle(Value *VecOp, const ShuffleMask &Mask) { + BasicBlock::iterator WhereIt = getInsertPointAfterInstrs({VecOp}); + return ShuffleVectorInst::create(VecOp, VecOp, Mask, WhereIt, + VecOp->getContext(), "VShuf"); +} + Value *BottomUpVec::createPack(ArrayRef<Value *> ToPack) { BasicBlock::iterator WhereIt = getInsertPointAfterInstrs(ToPack); @@ -295,6 +301,13 @@ Value *BottomUpVec::vectorizeRec(ArrayRef<Value *> Bndl, unsigned Depth) { NewVec = cast<DiamondReuse>(LegalityRes).getVector(); break; } + case LegalityResultID::DiamondReuseWithShuffle: { + auto *VecOp = cast<DiamondReuseWithShuffle>(LegalityRes).getVector(); + const ShuffleMask &Mask = + cast<DiamondReuseWithShuffle>(LegalityRes).getMask(); + NewVec = createShuffle(VecOp, Mask); + break; + } case LegalityResultID::Pack: { // If we can't vectorize the seeds then just return. if (Depth == 0) @@ -315,10 +328,10 @@ bool BottomUpVec::tryVectorize(ArrayRef<Value *> Bndl) { } bool BottomUpVec::runOnFunction(Function &F, const Analyses &A) { - IMaps.clear(); + IMaps = std::make_unique<InstrMaps>(F.getContext()); Legality = std::make_unique<LegalityAnalysis>( A.getAA(), A.getScalarEvolution(), F.getParent()->getDataLayout(), - F.getContext(), IMaps); + F.getContext(), *IMaps); Change = false; const auto &DL = F.getParent()->getDataLayout(); unsigned VecRegBits = diff --git llvm/lib/Transforms/Vectorize/SandboxVectorizer/VecUtils.cpp llvm/lib/Transforms/Vectorize/SandboxVectorizer/VecUtils.cpp new file mode 100644 index 000000000000..6f9ef07e467d --- /dev/null +++ llvm/lib/Transforms/Vectorize/SandboxVectorizer/VecUtils.cpp @@ -0,0 +1,32 @@ +//===- VecUtils.cpp -------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Vectorize/SandboxVectorizer/VecUtils.h" + +namespace llvm::sandboxir { + +unsigned VecUtils::getFloorPowerOf2(unsigned Num) { + if (Num == 0) + return Num; + unsigned Mask = Num; + Mask >>= 1; + for (unsigned ShiftBy = 1; ShiftBy < sizeof(Num) * 8; ShiftBy <<= 1) + Mask |= Mask >> ShiftBy; + return Num & ~Mask; +} + +#ifndef NDEBUG +template <typename T> static void dumpImpl(ArrayRef<T *> Bndl) { + for (auto [Idx, V] : enumerate(Bndl)) + dbgs() << Idx << "." << *V << "\n"; +} +void VecUtils::dump(ArrayRef<Value *> Bndl) { dumpImpl(Bndl); } +void VecUtils::dump(ArrayRef<Instruction *> Bndl) { dumpImpl(Bndl); } +#endif // NDEBUG + +} // namespace llvm::sandboxir diff --git llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h index cf653e2d3e65..44745bfd46f8 100644 --- llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -86,10 +86,8 @@ class VPRecipeBuilder { /// created. 
SmallVector<VPHeaderPHIRecipe *, 4> PhisToFix; - /// The set of reduction exit instructions that will be scaled to - /// a smaller VF via partial reductions, paired with the scaling factor. - DenseMap<const Instruction *, std::pair<PartialReductionChain, unsigned>> - ScaledReductionExitInstrs; + /// A mapping of partial reduction exit instructions to their scaling factor. + DenseMap<const Instruction *, unsigned> ScaledReductionMap; /// Check if \p I can be widened at the start of \p Range and possibly /// decrease the range such that the returned value holds for the entire \p @@ -157,12 +155,10 @@ public: : Plan(Plan), OrigLoop(OrigLoop), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), PSE(PSE), Builder(Builder) {} - std::optional<std::pair<PartialReductionChain, unsigned>> - getScaledReductionForInstr(const Instruction *ExitInst) { - auto It = ScaledReductionExitInstrs.find(ExitInst); - return It == ScaledReductionExitInstrs.end() - ? std::nullopt - : std::make_optional(It->second); + std::optional<unsigned> getScalingForReduction(const Instruction *ExitInst) { + auto It = ScaledReductionMap.find(ExitInst); + return It == ScaledReductionMap.end() ? std::nullopt + : std::make_optional(It->second); } /// Find all possible partial reductions in the loop and track all of those diff --git llvm/lib/Transforms/Vectorize/VPlan.h llvm/lib/Transforms/Vectorize/VPlan.h index 784cee6ed4b0..db45ad8aadbb 100644 --- llvm/lib/Transforms/Vectorize/VPlan.h +++ llvm/lib/Transforms/Vectorize/VPlan.h @@ -1422,6 +1422,12 @@ public: "Op must be an operand of the recipe"); return true; } + + bool onlyFirstLaneUsed(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return true; + } }; /// VPWidenRecipe is a recipe for producing a widened instruction using the diff --git llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index a04ad1b37053..9febd612c644 100644 --- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -667,6 +667,131 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) { } } +/// Check if \p VPV is an untruncated wide induction, either before or after the +/// increment. If so return the header IV (before the increment), otherwise +/// return null. +static VPWidenInductionRecipe *getOptimizableIVOf(VPValue *VPV) { + auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV); + if (WideIV) { + // VPV itself is a wide induction, separately compute the end value for exit + // users if it is not a truncated IV. + auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV); + return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV; + } + + // Check if VPV is an optimizable induction increment. + VPRecipeBase *Def = VPV->getDefiningRecipe(); + if (!Def || Def->getNumOperands() != 2) + return nullptr; + WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0)); + if (!WideIV) + WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1)); + if (!WideIV) + return nullptr; + + auto IsWideIVInc = [&]() { + using namespace VPlanPatternMatch; + auto &ID = WideIV->getInductionDescriptor(); + + // Check if VPV increments the induction by the induction step. 
+    VPValue *IVStep = WideIV->getStepValue();
+    switch (ID.getInductionOpcode()) {
+    case Instruction::Add:
+      return match(VPV, m_c_Binary<Instruction::Add>(m_Specific(WideIV),
+                                                     m_Specific(IVStep)));
+    case Instruction::FAdd:
+      return match(VPV, m_c_Binary<Instruction::FAdd>(m_Specific(WideIV),
+                                                      m_Specific(IVStep)));
+    case Instruction::FSub:
+      return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
+                                                    m_Specific(IVStep)));
+    case Instruction::Sub: {
+      // IVStep will be the negated step of the subtraction. Check if Step ==
+      // -1 * IVStep.
+      VPValue *Step;
+      if (!match(VPV,
+                 m_Binary<Instruction::Sub>(m_VPValue(), m_VPValue(Step))) ||
+          !Step->isLiveIn() || !IVStep->isLiveIn())
+        return false;
+      auto *StepCI = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
+      auto *IVStepCI = dyn_cast<ConstantInt>(IVStep->getLiveInIRValue());
+      return StepCI && IVStepCI &&
+             StepCI->getValue() == (-1 * IVStepCI->getValue());
+    }
+    default:
+      return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
+             match(VPV, m_GetElementPtr(m_Specific(WideIV),
+                                        m_Specific(WideIV->getStepValue())));
+    }
+    llvm_unreachable("should have been covered by switch above");
+  };
+  return IsWideIVInc() ? WideIV : nullptr;
+}
+
+void VPlanTransforms::optimizeInductionExitUsers(
+    VPlan &Plan, DenseMap<VPValue *, VPValue *> &EndValues) {
+  using namespace VPlanPatternMatch;
+  SmallVector<VPIRBasicBlock *> ExitVPBBs(Plan.getExitBlocks());
+  if (ExitVPBBs.size() != 1)
+    return;
+
+  VPIRBasicBlock *ExitVPBB = ExitVPBBs[0];
+  VPBlockBase *PredVPBB = ExitVPBB->getSinglePredecessor();
+  if (!PredVPBB)
+    return;
+  assert(PredVPBB == Plan.getMiddleBlock() &&
+         "predecessor must be the middle block");
+
+  VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
+  VPBuilder B(Plan.getMiddleBlock()->getTerminator());
+  for (VPRecipeBase &R : *ExitVPBB) {
+    auto *ExitIRI = cast<VPIRInstruction>(&R);
+    if (!isa<PHINode>(ExitIRI->getInstruction()))
+      break;
+
+    VPValue *Incoming;
+    if (!match(ExitIRI->getOperand(0),
+               m_VPInstruction<VPInstruction::ExtractFromEnd>(
+                   m_VPValue(Incoming), m_SpecificInt(1))))
+      continue;
+
+    auto *WideIV = getOptimizableIVOf(Incoming);
+    if (!WideIV)
+      continue;
+    VPValue *EndValue = EndValues.lookup(WideIV);
+    assert(EndValue && "end value must have been pre-computed");
+
+    if (Incoming != WideIV) {
+      ExitIRI->setOperand(0, EndValue);
+      continue;
+    }
+
+    VPValue *Escape = nullptr;
+    VPValue *Step = WideIV->getStepValue();
+    Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
+    if (ScalarTy->isIntegerTy()) {
+      Escape =
+          B.createNaryOp(Instruction::Sub, {EndValue, Step}, {}, "ind.escape");
+    } else if (ScalarTy->isPointerTy()) {
+      auto *Zero = Plan.getOrAddLiveIn(
+          ConstantInt::get(Step->getLiveInIRValue()->getType(), 0));
+      Escape = B.createPtrAdd(EndValue,
+                              B.createNaryOp(Instruction::Sub, {Zero, Step}),
+                              {}, "ind.escape");
+    } else if (ScalarTy->isFloatingPointTy()) {
+      const auto &ID = WideIV->getInductionDescriptor();
+      Escape = B.createNaryOp(
+          ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
+              ? Instruction::FSub
+              : Instruction::FAdd,
+          {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
+    } else {
+      llvm_unreachable("all possible induction types must be handled");
+    }
+    ExitIRI->setOperand(0, Escape);
+  }
+}
+
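optimizeInductionExitUsers above feeds an exit phi either the precomputed end value (when the phi used the IV increment) or that end value stepped back once (when it used the header IV, the "ind.escape" case). A tiny scalar model of that arithmetic for an integer induction; Start, Step and TripCount are arbitrary sketch values:

#include <cassert>

int main() {
  const int Start = 3, Step = 5, TripCount = 8;
  int IV = Start, IVNext = Start;
  for (int I = 0; I < TripCount; ++I) {
    IV = IVNext;        // header IV: value seen inside iteration I
    IVNext = IV + Step; // the increment feeding the next iteration
  }
  // EndValue as pre-computed for the resume phi: the IV value one step past
  // the last iteration.
  const int EndValue = Start + TripCount * Step;
  assert(IVNext == EndValue);    // phi of the increment gets EndValue as-is
  assert(IV == EndValue - Step); // phi of the header IV: step back once
  return 0;
}

The pointer and floating-point branches are the same idea, with the step-back expressed as a negative ptradd offset or as the inverse FP operation.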
 /// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
 /// them with already existing recipes expanding the same SCEV expression.
 static void removeRedundantExpandSCEVRecipes(VPlan &Plan) {
@@ -1318,6 +1443,7 @@ void VPlanTransforms::optimize(VPlan &Plan) {
   removeRedundantInductionCasts(Plan);
 
   simplifyRecipes(Plan, Plan.getCanonicalIV()->getScalarType());
+  removeDeadRecipes(Plan);
   legalizeAndOptimizeInductions(Plan);
   removeRedundantExpandSCEVRecipes(Plan);
   simplifyRecipes(Plan, Plan.getCanonicalIV()->getScalarType());
diff --git llvm/lib/Transforms/Vectorize/VPlanTransforms.h llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index fddde8689116..a751b8b5e8dc 100644
--- llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -137,6 +137,13 @@ struct VPlanTransforms {
 
   /// Lower abstract recipes to concrete ones, that can be codegen'd.
   static void convertToConcreteRecipes(VPlan &Plan);
+
+  /// If there's a single exit block, optimize its phi recipes that use exiting
+  /// IV values by feeding them precomputed end values instead, possibly taken
+  /// one step backwards.
+  static void
+  optimizeInductionExitUsers(VPlan &Plan,
+                             DenseMap<VPValue *, VPValue *> &EndValues);
 };
 
 } // namespace llvm
diff --git llvm/lib/Transforms/Vectorize/VPlanUtils.h llvm/lib/Transforms/Vectorize/VPlanUtils.h
index 777944264f45..b88a1b142997 100644
--- llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -41,16 +41,21 @@ inline bool isUniformAfterVectorization(const VPValue *VPV) {
   // vectorization inside a vector region.
   if (VPV->isDefinedOutsideLoopRegions())
     return true;
-  const VPRecipeBase *Def = VPV->getDefiningRecipe();
-  assert(Def && "Must have definition for value defined inside vector region");
-  if (auto *Rep = dyn_cast<VPReplicateRecipe>(Def))
+  if (auto *Rep = dyn_cast<VPReplicateRecipe>(VPV))
     return Rep->isUniform();
-  if (auto *GEP = dyn_cast<VPWidenGEPRecipe>(Def))
-    return all_of(GEP->operands(), isUniformAfterVectorization);
-  if (auto *VPI = dyn_cast<VPInstruction>(Def))
-    return VPI->isSingleScalar() || VPI->isVectorToScalar();
+  if (isa<VPWidenGEPRecipe, VPDerivedIVRecipe>(VPV))
+    return all_of(VPV->getDefiningRecipe()->operands(),
+                  isUniformAfterVectorization);
+  if (auto *VPI = dyn_cast<VPInstruction>(VPV))
+    return VPI->isSingleScalar() || VPI->isVectorToScalar() ||
+           ((Instruction::isBinaryOp(VPI->getOpcode()) ||
+             VPI->getOpcode() == VPInstruction::PtrAdd) &&
+            all_of(VPI->operands(), isUniformAfterVectorization));
+  if (auto *IV = dyn_cast<VPDerivedIVRecipe>(VPV))
+    return all_of(IV->operands(), isUniformAfterVectorization);
+
   // VPExpandSCEVRecipes must be placed in the entry and are always uniform.
-  return isa<VPExpandSCEVRecipe>(Def);
+  return isa<VPExpandSCEVRecipe>(VPV);
 }
 
 /// Return true if \p V is a header mask in \p Plan.
diff --git llvm/lib/Transforms/Vectorize/VPlanValue.h llvm/lib/Transforms/Vectorize/VPlanValue.h
index 7aaf4002b8b3..23e39ce89a3a 100644
--- llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -33,9 +33,11 @@ namespace llvm {
 class raw_ostream;
 class Value;
 class VPDef;
+struct VPDoubleValueDef;
 class VPSlotTracker;
 class VPUser;
 class VPRecipeBase;
+class VPInterleaveRecipe;
 
 // This is the base class of the VPlan Def/Use graph, used for modeling the data
 // flow into, within and out of the VPlan.
VPValues can stand for live-ins @@ -44,12 +46,15 @@ class VPRecipeBase; class VPValue { friend class VPBuilder; friend class VPDef; + friend struct VPDoubleValueDef; friend class VPInstruction; + friend class VPInterleaveRecipe; friend struct VPlanTransforms; friend class VPBasicBlock; friend class VPInterleavedAccessInfo; friend class VPSlotTracker; friend class VPRecipeBase; + friend class VPlan; const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast). @@ -65,6 +70,13 @@ protected: VPValue(const unsigned char SC, Value *UV = nullptr, VPDef *Def = nullptr); + /// Create a live-in VPValue. + VPValue(Value *UV = nullptr) : VPValue(VPValueSC, UV, nullptr) {} + /// Create a VPValue for a \p Def which is a subclass of VPValue. + VPValue(VPDef *Def, Value *UV = nullptr) : VPValue(VPVRecipeSC, UV, Def) {} + /// Create a VPValue for a \p Def which defines multiple values. + VPValue(Value *UV, VPDef *Def) : VPValue(VPValueSC, UV, Def) {} + // DESIGN PRINCIPLE: Access to the underlying IR must be strictly limited to // the front-end and back-end of VPlan so that the middle-end is as // independent as possible of the underlying IR. We grant access to the @@ -84,12 +96,6 @@ public: VPVRecipeSC /// A VPValue sub-class that is a VPRecipeBase. }; - /// Create a live-in VPValue. - VPValue(Value *UV = nullptr) : VPValue(VPValueSC, UV, nullptr) {} - /// Create a VPValue for a \p Def which is a subclass of VPValue. - VPValue(VPDef *Def, Value *UV = nullptr) : VPValue(VPVRecipeSC, UV, Def) {} - /// Create a VPValue for a \p Def which defines multiple values. - VPValue(Value *UV, VPDef *Def) : VPValue(VPValueSC, UV, Def) {} VPValue(const VPValue &) = delete; VPValue &operator=(const VPValue &) = delete; diff --git llvm/test/Analysis/CostModel/SystemZ/divrem-reg.ll llvm/test/Analysis/CostModel/SystemZ/divrem-reg.ll index 7edef36ee32f..2f13d7e3ef9b 100644 --- llvm/test/Analysis/CostModel/SystemZ/divrem-reg.ll +++ llvm/test/Analysis/CostModel/SystemZ/divrem-reg.ll @@ -1,4 +1,6 @@ -; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 4 +; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s --check-prefixes=CHECK,Z13 +; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=systemz-unknown -mcpu=arch15 | FileCheck %s --check-prefixes=CHECK,ARC15 ; Check costs of divisions by register ; @@ -8,279 +10,486 @@ ; Scalar sdiv define i64 @fun0(i64 %a, i64 %b) { +; CHECK-LABEL: 'fun0' +; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = sdiv i64 %a, %b +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r +; %r = sdiv i64 %a, %b ret i64 %r -; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %r = sdiv i64 } define i32 @fun1(i32 %a, i32 %b) { +; CHECK-LABEL: 'fun1' +; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = sdiv i32 %a, %b +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %r +; %r = sdiv i32 %a, %b ret i32 %r -; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %r = sdiv i32 %a, %b } define i16 @fun2(i16 %a, i16 %b) { +; CHECK-LABEL: 'fun2' +; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = sdiv i16 %a, %b +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 
%r +; %r = sdiv i16 %a, %b ret i16 %r -; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %r = sdiv i16 %a, %b } define i8 @fun3(i8 %a, i8 %b) { +; CHECK-LABEL: 'fun3' +; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = sdiv i8 %a, %b +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %r +; %r = sdiv i8 %a, %b ret i8 %r -; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %r = sdiv i8 %a, %b } ; Vector sdiv define <2 x i64> @fun4(<2 x i64> %a, <2 x i64> %b) { +; Z13-LABEL: 'fun4' +; Z13-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %r = sdiv <2 x i64> %a, %b +; Z13-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %r +; +; ARC15-LABEL: 'fun4' +; ARC15-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = sdiv <2 x i64> %a, %b +; ARC15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %r +; %r = sdiv <2 x i64> %a, %b ret <2 x i64> %r -; CHECK: Cost Model: Found an estimated cost of 47 for instruction: %r = sdiv <2 x i64> } define <4 x i32> @fun5(<4 x i32> %a, <4 x i32> %b) { +; Z13-LABEL: 'fun5' +; Z13-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %r = sdiv <4 x i32> %a, %b +; Z13-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %r +; +; ARC15-LABEL: 'fun5' +; ARC15-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = sdiv <4 x i32> %a, %b +; ARC15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %r +; %r = sdiv <4 x i32> %a, %b ret <4 x i32> %r -; CHECK: Cost Model: Found an estimated cost of 94 for instruction: %r = sdiv <4 x i32> } define <2 x i32> @fun6(<2 x i32> %a, <2 x i32> %b) { +; Z13-LABEL: 'fun6' +; Z13-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %r = sdiv <2 x i32> %a, %b +; Z13-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %r +; +; ARC15-LABEL: 'fun6' +; ARC15-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = sdiv <2 x i32> %a, %b +; ARC15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %r +; %r = sdiv <2 x i32> %a, %b ret <2 x i32> %r -; CHECK: Cost Model: Found an estimated cost of 48 for instruction: %r = sdiv <2 x i32> } define <8 x i16> @fun7(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: 'fun7' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1000 for instruction: %r = sdiv <8 x i16> %a, %b +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %r +; %r = sdiv <8 x i16> %a, %b ret <8 x i16> %r -; CHECK: Cost Model: Found an estimated cost of 1000 for instruction: %r = sdiv <8 x i16> } define <4 x i16> @fun8(<4 x i16> %a, <4 x i16> %b) { +; CHECK-LABEL: 'fun8' +; CHECK-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %r = sdiv <4 x i16> %a, %b +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i16> %r +; %r = sdiv <4 x i16> %a, %b ret <4 x i16> %r -; CHECK: Cost Model: Found an estimated cost of 94 for instruction: %r = sdiv <4 x i16> } define <16 x i8> @fun9(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: 'fun9' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1000 for instruction: %r = sdiv <16 x i8> %a, %b +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %r +; %r = sdiv <16 x i8> %a, %b ret <16 x i8> %r -; CHECK: Cost Model: Found an estimated cost of 1000 for instruction: %r = 
sdiv <16 x i8> } define <8 x i8> @fun10(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: 'fun10' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1000 for instruction: %r = sdiv <8 x i8> %a, %b +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i8> %r +; %r = sdiv <8 x i8> %a, %b ret <8 x i8> %r -; CHECK: Cost Model: Found an estimated cost of 1000 for instruction: %r = sdiv <8 x i8> } ; Scalar udiv define i64 @fun11(i64 %a, i64 %b) { +; CHECK-LABEL: 'fun11' +; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = udiv i64 %a, %b +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r +; %r = udiv i64 %a, %b ret i64 %r -; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %r = udiv i64 %a, %b } define i32 @fun12(i32 %a, i32 %b) { +; CHECK-LABEL: 'fun12' +; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = udiv i32 %a, %b +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %r +; %r = udiv i32 %a, %b ret i32 %r -; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %r = udiv i32 } define i16 @fun13(i16 %a, i16 %b) { +; CHECK-LABEL: 'fun13' +; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = udiv i16 %a, %b +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %r +; %r = udiv i16 %a, %b ret i16 %r -; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %r = udiv i16 } define i8 @fun14(i8 %a, i8 %b) { +; CHECK-LABEL: 'fun14' +; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = udiv i8 %a, %b +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %r +; %r = udiv i8 %a, %b ret i8 %r -; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %r = udiv i8 } ; Vector udiv define <2 x i64> @fun15(<2 x i64> %a, <2 x i64> %b) { +; Z13-LABEL: 'fun15' +; Z13-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %r = udiv <2 x i64> %a, %b +; Z13-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %r +; +; ARC15-LABEL: 'fun15' +; ARC15-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = udiv <2 x i64> %a, %b +; ARC15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %r +; %r = udiv <2 x i64> %a, %b ret <2 x i64> %r -; CHECK: Cost Model: Found an estimated cost of 47 for instruction: %r = udiv <2 x i64> } define <4 x i32> @fun16(<4 x i32> %a, <4 x i32> %b) { +; Z13-LABEL: 'fun16' +; Z13-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %r = udiv <4 x i32> %a, %b +; Z13-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %r +; +; ARC15-LABEL: 'fun16' +; ARC15-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = udiv <4 x i32> %a, %b +; ARC15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %r +; %r = udiv <4 x i32> %a, %b ret <4 x i32> %r -; CHECK: Cost Model: Found an estimated cost of 94 for instruction: %r = udiv <4 x i32> } define <2 x i32> @fun17(<2 x i32> %a, <2 x i32> %b) { +; Z13-LABEL: 'fun17' +; Z13-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %r = udiv <2 x i32> %a, %b +; Z13-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %r +; +; ARC15-LABEL: 'fun17' +; ARC15-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = udiv <2 x i32> %a, %b +; ARC15-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: ret <2 x i32> %r +; %r = udiv <2 x i32> %a, %b ret <2 x i32> %r -; CHECK: Cost Model: Found an estimated cost of 48 for instruction: %r = udiv <2 x i32> } define <8 x i16> @fun18(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: 'fun18' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1000 for instruction: %r = udiv <8 x i16> %a, %b +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %r +; %r = udiv <8 x i16> %a, %b ret <8 x i16> %r -; CHECK: Cost Model: Found an estimated cost of 1000 for instruction: %r = udiv <8 x i16> } define <4 x i16> @fun19(<4 x i16> %a, <4 x i16> %b) { +; CHECK-LABEL: 'fun19' +; CHECK-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %r = udiv <4 x i16> %a, %b +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i16> %r +; %r = udiv <4 x i16> %a, %b ret <4 x i16> %r -; CHECK: Cost Model: Found an estimated cost of 94 for instruction: %r = udiv <4 x i16> } define <16 x i8> @fun20(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: 'fun20' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1000 for instruction: %r = udiv <16 x i8> %a, %b +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %r +; %r = udiv <16 x i8> %a, %b ret <16 x i8> %r -; CHECK: Cost Model: Found an estimated cost of 1000 for instruction: %r = udiv <16 x i8> } define <8 x i8> @fun21(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: 'fun21' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1000 for instruction: %r = udiv <8 x i8> %a, %b +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i8> %r +; %r = udiv <8 x i8> %a, %b ret <8 x i8> %r -; CHECK: Cost Model: Found an estimated cost of 1000 for instruction: %r = udiv <8 x i8> } ; Scalar srem define i64 @fun22(i64 %a, i64 %b) { +; CHECK-LABEL: 'fun22' +; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = srem i64 %a, %b +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r +; %r = srem i64 %a, %b ret i64 %r -; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %r = srem i64 } define i32 @fun23(i32 %a, i32 %b) { +; CHECK-LABEL: 'fun23' +; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = srem i32 %a, %b +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %r +; %r = srem i32 %a, %b ret i32 %r -; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %r = srem i32 } define i16 @fun24(i16 %a, i16 %b) { +; CHECK-LABEL: 'fun24' +; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = srem i16 %a, %b +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %r +; %r = srem i16 %a, %b ret i16 %r -; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %r = srem i16 } define i8 @fun25(i8 %a, i8 %b) { +; CHECK-LABEL: 'fun25' +; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = srem i8 %a, %b +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %r +; %r = srem i8 %a, %b ret i8 %r -; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %r = srem i8 } ; Vector srem define <2 x i64> @fun26(<2 x i64> %a, <2 x i64> %b) { +; Z13-LABEL: 'fun26' +; Z13-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %r = srem <2 x i64> %a, %b +; Z13-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %r +; +; 
ARC15-LABEL: 'fun26' +; ARC15-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = srem <2 x i64> %a, %b +; ARC15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %r +; %r = srem <2 x i64> %a, %b ret <2 x i64> %r -; CHECK: Cost Model: Found an estimated cost of 47 for instruction: %r = srem <2 x i64> } define <4 x i32> @fun27(<4 x i32> %a, <4 x i32> %b) { +; Z13-LABEL: 'fun27' +; Z13-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %r = srem <4 x i32> %a, %b +; Z13-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %r +; +; ARC15-LABEL: 'fun27' +; ARC15-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = srem <4 x i32> %a, %b +; ARC15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %r +; %r = srem <4 x i32> %a, %b ret <4 x i32> %r -; CHECK: Cost Model: Found an estimated cost of 94 for instruction: %r = srem <4 x i32> } define <2 x i32> @fun28(<2 x i32> %a, <2 x i32> %b) { +; Z13-LABEL: 'fun28' +; Z13-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %r = srem <2 x i32> %a, %b +; Z13-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %r +; +; ARC15-LABEL: 'fun28' +; ARC15-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = srem <2 x i32> %a, %b +; ARC15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %r +; %r = srem <2 x i32> %a, %b ret <2 x i32> %r -; CHECK: Cost Model: Found an estimated cost of 48 for instruction: %r = srem <2 x i32> } define <8 x i16> @fun29(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: 'fun29' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1000 for instruction: %r = srem <8 x i16> %a, %b +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %r +; %r = srem <8 x i16> %a, %b ret <8 x i16> %r -; CHECK: ost Model: Found an estimated cost of 1000 for instruction: %r = srem <8 x i16> } define <4 x i16> @fun30(<4 x i16> %a, <4 x i16> %b) { +; CHECK-LABEL: 'fun30' +; CHECK-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %r = srem <4 x i16> %a, %b +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i16> %r +; %r = srem <4 x i16> %a, %b ret <4 x i16> %r -; CHECK: Cost Model: Found an estimated cost of 94 for instruction: %r = srem <4 x i16> } define <16 x i8> @fun31(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: 'fun31' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1000 for instruction: %r = srem <16 x i8> %a, %b +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %r +; %r = srem <16 x i8> %a, %b ret <16 x i8> %r -; CHECK: Cost Model: Found an estimated cost of 1000 for instruction: %r = srem <16 x i8> } define <8 x i8> @fun32(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: 'fun32' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1000 for instruction: %r = srem <8 x i8> %a, %b +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i8> %r +; %r = srem <8 x i8> %a, %b ret <8 x i8> %r -; CHECK: Cost Model: Found an estimated cost of 1000 for instruction: %r = srem <8 x i8> } ; Scalar urem define i64 @fun33(i64 %a, i64 %b) { +; CHECK-LABEL: 'fun33' +; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = urem i64 %a, %b +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r +; %r = urem i64 %a, %b ret i64 %r -; CHECK: Cost Model: Found an estimated cost 
of 20 for instruction: %r = urem i64 } define i32 @fun34(i32 %a, i32 %b) { +; CHECK-LABEL: 'fun34' +; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = urem i32 %a, %b +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %r +; %r = urem i32 %a, %b ret i32 %r -; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %r = urem i32 } define i16 @fun35(i16 %a, i16 %b) { +; CHECK-LABEL: 'fun35' +; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = urem i16 %a, %b +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %r +; %r = urem i16 %a, %b ret i16 %r -; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %r = urem i16 } define i8 @fun36(i8 %a, i8 %b) { +; CHECK-LABEL: 'fun36' +; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = urem i8 %a, %b +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %r +; %r = urem i8 %a, %b ret i8 %r -; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %r = urem i8 } ; Vector urem define <2 x i64> @fun37(<2 x i64> %a, <2 x i64> %b) { +; Z13-LABEL: 'fun37' +; Z13-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %r = urem <2 x i64> %a, %b +; Z13-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %r +; +; ARC15-LABEL: 'fun37' +; ARC15-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = urem <2 x i64> %a, %b +; ARC15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %r +; %r = urem <2 x i64> %a, %b ret <2 x i64> %r -; CHECK: Cost Model: Found an estimated cost of 47 for instruction: %r = urem <2 x i64> } define <4 x i32> @fun38(<4 x i32> %a, <4 x i32> %b) { +; Z13-LABEL: 'fun38' +; Z13-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %r = urem <4 x i32> %a, %b +; Z13-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %r +; +; ARC15-LABEL: 'fun38' +; ARC15-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = urem <4 x i32> %a, %b +; ARC15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %r +; %r = urem <4 x i32> %a, %b ret <4 x i32> %r -; CHECK: Cost Model: Found an estimated cost of 94 for instruction: %r = urem <4 x i32> } define <2 x i32> @fun39(<2 x i32> %a, <2 x i32> %b) { +; Z13-LABEL: 'fun39' +; Z13-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %r = urem <2 x i32> %a, %b +; Z13-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %r +; +; ARC15-LABEL: 'fun39' +; ARC15-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r = urem <2 x i32> %a, %b +; ARC15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %r +; %r = urem <2 x i32> %a, %b ret <2 x i32> %r -; CHECK: Cost Model: Found an estimated cost of 48 for instruction: %r = urem <2 x i32> } define <8 x i16> @fun40(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: 'fun40' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1000 for instruction: %r = urem <8 x i16> %a, %b +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %r +; %r = urem <8 x i16> %a, %b ret <8 x i16> %r -; CHECK: Cost Model: Found an estimated cost of 1000 for instruction: %r = urem <8 x i16> } define <4 x i16> @fun41(<4 x i16> %a, <4 x i16> %b) { +; CHECK-LABEL: 'fun41' +; CHECK-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %r = urem <4 
x i16> %a, %b +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i16> %r +; %r = urem <4 x i16> %a, %b ret <4 x i16> %r -; CHECK: Cost Model: Found an estimated cost of 94 for instruction: %r = urem <4 x i16> } define <16 x i8> @fun42(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: 'fun42' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1000 for instruction: %r = urem <16 x i8> %a, %b +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %r +; %r = urem <16 x i8> %a, %b ret <16 x i8> %r -; CHECK: Cost Model: Found an estimated cost of 1000 for instruction: %r = urem <16 x i8> } define <8 x i8> @fun43(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: 'fun43' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1000 for instruction: %r = urem <8 x i8> %a, %b +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i8> %r +; %r = urem <8 x i8> %a, %b ret <8 x i8> %r -; CHECK: Cost Model: Found an estimated cost of 1000 for instruction: %r = urem <8 x i8> +} + +; Also test some wider inputs: +define <8 x i64> @fun44(<8 x i64> %a, <8 x i64> %b) { +; Z13-LABEL: 'fun44' +; Z13-NEXT: Cost Model: Found an estimated cost of 1000 for instruction: %r = sdiv <8 x i64> %a, %b +; Z13-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %r +; +; ARC15-LABEL: 'fun44' +; ARC15-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %r = sdiv <8 x i64> %a, %b +; ARC15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %r +; + %r = sdiv <8 x i64> %a, %b + ret <8 x i64> %r +} + +define <8 x i32> @fun45(<8 x i32> %a, <8 x i32> %b) { +; Z13-LABEL: 'fun45' +; Z13-NEXT: Cost Model: Found an estimated cost of 1000 for instruction: %r = urem <8 x i32> %a, %b +; Z13-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i32> %r +; +; ARC15-LABEL: 'fun45' +; ARC15-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %r = urem <8 x i32> %a, %b +; ARC15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i32> %r +; + %r = urem <8 x i32> %a, %b + ret <8 x i32> %r } diff --git llvm/test/Analysis/CostModel/SystemZ/i128-cmp-ext-conv.ll llvm/test/Analysis/CostModel/SystemZ/i128-cmp-ext-conv.ll index 66da6de3bc76..105e634cea1a 100644 --- llvm/test/Analysis/CostModel/SystemZ/i128-cmp-ext-conv.ll +++ llvm/test/Analysis/CostModel/SystemZ/i128-cmp-ext-conv.ll @@ -1,10 +1,12 @@ -; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s +; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s --check-prefixes=CHECK,Z13 +; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=systemz-unknown -mcpu=arch15 | FileCheck %s --check-prefixes=CHECK,ARC15 ; define i128 @fun1(i128 %val1, i128 %val2) { ; CHECK-LABEL: 'fun1' ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %cmp = icmp eq i128 %val1, %val2 -; CHECK: Cost Model: Found an estimated cost of 5 for instruction: %v128 = sext i1 %cmp to i128 +; Z13: Cost Model: Found an estimated cost of 5 for instruction: %v128 = sext i1 %cmp to i128 +; ARC15: Cost Model: Found an estimated cost of 0 for instruction: %v128 = sext i1 %cmp to i128 %cmp = icmp eq i128 %val1, %val2 %v128 = sext i1 %cmp to i128 ret i128 %v128 @@ -24,13 +26,39 @@ define i128 @fun3(i128 %val1, i128 %val2, ; CHECK-LABEL: 'fun3' ; CHECK: Cost Model: Found an estimated cost of 1 
for instruction: %cmp = icmp eq i128 %val1, %val2 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %add = add i128 %val3, %val4 -; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %sel = select i1 %cmp, i128 %val3, i128 %add +; Z13: Cost Model: Found an estimated cost of 4 for instruction: %sel = select i1 %cmp, i128 %val3, i128 %add +; ARC15: Cost Model: Found an estimated cost of 1 for instruction: %sel = select i1 %cmp, i128 %val3, i128 %add %cmp = icmp eq i128 %val1, %val2 %add = add i128 %val3, %val4 %sel = select i1 %cmp, i128 %val3, i128 %add ret i128 %sel } +define i64 @fun3_sel64(i128 %val1, i128 %val2, + i64 %val3, i64 %val4) { +; CHECK-LABEL: 'fun3_sel64' +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %cmp = icmp ugt i128 %val1, %val2 +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %add = add i64 %val3, %val4 +; Z13: Cost Model: Found an estimated cost of 4 for instruction: %sel = select i1 %cmp, i64 %val3, i64 %add +; ARC15: Cost Model: Found an estimated cost of 1 for instruction: %sel = select i1 %cmp, i64 %val3, i64 %add + %cmp = icmp ugt i128 %val1, %val2 + %add = add i64 %val3, %val4 + %sel = select i1 %cmp, i64 %val3, i64 %add + ret i64 %sel +} + +define i128 @fun3_cmp64(i64 %val1, i64 %val2, + i128 %val3, i128 %val4) { +; CHECK-LABEL: 'fun3_cmp64' +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %cmp = icmp slt i64 %val1, %val2 +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %add = add i128 %val3, %val4 +; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %sel = select i1 %cmp, i128 %val3, i128 %add + %cmp = icmp slt i64 %val1, %val2 + %add = add i128 %val3, %val4 + %sel = select i1 %cmp, i128 %val3, i128 %add + ret i128 %sel +} + define i128 @fun4(ptr %src) { ; CHECK-LABEL: 'fun4' ; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %res = sext i64 %v to i128 diff --git llvm/test/Analysis/CostModel/SystemZ/int-arith.ll llvm/test/Analysis/CostModel/SystemZ/int-arith.ll index fc4d19c5cdf9..bf5cbfb48a77 100644 --- llvm/test/Analysis/CostModel/SystemZ/int-arith.ll +++ llvm/test/Analysis/CostModel/SystemZ/int-arith.ll @@ -1,4 +1,5 @@ ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s +; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=systemz-unknown -mcpu=arch15 | FileCheck %s -check-prefix=ARC15 ; ; Note: The scalarized vector instructions costs are not including any extracts, due to the undef operands.
@@ -131,18 +132,22 @@ define void @mul() { ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res5 = mul <2 x i16> undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res6 = mul <2 x i32> undef, undef ; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %res7 = mul <2 x i64> undef, undef +; ARC15: Cost Model: Found an estimated cost of 1 for instruction: %res7 = mul <2 x i64> undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res8 = mul <4 x i8> undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res9 = mul <4 x i16> undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res10 = mul <4 x i32> undef, undef ; CHECK: Cost Model: Found an estimated cost of 6 for instruction: %res11 = mul <4 x i64> undef, undef +; ARC15: Cost Model: Found an estimated cost of 2 for instruction: %res11 = mul <4 x i64> undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res12 = mul <8 x i8> undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res13 = mul <8 x i16> undef, undef ; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %res14 = mul <8 x i32> undef, undef ; CHECK: Cost Model: Found an estimated cost of 12 for instruction: %res15 = mul <8 x i64> undef, undef +; ARC15: Cost Model: Found an estimated cost of 4 for instruction: %res15 = mul <8 x i64> undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res16 = mul <16 x i8> undef, undef ; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %res17 = mul <16 x i16> undef, undef ; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %res18 = mul <16 x i32> undef, undef ; CHECK: Cost Model: Found an estimated cost of 24 for instruction: %res19 = mul <16 x i64> undef, undef +; ARC15: Cost Model: Found an estimated cost of 8 for instruction: %res19 = mul <16 x i64> undef, undef ret void; } diff --git llvm/test/CodeGen/AArch64/aarch64-large-stack-spbump.mir llvm/test/CodeGen/AArch64/aarch64-large-stack-spbump.mir new file mode 100644 index 000000000000..f920813f2b42 --- /dev/null +++ llvm/test/CodeGen/AArch64/aarch64-large-stack-spbump.mir @@ -0,0 +1,46 @@ +# RUN: llc -mtriple=aarch64 -run-pass=prologepilog %s -o - | FileCheck %s +--- | + define i32 @_Z4funcv() { + entry: + %array = alloca [1073741824 x i32], align 4 + %arrayidx = getelementptr inbounds [1073741824 x i32], ptr %array, i64 0, i64 20 + store i32 7, ptr %arrayidx, align 4 + call void @_Z5func2v() + %arrayidx1 = getelementptr inbounds [1073741824 x i32], ptr %array, i64 0, i64 20 + %0 = load i32, ptr %arrayidx1, align 4 + ret i32 %0 + } + + declare void @_Z5func2v() +... 
+--- +name: _Z4funcv +alignment: 4 +legalized: true +regBankSelected: true +selected: true +tracksRegLiveness: true +noPhis: true +isSSA: false +noVRegs: true +hasFakeUses: false +frameInfo: + maxAlignment: 4 + adjustsStack: true + hasCalls: true + maxCallFrameSize: 0 +stack: + - { id: 0, name: array, size: 4294967296, alignment: 4, local-offset: -4294967296 } +machineFunctionInfo: {} +body: | + bb.1.entry: + renamable $w8 = MOVi32imm 7 + STRWui killed renamable $w8, %stack.0.array, 20 :: (store (s32) into %ir.arrayidx) + ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp + BL @_Z5func2v, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp + ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp + renamable $w0 = LDRWui %stack.0.array, 20 :: (dereferenceable load (s32) from %ir.arrayidx1) + ; CHECK: early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 + RET_ReallyLR implicit killed $w0 + +... diff --git llvm/test/CodeGen/AArch64/adds_cmn.ll llvm/test/CodeGen/AArch64/adds_cmn.ll index 674a3893653a..7f1cb0df049b 100644 --- llvm/test/CodeGen/AArch64/adds_cmn.ll +++ llvm/test/CodeGen/AArch64/adds_cmn.ll @@ -62,10 +62,8 @@ entry: define { i32, i32 } @subs_cmp_c(i32 noundef %x, i32 noundef %y) { ; CHECK-LABEL: subs_cmp_c: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: cmp w0, w1 -; CHECK-NEXT: sub w1, w1, w0 -; CHECK-NEXT: cset w8, hs -; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: subs w1, w1, w0 +; CHECK-NEXT: cset w0, ls ; CHECK-NEXT: ret entry: %0 = tail call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %x, i32 %y) diff --git llvm/test/CodeGen/AArch64/bf16-convert-intrinsics.ll llvm/test/CodeGen/AArch64/bf16-convert-intrinsics.ll index 9d4e79d38d5d..64bc95f2f389 100644 --- llvm/test/CodeGen/AArch64/bf16-convert-intrinsics.ll +++ llvm/test/CodeGen/AArch64/bf16-convert-intrinsics.ll @@ -1,5 +1,8 @@ ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64 -mattr=+neon -mattr=+bf16 | FileCheck %s +; This test exercises the old neon.bfcvt intrinsics, which are now +; autoupgraded to fptrunc operations.
+ declare bfloat @llvm.aarch64.neon.bfcvt(float) declare <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float>) declare <8 x bfloat> @llvm.aarch64.neon.bfcvtn2(<8 x bfloat>, <4 x float>) diff --git llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll index 9b6e19eba3f4..1cd0294b0083 100644 --- llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll +++ llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll @@ -22,7 +22,6 @@ define <4 x bfloat> @add_h(<4 x bfloat> %a, <4 x bfloat> %b) { ; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-BF16-NEXT: fadd v0.4s, v0.4s, v1.4s ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-BF16-NEXT: ret entry: @@ -62,7 +61,6 @@ define <4 x bfloat> @sub_h(<4 x bfloat> %a, <4 x bfloat> %b) { ; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-BF16-NEXT: fsub v0.4s, v0.4s, v1.4s ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-BF16-NEXT: ret entry: @@ -91,7 +89,6 @@ define <4 x bfloat> @mul_h(<4 x bfloat> %a, <4 x bfloat> %b) { ; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-BF16-NEXT: fmul v0.4s, v0.4s, v1.4s ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-BF16-NEXT: ret entry: @@ -120,7 +117,6 @@ define <4 x bfloat> @div_h(<4 x bfloat> %a, <4 x bfloat> %b) { ; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-BF16-NEXT: fdiv v0.4s, v0.4s, v1.4s ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-BF16-NEXT: ret entry: @@ -168,7 +164,6 @@ define <4 x bfloat> @s_to_h(<4 x float> %a) { ; CHECK-BF16-LABEL: s_to_h: ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-BF16-NEXT: ret %1 = fptrunc <4 x float> %a to <4 x bfloat> ret <4 x bfloat> %1 @@ -196,7 +191,6 @@ define <4 x bfloat> @d_to_h(<4 x double> %a) { ; CHECK-BF16-NEXT: fcvtxn v0.2s, v0.2d ; CHECK-BF16-NEXT: fcvtxn2 v0.4s, v1.2d ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-BF16-NEXT: ret %1 = fptrunc <4 x double> %a to <4 x bfloat> ret <4 x bfloat> %1 @@ -262,7 +256,6 @@ define <4 x bfloat> @sitofp_i8(<4 x i8> %a) #0 { ; CHECK-BF16-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-BF16-NEXT: scvtf v0.4s, v0.4s ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-BF16-NEXT: ret %1 = sitofp <4 x i8> %a to <4 x bfloat> ret <4 x bfloat> %1 @@ -286,7 +279,6 @@ define <4 x bfloat> @sitofp_i16(<4 x i16> %a) #0 { ; CHECK-BF16-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-BF16-NEXT: scvtf v0.4s, v0.4s ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-BF16-NEXT: ret %1 = sitofp <4 x i16> %a to <4 x bfloat> ret <4 x bfloat> %1 @@ -309,7 +301,6 @@ define <4 x bfloat> @sitofp_i32(<4 x i32> %a) #0 { ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: scvtf v0.4s, v0.4s ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-BF16-NEXT: ret %1 = sitofp <4 x i32> %a to <4 x bfloat> ret <4 x bfloat> %1 @@ -342,7 +333,6 @@ define <4 x bfloat> @sitofp_i64(<4 x i64> %a) #0 { ; CHECK-BF16-NEXT: fcvtn v0.2s, v0.2d ; CHECK-BF16-NEXT: fcvtn2 v0.4s, v1.2d ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-BF16-NEXT: ret %1 = sitofp <4 x i64> %a to <4 x bfloat> ret <4 
x bfloat> %1 @@ -368,7 +358,6 @@ define <4 x bfloat> @uitofp_i8(<4 x i8> %a) #0 { ; CHECK-BF16-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-BF16-NEXT: ucvtf v0.4s, v0.4s ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-BF16-NEXT: ret %1 = uitofp <4 x i8> %a to <4 x bfloat> ret <4 x bfloat> %1 @@ -393,7 +382,6 @@ define <4 x bfloat> @uitofp_i16(<4 x i16> %a) #0 { ; CHECK-BF16-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-BF16-NEXT: ucvtf v0.4s, v0.4s ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-BF16-NEXT: ret %1 = uitofp <4 x i16> %a to <4 x bfloat> ret <4 x bfloat> %1 @@ -416,7 +404,6 @@ define <4 x bfloat> @uitofp_i32(<4 x i32> %a) #0 { ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: ucvtf v0.4s, v0.4s ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-BF16-NEXT: ret %1 = uitofp <4 x i32> %a to <4 x bfloat> ret <4 x bfloat> %1 @@ -449,7 +436,6 @@ define <4 x bfloat> @uitofp_i64(<4 x i64> %a) #0 { ; CHECK-BF16-NEXT: fcvtn v0.2s, v0.2d ; CHECK-BF16-NEXT: fcvtn2 v0.4s, v1.2d ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-BF16-NEXT: ret %1 = uitofp <4 x i64> %a to <4 x bfloat> ret <4 x bfloat> %1 diff --git llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll index a609e33be935..2eaa58de9280 100644 --- llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll +++ llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll @@ -221,9 +221,8 @@ define <8 x bfloat> @s_to_h(<8 x float> %a) { ; ; CHECK-BF16-LABEL: s_to_h: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: bfcvtn v1.4h, v1.4s ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: mov v0.d[1], v1.d[0] +; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v1.4s ; CHECK-BF16-NEXT: ret %1 = fptrunc <8 x float> %a to <8 x bfloat> ret <8 x bfloat> %1 @@ -257,13 +256,12 @@ define <8 x bfloat> @d_to_h(<8 x double> %a) { ; ; CHECK-BF16-LABEL: d_to_h: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: fcvtxn v2.2s, v2.2d ; CHECK-BF16-NEXT: fcvtxn v0.2s, v0.2d -; CHECK-BF16-NEXT: fcvtxn2 v2.4s, v3.2d +; CHECK-BF16-NEXT: fcvtxn v2.2s, v2.2d ; CHECK-BF16-NEXT: fcvtxn2 v0.4s, v1.2d -; CHECK-BF16-NEXT: bfcvtn v1.4h, v2.4s +; CHECK-BF16-NEXT: fcvtxn2 v2.4s, v3.2d ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: mov v0.d[1], v1.d[0] +; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v2.4s ; CHECK-BF16-NEXT: ret %1 = fptrunc <8 x double> %a to <8 x bfloat> ret <8 x bfloat> %1 @@ -334,7 +332,6 @@ define <4 x bfloat> @sitofp_v4i8(<4 x i8> %a) #0 { ; CHECK-BF16-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-BF16-NEXT: scvtf v0.4s, v0.4s ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-BF16-NEXT: ret %1 = sitofp <4 x i8> %a to <4 x bfloat> ret <4 x bfloat> %1 @@ -363,13 +360,12 @@ define <8 x bfloat> @sitofp_v8i8(<8 x i8> %a) #0 { ; CHECK-BF16-LABEL: sitofp_v8i8: ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-BF16-NEXT: sshll2 v1.4s, v0.8h, #0 -; CHECK-BF16-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-BF16-NEXT: sshll v1.4s, v0.4h, #0 +; CHECK-BF16-NEXT: sshll2 v2.4s, v0.8h, #0 ; CHECK-BF16-NEXT: scvtf v1.4s, v1.4s -; CHECK-BF16-NEXT: scvtf v0.4s, v0.4s -; CHECK-BF16-NEXT: bfcvtn v1.4h, v1.4s -; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: mov v0.d[1], v1.d[0] +; CHECK-BF16-NEXT: bfcvtn v0.4h, v1.4s +; CHECK-BF16-NEXT: scvtf v1.4s, v2.4s +; CHECK-BF16-NEXT: bfcvtn2 v0.8h, 
v1.4s ; CHECK-BF16-NEXT: ret %1 = sitofp <8 x i8> %a to <8 x bfloat> ret <8 x bfloat> %1 @@ -412,20 +408,18 @@ define <16 x bfloat> @sitofp_v16i8(<16 x i8> %a) #0 { ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: sshll2 v1.8h, v0.16b, #0 ; CHECK-BF16-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-BF16-NEXT: sshll2 v2.4s, v1.8h, #0 -; CHECK-BF16-NEXT: sshll v1.4s, v1.4h, #0 -; CHECK-BF16-NEXT: sshll2 v3.4s, v0.8h, #0 -; CHECK-BF16-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-BF16-NEXT: sshll v2.4s, v1.4h, #0 +; CHECK-BF16-NEXT: sshll v3.4s, v0.4h, #0 +; CHECK-BF16-NEXT: sshll2 v4.4s, v1.8h, #0 +; CHECK-BF16-NEXT: sshll2 v5.4s, v0.8h, #0 ; CHECK-BF16-NEXT: scvtf v2.4s, v2.4s -; CHECK-BF16-NEXT: scvtf v1.4s, v1.4s ; CHECK-BF16-NEXT: scvtf v3.4s, v3.4s -; CHECK-BF16-NEXT: scvtf v0.4s, v0.4s -; CHECK-BF16-NEXT: bfcvtn v2.4h, v2.4s -; CHECK-BF16-NEXT: bfcvtn v1.4h, v1.4s -; CHECK-BF16-NEXT: bfcvtn v3.4h, v3.4s -; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: mov v1.d[1], v2.d[0] -; CHECK-BF16-NEXT: mov v0.d[1], v3.d[0] +; CHECK-BF16-NEXT: bfcvtn v1.4h, v2.4s +; CHECK-BF16-NEXT: scvtf v2.4s, v4.4s +; CHECK-BF16-NEXT: bfcvtn v0.4h, v3.4s +; CHECK-BF16-NEXT: scvtf v3.4s, v5.4s +; CHECK-BF16-NEXT: bfcvtn2 v1.8h, v2.4s +; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v3.4s ; CHECK-BF16-NEXT: ret %1 = sitofp <16 x i8> %a to <16 x bfloat> ret <16 x bfloat> %1 @@ -452,13 +446,12 @@ define <8 x bfloat> @sitofp_i16(<8 x i16> %a) #0 { ; ; CHECK-BF16-LABEL: sitofp_i16: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: sshll2 v1.4s, v0.8h, #0 -; CHECK-BF16-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-BF16-NEXT: sshll v1.4s, v0.4h, #0 +; CHECK-BF16-NEXT: sshll2 v2.4s, v0.8h, #0 ; CHECK-BF16-NEXT: scvtf v1.4s, v1.4s -; CHECK-BF16-NEXT: scvtf v0.4s, v0.4s -; CHECK-BF16-NEXT: bfcvtn v1.4h, v1.4s -; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: mov v0.d[1], v1.d[0] +; CHECK-BF16-NEXT: bfcvtn v0.4h, v1.4s +; CHECK-BF16-NEXT: scvtf v1.4s, v2.4s +; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v1.4s ; CHECK-BF16-NEXT: ret %1 = sitofp <8 x i16> %a to <8 x bfloat> ret <8 x bfloat> %1 @@ -483,11 +476,10 @@ define <8 x bfloat> @sitofp_i32(<8 x i32> %a) #0 { ; ; CHECK-BF16-LABEL: sitofp_i32: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: scvtf v1.4s, v1.4s ; CHECK-BF16-NEXT: scvtf v0.4s, v0.4s -; CHECK-BF16-NEXT: bfcvtn v1.4h, v1.4s +; CHECK-BF16-NEXT: scvtf v1.4s, v1.4s ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: mov v0.d[1], v1.d[0] +; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v1.4s ; CHECK-BF16-NEXT: ret %1 = sitofp <8 x i32> %a to <8 x bfloat> ret <8 x bfloat> %1 @@ -526,17 +518,16 @@ define <8 x bfloat> @sitofp_i64(<8 x i64> %a) #0 { ; ; CHECK-BF16-LABEL: sitofp_i64: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: scvtf v2.2d, v2.2d ; CHECK-BF16-NEXT: scvtf v0.2d, v0.2d -; CHECK-BF16-NEXT: scvtf v3.2d, v3.2d +; CHECK-BF16-NEXT: scvtf v2.2d, v2.2d ; CHECK-BF16-NEXT: scvtf v1.2d, v1.2d -; CHECK-BF16-NEXT: fcvtn v2.2s, v2.2d +; CHECK-BF16-NEXT: scvtf v3.2d, v3.2d ; CHECK-BF16-NEXT: fcvtn v0.2s, v0.2d -; CHECK-BF16-NEXT: fcvtn2 v2.4s, v3.2d +; CHECK-BF16-NEXT: fcvtn v2.2s, v2.2d ; CHECK-BF16-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-BF16-NEXT: bfcvtn v1.4h, v2.4s +; CHECK-BF16-NEXT: fcvtn2 v2.4s, v3.2d ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: mov v0.d[1], v1.d[0] +; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v2.4s ; CHECK-BF16-NEXT: ret %1 = sitofp <8 x i64> %a to <8 x bfloat> ret <8 x bfloat> %1 @@ -562,7 +553,6 @@ define <4 x bfloat> @uitofp_v4i8(<4 x i8> %a) #0 { ; CHECK-BF16-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-BF16-NEXT: ucvtf v0.4s, v0.4s ; 
CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-BF16-NEXT: ret %1 = uitofp <4 x i8> %a to <4 x bfloat> ret <4 x bfloat> %1 @@ -591,13 +581,12 @@ define <8 x bfloat> @uitofp_v8i8(<8 x i8> %a) #0 { ; CHECK-BF16-LABEL: uitofp_v8i8: ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-BF16-NEXT: ushll2 v1.4s, v0.8h, #0 -; CHECK-BF16-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-BF16-NEXT: ushll v1.4s, v0.4h, #0 +; CHECK-BF16-NEXT: ushll2 v2.4s, v0.8h, #0 ; CHECK-BF16-NEXT: ucvtf v1.4s, v1.4s -; CHECK-BF16-NEXT: ucvtf v0.4s, v0.4s -; CHECK-BF16-NEXT: bfcvtn v1.4h, v1.4s -; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: mov v0.d[1], v1.d[0] +; CHECK-BF16-NEXT: bfcvtn v0.4h, v1.4s +; CHECK-BF16-NEXT: ucvtf v1.4s, v2.4s +; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v1.4s ; CHECK-BF16-NEXT: ret %1 = uitofp <8 x i8> %a to <8 x bfloat> ret <8 x bfloat> %1 @@ -640,20 +629,18 @@ define <16 x bfloat> @uitofp_v16i8(<16 x i8> %a) #0 { ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: ushll2 v1.8h, v0.16b, #0 ; CHECK-BF16-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-BF16-NEXT: ushll2 v2.4s, v1.8h, #0 -; CHECK-BF16-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-BF16-NEXT: ushll2 v3.4s, v0.8h, #0 -; CHECK-BF16-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-BF16-NEXT: ushll v2.4s, v1.4h, #0 +; CHECK-BF16-NEXT: ushll v3.4s, v0.4h, #0 +; CHECK-BF16-NEXT: ushll2 v4.4s, v1.8h, #0 +; CHECK-BF16-NEXT: ushll2 v5.4s, v0.8h, #0 ; CHECK-BF16-NEXT: ucvtf v2.4s, v2.4s -; CHECK-BF16-NEXT: ucvtf v1.4s, v1.4s ; CHECK-BF16-NEXT: ucvtf v3.4s, v3.4s -; CHECK-BF16-NEXT: ucvtf v0.4s, v0.4s -; CHECK-BF16-NEXT: bfcvtn v2.4h, v2.4s -; CHECK-BF16-NEXT: bfcvtn v1.4h, v1.4s -; CHECK-BF16-NEXT: bfcvtn v3.4h, v3.4s -; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: mov v1.d[1], v2.d[0] -; CHECK-BF16-NEXT: mov v0.d[1], v3.d[0] +; CHECK-BF16-NEXT: bfcvtn v1.4h, v2.4s +; CHECK-BF16-NEXT: ucvtf v2.4s, v4.4s +; CHECK-BF16-NEXT: bfcvtn v0.4h, v3.4s +; CHECK-BF16-NEXT: ucvtf v3.4s, v5.4s +; CHECK-BF16-NEXT: bfcvtn2 v1.8h, v2.4s +; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v3.4s ; CHECK-BF16-NEXT: ret %1 = uitofp <16 x i8> %a to <16 x bfloat> ret <16 x bfloat> %1 @@ -681,13 +668,12 @@ define <8 x bfloat> @uitofp_i16(<8 x i16> %a) #0 { ; ; CHECK-BF16-LABEL: uitofp_i16: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: ushll2 v1.4s, v0.8h, #0 -; CHECK-BF16-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-BF16-NEXT: ushll v1.4s, v0.4h, #0 +; CHECK-BF16-NEXT: ushll2 v2.4s, v0.8h, #0 ; CHECK-BF16-NEXT: ucvtf v1.4s, v1.4s -; CHECK-BF16-NEXT: ucvtf v0.4s, v0.4s -; CHECK-BF16-NEXT: bfcvtn v1.4h, v1.4s -; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: mov v0.d[1], v1.d[0] +; CHECK-BF16-NEXT: bfcvtn v0.4h, v1.4s +; CHECK-BF16-NEXT: ucvtf v1.4s, v2.4s +; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v1.4s ; CHECK-BF16-NEXT: ret %1 = uitofp <8 x i16> %a to <8 x bfloat> ret <8 x bfloat> %1 @@ -713,11 +699,10 @@ define <8 x bfloat> @uitofp_i32(<8 x i32> %a) #0 { ; ; CHECK-BF16-LABEL: uitofp_i32: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: ucvtf v1.4s, v1.4s ; CHECK-BF16-NEXT: ucvtf v0.4s, v0.4s -; CHECK-BF16-NEXT: bfcvtn v1.4h, v1.4s +; CHECK-BF16-NEXT: ucvtf v1.4s, v1.4s ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: mov v0.d[1], v1.d[0] +; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v1.4s ; CHECK-BF16-NEXT: ret %1 = uitofp <8 x i32> %a to <8 x bfloat> ret <8 x bfloat> %1 @@ -756,17 +741,16 @@ define <8 x bfloat> @uitofp_i64(<8 x i64> %a) #0 { ; ; CHECK-BF16-LABEL: uitofp_i64: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: ucvtf v2.2d, v2.2d ; 
CHECK-BF16-NEXT: ucvtf v0.2d, v0.2d -; CHECK-BF16-NEXT: ucvtf v3.2d, v3.2d +; CHECK-BF16-NEXT: ucvtf v2.2d, v2.2d ; CHECK-BF16-NEXT: ucvtf v1.2d, v1.2d -; CHECK-BF16-NEXT: fcvtn v2.2s, v2.2d +; CHECK-BF16-NEXT: ucvtf v3.2d, v3.2d ; CHECK-BF16-NEXT: fcvtn v0.2s, v0.2d -; CHECK-BF16-NEXT: fcvtn2 v2.4s, v3.2d +; CHECK-BF16-NEXT: fcvtn v2.2s, v2.2d ; CHECK-BF16-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-BF16-NEXT: bfcvtn v1.4h, v2.4s +; CHECK-BF16-NEXT: fcvtn2 v2.4s, v3.2d ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: mov v0.d[1], v1.d[0] +; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v2.4s ; CHECK-BF16-NEXT: ret %1 = uitofp <8 x i64> %a to <8 x bfloat> ret <8 x bfloat> %1 diff --git llvm/test/CodeGen/AArch64/csel-cmp-cse.ll llvm/test/CodeGen/AArch64/csel-cmp-cse.ll index d8904cc6e35e..e74532632332 100644 --- llvm/test/CodeGen/AArch64/csel-cmp-cse.ll +++ llvm/test/CodeGen/AArch64/csel-cmp-cse.ll @@ -335,6 +335,300 @@ define i32 @test_eq0_multi_use_sub_i32(i32 %x0, i32 %x1) { ret i32 %ret } +define i32 @test_eq_nonconst_sub_add_i32(i32 %x0, i32 %x1, i32 %x2) { +; CHECK-LABEL: test_eq_nonconst_sub_add_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: subs w8, w1, w2 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: csel w0, wzr, w8, eq +; CHECK-NEXT: ret + %cmp = icmp eq i32 %x1, %x2 + %add = add nuw i32 %x0, %x1 + %sub = sub i32 %add, %x2 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +define i32 @test_ne_nonconst_sub_add_i32(i32 %x0, i32 %x1, i32 %x2) { +; CHECK-LABEL: test_ne_nonconst_sub_add_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: subs w8, w1, w2 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: csel w0, wzr, w8, ne +; CHECK-NEXT: ret + %cmp = icmp ne i32 %x1, %x2 + %add = add nuw i32 %x0, %x1 + %sub = sub i32 %add, %x2 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +define i32 @test_ult_nonconst_i32(i32 %x0, i32 %x1, i32 %x2) { +; CHECK-LABEL: test_ult_nonconst_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: subs w8, w1, w2 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: csel w0, wzr, w8, lo +; CHECK-NEXT: ret + %cmp = icmp ult i32 %x1, %x2 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, %x2 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +define i32 @test_ule_nonconst_i32(i32 %x0, i32 %x1, i32 %x2) { +; CHECK-LABEL: test_ule_nonconst_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: subs w8, w1, w2 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: csel w0, wzr, w8, ls +; CHECK-NEXT: ret + %cmp = icmp ule i32 %x1, %x2 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, %x2 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +define i32 @test_ugt_nonconst_i32(i32 %x0, i32 %x1, i32 %x2) { +; CHECK-LABEL: test_ugt_nonconst_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: subs w8, w1, w2 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: csel w0, wzr, w8, hi +; CHECK-NEXT: ret + %cmp = icmp ugt i32 %x1, %x2 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, %x2 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +define i32 @test_uge_nonconst_i32(i32 %x0, i32 %x1, i32 %x2) { +; CHECK-LABEL: test_uge_nonconst_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: subs w8, w1, w2 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: csel w0, wzr, w8, hs +; CHECK-NEXT: ret + %cmp = icmp uge i32 %x1, %x2 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, %x2 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +define i32 @test_slt_nonconst_i32(i32 %x0, i32 %x1, i32 %x2) { +; CHECK-LABEL: test_slt_nonconst_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: subs w8, w1, w2 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: csel w0, wzr, w8, lt +; 
CHECK-NEXT: ret + %cmp = icmp slt i32 %x1, %x2 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, %x2 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +define i32 @test_sle_nonconst_i32(i32 %x0, i32 %x1, i32 %x2) { +; CHECK-LABEL: test_sle_nonconst_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: subs w8, w1, w2 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: csel w0, wzr, w8, le +; CHECK-NEXT: ret + %cmp = icmp sle i32 %x1, %x2 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, %x2 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +define i32 @test_sgt_nonconst_i32(i32 %x0, i32 %x1, i32 %x2) { +; CHECK-LABEL: test_sgt_nonconst_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: subs w8, w1, w2 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: csel w0, wzr, w8, gt +; CHECK-NEXT: ret + %cmp = icmp sgt i32 %x1, %x2 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, %x2 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +define i32 @test_sge_nonconst_i32(i32 %x0, i32 %x1, i32 %x2) { +; CHECK-LABEL: test_sge_nonconst_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: subs w8, w1, w2 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: csel w0, wzr, w8, ge +; CHECK-NEXT: ret + %cmp = icmp sge i32 %x1, %x2 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, %x2 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +define i64 @test_ult_nonconst_i64(i64 %x0, i64 %x1, i64 %x2) { +; CHECK-LABEL: test_ult_nonconst_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: subs x8, x1, x2 +; CHECK-NEXT: add x8, x8, x0 +; CHECK-NEXT: csel x0, xzr, x8, lo +; CHECK-NEXT: ret + %cmp = icmp ult i64 %x1, %x2 + %add = add i64 %x0, %x1 + %sub = sub i64 %add, %x2 + %ret = select i1 %cmp, i64 0, i64 %sub + ret i64 %ret +} + +define i32 @test_eq_nonconst_sub_add_comm_i32(i32 %x0, i32 %x1, i32 %x2) { +; CHECK-LABEL: test_eq_nonconst_sub_add_comm_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: subs w8, w1, w2 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: csel w0, wzr, w8, eq +; CHECK-NEXT: ret + %cmp = icmp eq i32 %x2, %x1 + %add = add nuw i32 %x0, %x1 + %sub = sub i32 %add, %x2 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +define i32 @test_ne_nonconst_sub_add_comm_i32(i32 %x0, i32 %x1, i32 %x2) { +; CHECK-LABEL: test_ne_nonconst_sub_add_comm_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: subs w8, w1, w2 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: csel w0, wzr, w8, ne +; CHECK-NEXT: ret + %cmp = icmp ne i32 %x2, %x1 + %add = add nuw i32 %x0, %x1 + %sub = sub i32 %add, %x2 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +define i32 @test_ult_nonconst_sub_add_comm_i32(i32 %x0, i32 %x1, i32 %x2) { +; CHECK-LABEL: test_ult_nonconst_sub_add_comm_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: subs w8, w1, w2 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: csel w0, wzr, w8, hi +; CHECK-NEXT: ret + %cmp = icmp ult i32 %x2, %x1 + %add = add nuw i32 %x0, %x1 + %sub = sub i32 %add, %x2 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +define i32 @test_ule_nonconst_sub_add_comm_i32(i32 %x0, i32 %x1, i32 %x2) { +; CHECK-LABEL: test_ule_nonconst_sub_add_comm_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: subs w8, w1, w2 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: csel w0, wzr, w8, hs +; CHECK-NEXT: ret + %cmp = icmp ule i32 %x2, %x1 + %add = add nuw i32 %x0, %x1 + %sub = sub i32 %add, %x2 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +define i32 @test_ugt_nonconst_sub_add_comm_i32(i32 %x0, i32 %x1, i32 %x2) { +; CHECK-LABEL: test_ugt_nonconst_sub_add_comm_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: subs w8, w1, w2 +; CHECK-NEXT: add w8, w8, 
w0 +; CHECK-NEXT: csel w0, wzr, w8, lo +; CHECK-NEXT: ret + %cmp = icmp ugt i32 %x2, %x1 + %add = add nuw i32 %x0, %x1 + %sub = sub i32 %add, %x2 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +define i32 @test_uge_nonconst_sub_add_comm_i32(i32 %x0, i32 %x1, i32 %x2) { +; CHECK-LABEL: test_uge_nonconst_sub_add_comm_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: subs w8, w1, w2 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: csel w0, wzr, w8, ls +; CHECK-NEXT: ret + %cmp = icmp uge i32 %x2, %x1 + %add = add nuw i32 %x0, %x1 + %sub = sub i32 %add, %x2 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +define i32 @test_slt_nonconst_sub_add_comm_i32(i32 %x0, i32 %x1, i32 %x2) { +; CHECK-LABEL: test_slt_nonconst_sub_add_comm_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: subs w8, w1, w2 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: csel w0, wzr, w8, gt +; CHECK-NEXT: ret + %cmp = icmp slt i32 %x2, %x1 + %add = add nuw i32 %x0, %x1 + %sub = sub i32 %add, %x2 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +define i32 @test_sle_nonconst_sub_add_comm_i32(i32 %x0, i32 %x1, i32 %x2) { +; CHECK-LABEL: test_sle_nonconst_sub_add_comm_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: subs w8, w1, w2 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: csel w0, wzr, w8, ge +; CHECK-NEXT: ret + %cmp = icmp sle i32 %x2, %x1 + %add = add nuw i32 %x0, %x1 + %sub = sub i32 %add, %x2 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +define i32 @test_sgt_nonconst_sub_add_comm_i32(i32 %x0, i32 %x1, i32 %x2) { +; CHECK-LABEL: test_sgt_nonconst_sub_add_comm_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: subs w8, w1, w2 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: csel w0, wzr, w8, lt +; CHECK-NEXT: ret + %cmp = icmp sgt i32 %x2, %x1 + %add = add nuw i32 %x0, %x1 + %sub = sub i32 %add, %x2 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +define i32 @test_sge_nonconst_sub_add_comm_i32(i32 %x0, i32 %x1, i32 %x2) { +; CHECK-LABEL: test_sge_nonconst_sub_add_comm_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: subs w8, w1, w2 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: csel w0, wzr, w8, le +; CHECK-NEXT: ret + %cmp = icmp sge i32 %x2, %x1 + %add = add nuw i32 %x0, %x1 + %sub = sub i32 %add, %x2 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + ; Negative test define i32 @test_eq0_multi_use_cmp_i32(i32 %x0, i32 %x1) { ; CHECK-LABEL: test_eq0_multi_use_cmp_i32: @@ -421,22 +715,6 @@ define i32 @test_ugtsmax_sub_add_i32(i32 %x0, i32 %x1) { ret i32 %ret } -; Negative test -define i32 @test_ult_nonconst_i32(i32 %x0, i32 %x1, i32 %x2) { -; CHECK-LABEL: test_ult_nonconst_i32: -; CHECK: // %bb.0: -; CHECK-NEXT: add w8, w0, w1 -; CHECK-NEXT: cmp w1, w2 -; CHECK-NEXT: sub w8, w8, w2 -; CHECK-NEXT: csel w0, wzr, w8, lo -; CHECK-NEXT: ret - %cmp = icmp ult i32 %x1, %x2 - %add = add i32 %x0, %x1 - %sub = sub i32 %add, %x2 - %ret = select i1 %cmp, i32 0, i32 %sub - ret i32 %ret -} - ; Negative test define i32 @test_eq_const_mismatch_i32(i32 %x0, i32 %x1) { ; CHECK-LABEL: test_eq_const_mismatch_i32: @@ -629,6 +907,40 @@ define i16 @test_eq0_sub_add_i16(i16 %x0, i16 %x1) { ret i16 %ret } +; Negative test +define i8 @test_eq_nonconst_sub_add_i8(i8 %x0, i8 %x1, i8 %x2) { +; CHECK-LABEL: test_eq_nonconst_sub_add_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: and w8, w1, #0xff +; CHECK-NEXT: add w9, w0, w1 +; CHECK-NEXT: sub w9, w9, w2 +; CHECK-NEXT: cmp w8, w2, uxtb +; CHECK-NEXT: csel w0, wzr, w9, eq +; CHECK-NEXT: ret + %cmp = icmp eq i8 %x1, %x2 + %add = add nuw i8 %x0, %x1 + %sub = sub i8 %add, %x2 + %ret = select i1 
%cmp, i8 0, i8 %sub + ret i8 %ret +} + +; Negative test +define i16 @test_eq_nonconst_sub_add_i16(i16 %x0, i16 %x1, i16 %x2) { +; CHECK-LABEL: test_eq_nonconst_sub_add_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: and w8, w1, #0xffff +; CHECK-NEXT: add w9, w0, w1 +; CHECK-NEXT: sub w9, w9, w2 +; CHECK-NEXT: cmp w8, w2, uxth +; CHECK-NEXT: csel w0, wzr, w9, eq +; CHECK-NEXT: ret + %cmp = icmp eq i16 %x1, %x2 + %add = add nuw i16 %x0, %x1 + %sub = sub i16 %add, %x2 + %ret = select i1 %cmp, i16 0, i16 %sub + ret i16 %ret +} + ; Negative test define i32 @test_ule_unsigned_overflow(i32 %x0, i32 %x1) { ; CHECK-LABEL: test_ule_unsigned_overflow: @@ -771,3 +1083,51 @@ define i32 @test_eq0_bitwidth_mismatch_2(i32 %x0, i64 %x1) { %ret = select i1 %cmp, i32 0, i32 %sub ret i32 %ret } + +; Negative test +define i32 @test_ult_nonconst_op_mismatch_i32(i32 %x0, i32 %x1, i32 %x2) { +; CHECK-LABEL: test_ult_nonconst_op_mismatch_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: add w8, w0, w1 +; CHECK-NEXT: cmp w1, w2 +; CHECK-NEXT: add w8, w8, w2 +; CHECK-NEXT: csel w0, wzr, w8, lo +; CHECK-NEXT: ret + %cmp = icmp ult i32 %x1, %x2 + %add = add i32 %x0, %x1 + %sub = add i32 %add, %x2 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +; Negative test +define i32 @test_ult_nonconst_unrelated_i32(i32 %x0, i32 %x1, i32 %x2, i32 %x3) { +; CHECK-LABEL: test_ult_nonconst_unrelated_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: add w8, w0, w1 +; CHECK-NEXT: cmp w1, w2 +; CHECK-NEXT: sub w8, w8, w3 +; CHECK-NEXT: csel w0, wzr, w8, lo +; CHECK-NEXT: ret + %cmp = icmp ult i32 %x1, %x2 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, %x3 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} + +; Negative test +define i32 @test_ult_nonconst_unrelated_2_i32(i32 %x0, i32 %x1, i32 %x2, i32 %x3) { +; CHECK-LABEL: test_ult_nonconst_unrelated_2_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: add w8, w0, w1 +; CHECK-NEXT: cmp w2, w1 +; CHECK-NEXT: sub w8, w8, w3 +; CHECK-NEXT: csel w0, wzr, w8, lo +; CHECK-NEXT: ret + %cmp = icmp ult i32 %x2, %x1 + %add = add i32 %x0, %x1 + %sub = sub i32 %add, %x3 + %ret = select i1 %cmp, i32 0, i32 %sub + ret i32 %ret +} diff --git llvm/test/CodeGen/AArch64/csel-subs-swapped.ll llvm/test/CodeGen/AArch64/csel-subs-swapped.ll index 7c628cf1683d..3971da27cddd 100644 --- llvm/test/CodeGen/AArch64/csel-subs-swapped.ll +++ llvm/test/CodeGen/AArch64/csel-subs-swapped.ll @@ -5,8 +5,7 @@ define i32 @eq_i32(i32 %x) { ; CHECK-LABEL: eq_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #-2097152 // =0xffe00000 -; CHECK-NEXT: cmn w0, #512, lsl #12 // =2097152 -; CHECK-NEXT: sub w8, w8, w0 +; CHECK-NEXT: subs w8, w8, w0 ; CHECK-NEXT: csel w0, w0, w8, eq ; CHECK-NEXT: ret %cmp = icmp eq i32 %x, -2097152 @@ -19,8 +18,7 @@ define i32 @ne_i32(i32 %x) { ; CHECK-LABEL: ne_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #-2097152 // =0xffe00000 -; CHECK-NEXT: cmn w0, #512, lsl #12 // =2097152 -; CHECK-NEXT: sub w8, w8, w0 +; CHECK-NEXT: subs w8, w8, w0 ; CHECK-NEXT: csel w0, w0, w8, ne ; CHECK-NEXT: ret %cmp = icmp ne i32 %x, -2097152 @@ -33,9 +31,8 @@ define i32 @sgt_i32(i32 %x) { ; CHECK-LABEL: sgt_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #-2097152 // =0xffe00000 -; CHECK-NEXT: cmn w0, #512, lsl #12 // =2097152 -; CHECK-NEXT: sub w8, w8, w0 -; CHECK-NEXT: csel w0, w0, w8, gt +; CHECK-NEXT: subs w8, w8, w0 +; CHECK-NEXT: csel w0, w0, w8, lt ; CHECK-NEXT: ret %cmp = icmp sgt i32 %x, -2097152 %sub = sub i32 -2097152, %x @@ -62,9 +59,8 @@ define i32 @slt_i32(i32 %x) { ; CHECK-LABEL: slt_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov 
w8, #-2097152 // =0xffe00000 -; CHECK-NEXT: cmn w0, #512, lsl #12 // =2097152 -; CHECK-NEXT: sub w8, w8, w0 -; CHECK-NEXT: csel w0, w0, w8, lt +; CHECK-NEXT: subs w8, w8, w0 +; CHECK-NEXT: csel w0, w0, w8, gt ; CHECK-NEXT: ret %cmp = icmp slt i32 %x, -2097152 %sub = sub i32 -2097152, %x @@ -91,9 +87,8 @@ define i32 @ugt_i32(i32 %x) { ; CHECK-LABEL: ugt_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #-2097152 // =0xffe00000 -; CHECK-NEXT: cmn w0, #512, lsl #12 // =2097152 -; CHECK-NEXT: sub w8, w8, w0 -; CHECK-NEXT: csel w0, w0, w8, hi +; CHECK-NEXT: subs w8, w8, w0 +; CHECK-NEXT: csel w0, w0, w8, lo ; CHECK-NEXT: ret %cmp = icmp ugt i32 %x, -2097152 %sub = sub i32 -2097152, %x @@ -120,9 +115,8 @@ define i32 @ult_i32(i32 %x) { ; CHECK-LABEL: ult_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #-2097152 // =0xffe00000 -; CHECK-NEXT: cmn w0, #512, lsl #12 // =2097152 -; CHECK-NEXT: sub w8, w8, w0 -; CHECK-NEXT: csel w0, w0, w8, lo +; CHECK-NEXT: subs w8, w8, w0 +; CHECK-NEXT: csel w0, w0, w8, hi ; CHECK-NEXT: ret %cmp = icmp ult i32 %x, -2097152 %sub = sub i32 -2097152, %x @@ -150,8 +144,7 @@ define i64 @eq_i64(i64 %x) { ; CHECK-LABEL: eq_i64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #100 // =0x64 -; CHECK-NEXT: cmp x0, #100 -; CHECK-NEXT: sub x8, x8, x0 +; CHECK-NEXT: subs x8, x8, x0 ; CHECK-NEXT: csel x0, x0, x8, eq ; CHECK-NEXT: ret %cmp = icmp eq i64 %x, 100 @@ -164,8 +157,7 @@ define i64 @ne_i64(i64 %x) { ; CHECK-LABEL: ne_i64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #100 // =0x64 -; CHECK-NEXT: cmp x0, #100 -; CHECK-NEXT: sub x8, x8, x0 +; CHECK-NEXT: subs x8, x8, x0 ; CHECK-NEXT: csel x0, x0, x8, ne ; CHECK-NEXT: ret %cmp = icmp ne i64 %x, 100 @@ -178,9 +170,8 @@ define i64 @sgt_i64(i64 %x) { ; CHECK-LABEL: sgt_i64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #100 // =0x64 -; CHECK-NEXT: cmp x0, #100 -; CHECK-NEXT: sub x8, x8, x0 -; CHECK-NEXT: csel x0, x0, x8, gt +; CHECK-NEXT: subs x8, x8, x0 +; CHECK-NEXT: csel x0, x0, x8, lt ; CHECK-NEXT: ret %cmp = icmp sgt i64 %x, 100 %sub = sub i64 100, %x @@ -206,9 +197,8 @@ define i64 @slt_i64(i64 %x) { ; CHECK-LABEL: slt_i64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #100 // =0x64 -; CHECK-NEXT: cmp x0, #100 -; CHECK-NEXT: sub x8, x8, x0 -; CHECK-NEXT: csel x0, x0, x8, lt +; CHECK-NEXT: subs x8, x8, x0 +; CHECK-NEXT: csel x0, x0, x8, gt ; CHECK-NEXT: ret %cmp = icmp slt i64 %x, 100 %sub = sub i64 100, %x @@ -234,9 +224,8 @@ define i64 @ugt_i64(i64 %x) { ; CHECK-LABEL: ugt_i64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #100 // =0x64 -; CHECK-NEXT: cmp x0, #100 -; CHECK-NEXT: sub x8, x8, x0 -; CHECK-NEXT: csel x0, x0, x8, hi +; CHECK-NEXT: subs x8, x8, x0 +; CHECK-NEXT: csel x0, x0, x8, lo ; CHECK-NEXT: ret %cmp = icmp ugt i64 %x, 100 %sub = sub i64 100, %x @@ -262,9 +251,8 @@ define i64 @ult_i64(i64 %x) { ; CHECK-LABEL: ult_i64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #100 // =0x64 -; CHECK-NEXT: cmp x0, #100 -; CHECK-NEXT: sub x8, x8, x0 -; CHECK-NEXT: csel x0, x0, x8, lo +; CHECK-NEXT: subs x8, x8, x0 +; CHECK-NEXT: csel x0, x0, x8, hi ; CHECK-NEXT: ret %cmp = icmp ult i64 %x, 100 %sub = sub i64 100, %x diff --git llvm/test/CodeGen/AArch64/extract-vector-cmp.ll llvm/test/CodeGen/AArch64/extract-vector-cmp.ll index 12bd2db2297d..8345fdfa46b4 100644 --- llvm/test/CodeGen/AArch64/extract-vector-cmp.ll +++ llvm/test/CodeGen/AArch64/extract-vector-cmp.ll @@ -58,10 +58,11 @@ define i128 @extract_icmp_v1i128(ptr %p) { ; CHECK-LABEL: extract_icmp_v1i128: ; CHECK: // %bb.0: ; CHECK-NEXT: ldp x9, x8, [x0] -; CHECK-NEXT: mov x1, xzr ; CHECK-NEXT: orr x8, x9, 
x8 ; CHECK-NEXT: cmp x8, #0 -; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: cset w8, eq +; CHECK-NEXT: sbfx x0, x8, #0, #1 +; CHECK-NEXT: mov x1, x0 ; CHECK-NEXT: ret %load = load <1 x i128>, ptr %p, align 16 %cmp = icmp eq <1 x i128> %load, zeroinitializer @@ -141,6 +142,26 @@ for.cond.cleanup: } +; TODO: Combine the sbfx(cset) into a csetm +define i32 @issue_121372(<4 x i32> %v) { +; CHECK-LABEL: issue_121372: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: cmp w8, #0 +; CHECK-NEXT: cset w8, eq +; CHECK-NEXT: sbfx w8, w8, #0, #1 +; CHECK-NEXT: cmp w8, #1 +; CHECK-NEXT: csetm w0, lt +; CHECK-NEXT: ret + %cmp_ule = icmp ule <4 x i32> %v, zeroinitializer + %sext_v4i1 = sext <4 x i1> %cmp_ule to <4 x i32> + %cmp_sge = icmp sge <4 x i32> zeroinitializer, %sext_v4i1 + %ext = extractelement <4 x i1> %cmp_sge, i32 0 + %res = sext i1 %ext to i32 + ret i32 %res +} + + ; Negative tests define i1 @extract_icmp_v4i32_splat_rhs(<4 x i32> %a, i32 %b) { @@ -163,9 +184,9 @@ define i1 @extract_icmp_v4i32_splat_rhs_mul_use(<4 x i32> %a, ptr %p) { ; CHECK-LABEL: extract_icmp_v4i32_splat_rhs_mul_use: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v1.4s, #235 -; CHECK-NEXT: adrp x9, .LCPI7_0 +; CHECK-NEXT: adrp x9, .LCPI8_0 ; CHECK-NEXT: mov x8, x0 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI7_0] +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI8_0] ; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s ; CHECK-NEXT: xtn v1.4h, v0.4s ; CHECK-NEXT: and v0.16b, v0.16b, v2.16b diff --git llvm/test/CodeGen/AArch64/fsh.ll llvm/test/CodeGen/AArch64/fsh.ll new file mode 100644 index 000000000000..b3ce00aeb36e --- /dev/null +++ llvm/test/CodeGen/AArch64/fsh.ll @@ -0,0 +1,4636 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple=aarch64-none-eabi -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI + +define i8 @rotl_i8(i8 %a, i8 %c) { +; CHECK-SD-LABEL: rotl_i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: neg w8, w1 +; CHECK-SD-NEXT: and w9, w0, #0xff +; CHECK-SD-NEXT: and w10, w1, #0x7 +; CHECK-SD-NEXT: and w8, w8, #0x7 +; CHECK-SD-NEXT: lsl w10, w0, w10 +; CHECK-SD-NEXT: lsr w8, w9, w8 +; CHECK-SD-NEXT: orr w0, w10, w8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: rotl_i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov x8, xzr +; CHECK-GI-NEXT: neg w9, w1 +; CHECK-GI-NEXT: and w10, w0, #0xff +; CHECK-GI-NEXT: sub x8, x8, w9, uxtb +; CHECK-GI-NEXT: and x9, x9, #0x7 +; CHECK-GI-NEXT: lsr w9, w10, w9 +; CHECK-GI-NEXT: and x8, x8, #0x7 +; CHECK-GI-NEXT: lsl w8, w0, w8 +; CHECK-GI-NEXT: orr w0, w9, w8 +; CHECK-GI-NEXT: ret +entry: + %d = call i8 @llvm.fshl(i8 %a, i8 %a, i8 %c) + ret i8 %d +} + +define i8 @rotr_i8(i8 %a, i8 %c) { +; CHECK-SD-LABEL: rotr_i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: neg w8, w1 +; CHECK-SD-NEXT: and w9, w0, #0xff +; CHECK-SD-NEXT: and w10, w1, #0x7 +; CHECK-SD-NEXT: and w8, w8, #0x7 +; CHECK-SD-NEXT: lsr w9, w9, w10 +; CHECK-SD-NEXT: lsl w8, w0, w8 +; CHECK-SD-NEXT: orr w0, w9, w8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: rotr_i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov x8, xzr +; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-GI-NEXT: and x9, x1, #0x7 +; CHECK-GI-NEXT: and w10, w0, #0xff +; CHECK-GI-NEXT: sub x8, x8, w1, uxtb +; CHECK-GI-NEXT: lsr w9, w10, w9 +; CHECK-GI-NEXT: and x8, x8, #0x7 +; CHECK-GI-NEXT: lsl w8, w0, w8 +; CHECK-GI-NEXT: orr 
w0, w9, w8 +; CHECK-GI-NEXT: ret +entry: + %d = call i8 @llvm.fshr(i8 %a, i8 %a, i8 %c) + ret i8 %d +} + +define i16 @rotl_i16(i16 %a, i16 %c) { +; CHECK-SD-LABEL: rotl_i16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: neg w8, w1 +; CHECK-SD-NEXT: and w9, w0, #0xffff +; CHECK-SD-NEXT: and w10, w1, #0xf +; CHECK-SD-NEXT: and w8, w8, #0xf +; CHECK-SD-NEXT: lsl w10, w0, w10 +; CHECK-SD-NEXT: lsr w8, w9, w8 +; CHECK-SD-NEXT: orr w0, w10, w8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: rotl_i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov x8, xzr +; CHECK-GI-NEXT: neg w9, w1 +; CHECK-GI-NEXT: and w10, w0, #0xffff +; CHECK-GI-NEXT: sub x8, x8, w9, uxth +; CHECK-GI-NEXT: and x9, x9, #0xf +; CHECK-GI-NEXT: lsr w9, w10, w9 +; CHECK-GI-NEXT: and x8, x8, #0xf +; CHECK-GI-NEXT: lsl w8, w0, w8 +; CHECK-GI-NEXT: orr w0, w9, w8 +; CHECK-GI-NEXT: ret +entry: + %d = call i16 @llvm.fshl(i16 %a, i16 %a, i16 %c) + ret i16 %d +} + +define i16 @rotr_i16(i16 %a, i16 %c) { +; CHECK-SD-LABEL: rotr_i16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: neg w8, w1 +; CHECK-SD-NEXT: and w9, w0, #0xffff +; CHECK-SD-NEXT: and w10, w1, #0xf +; CHECK-SD-NEXT: and w8, w8, #0xf +; CHECK-SD-NEXT: lsr w9, w9, w10 +; CHECK-SD-NEXT: lsl w8, w0, w8 +; CHECK-SD-NEXT: orr w0, w9, w8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: rotr_i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov x8, xzr +; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-GI-NEXT: and x9, x1, #0xf +; CHECK-GI-NEXT: and w10, w0, #0xffff +; CHECK-GI-NEXT: sub x8, x8, w1, uxth +; CHECK-GI-NEXT: lsr w9, w10, w9 +; CHECK-GI-NEXT: and x8, x8, #0xf +; CHECK-GI-NEXT: lsl w8, w0, w8 +; CHECK-GI-NEXT: orr w0, w9, w8 +; CHECK-GI-NEXT: ret +entry: + %d = call i16 @llvm.fshr(i16 %a, i16 %a, i16 %c) + ret i16 %d +} + +define i32 @rotl_i32(i32 %a, i32 %c) { +; CHECK-LABEL: rotl_i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: neg w8, w1 +; CHECK-NEXT: ror w0, w0, w8 +; CHECK-NEXT: ret +entry: + %d = call i32 @llvm.fshl(i32 %a, i32 %a, i32 %c) + ret i32 %d +} + +define i32 @rotr_i32(i32 %a, i32 %c) { +; CHECK-LABEL: rotr_i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ror w0, w0, w1 +; CHECK-NEXT: ret +entry: + %d = call i32 @llvm.fshr(i32 %a, i32 %a, i32 %c) + ret i32 %d +} + +define i64 @rotl_i64(i64 %a, i64 %c) { +; CHECK-SD-LABEL: rotl_i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: neg w8, w1 +; CHECK-SD-NEXT: ror x0, x0, x8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: rotl_i64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: neg x8, x1 +; CHECK-GI-NEXT: ror x0, x0, x8 +; CHECK-GI-NEXT: ret +entry: + %d = call i64 @llvm.fshl(i64 %a, i64 %a, i64 %c) + ret i64 %d +} + +define i64 @rotr_i64(i64 %a, i64 %c) { +; CHECK-LABEL: rotr_i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ror x0, x0, x1 +; CHECK-NEXT: ret +entry: + %d = call i64 @llvm.fshr(i64 %a, i64 %a, i64 %c) + ret i64 %d +} + +define i128 @rotl_i128(i128 %a, i128 %c) { +; CHECK-SD-LABEL: rotl_i128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: tst x2, #0x40 +; CHECK-SD-NEXT: mvn w12, w2 +; CHECK-SD-NEXT: csel x8, x1, x0, ne +; CHECK-SD-NEXT: csel x9, x0, x1, ne +; CHECK-SD-NEXT: lsr x10, x9, #1 +; CHECK-SD-NEXT: lsr x11, x8, #1 +; CHECK-SD-NEXT: lsl x8, x8, x2 +; CHECK-SD-NEXT: lsl x9, x9, x2 +; CHECK-SD-NEXT: lsr x10, x10, x12 +; CHECK-SD-NEXT: lsr x11, x11, x12 +; CHECK-SD-NEXT: orr x0, x8, x10 +; CHECK-SD-NEXT: orr x1, x9, x11 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: rotl_i128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov w8, #64 // =0x40 +; 
CHECK-GI-NEXT: and x9, x2, #0x7f +; CHECK-GI-NEXT: neg x10, x2 +; CHECK-GI-NEXT: sub x12, x8, x9 +; CHECK-GI-NEXT: sub x11, x9, #64 +; CHECK-GI-NEXT: lsl x14, x1, x9 +; CHECK-GI-NEXT: lsr x12, x0, x12 +; CHECK-GI-NEXT: lsl x13, x0, x9 +; CHECK-GI-NEXT: cmp x9, #64 +; CHECK-GI-NEXT: and x9, x10, #0x7f +; CHECK-GI-NEXT: lsl x11, x0, x11 +; CHECK-GI-NEXT: orr x12, x12, x14 +; CHECK-GI-NEXT: sub x8, x8, x9 +; CHECK-GI-NEXT: sub x14, x9, #64 +; CHECK-GI-NEXT: csel x11, x12, x11, lo +; CHECK-GI-NEXT: lsr x12, x0, x9 +; CHECK-GI-NEXT: lsl x8, x1, x8 +; CHECK-GI-NEXT: csel x13, x13, xzr, lo +; CHECK-GI-NEXT: tst x2, #0x7f +; CHECK-GI-NEXT: lsr x14, x1, x14 +; CHECK-GI-NEXT: csel x11, x1, x11, eq +; CHECK-GI-NEXT: orr x8, x12, x8 +; CHECK-GI-NEXT: cmp x9, #64 +; CHECK-GI-NEXT: lsr x12, x1, x9 +; CHECK-GI-NEXT: csel x8, x8, x14, lo +; CHECK-GI-NEXT: tst x10, #0x7f +; CHECK-GI-NEXT: csel x8, x0, x8, eq +; CHECK-GI-NEXT: cmp x9, #64 +; CHECK-GI-NEXT: csel x9, x12, xzr, lo +; CHECK-GI-NEXT: orr x0, x13, x8 +; CHECK-GI-NEXT: orr x1, x11, x9 +; CHECK-GI-NEXT: ret +entry: + %d = call i128 @llvm.fshl(i128 %a, i128 %a, i128 %c) + ret i128 %d +} + +define i128 @rotr_i128(i128 %a, i128 %c) { +; CHECK-SD-LABEL: rotr_i128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: tst x2, #0x40 +; CHECK-SD-NEXT: mvn w12, w2 +; CHECK-SD-NEXT: csel x8, x0, x1, eq +; CHECK-SD-NEXT: csel x9, x1, x0, eq +; CHECK-SD-NEXT: lsl x10, x9, #1 +; CHECK-SD-NEXT: lsl x11, x8, #1 +; CHECK-SD-NEXT: lsr x8, x8, x2 +; CHECK-SD-NEXT: lsr x9, x9, x2 +; CHECK-SD-NEXT: lsl x10, x10, x12 +; CHECK-SD-NEXT: lsl x11, x11, x12 +; CHECK-SD-NEXT: orr x0, x10, x8 +; CHECK-SD-NEXT: orr x1, x11, x9 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: rotr_i128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov w8, #64 // =0x40 +; CHECK-GI-NEXT: and x9, x2, #0x7f +; CHECK-GI-NEXT: neg x13, x2 +; CHECK-GI-NEXT: sub x10, x8, x9 +; CHECK-GI-NEXT: sub x11, x9, #64 +; CHECK-GI-NEXT: lsr x12, x0, x9 +; CHECK-GI-NEXT: lsl x10, x1, x10 +; CHECK-GI-NEXT: lsr x11, x1, x11 +; CHECK-GI-NEXT: and x14, x13, #0x7f +; CHECK-GI-NEXT: cmp x9, #64 +; CHECK-GI-NEXT: sub x8, x8, x14 +; CHECK-GI-NEXT: lsl x15, x1, x14 +; CHECK-GI-NEXT: orr x10, x12, x10 +; CHECK-GI-NEXT: lsr x12, x1, x9 +; CHECK-GI-NEXT: lsr x8, x0, x8 +; CHECK-GI-NEXT: csel x10, x10, x11, lo +; CHECK-GI-NEXT: sub x11, x14, #64 +; CHECK-GI-NEXT: tst x2, #0x7f +; CHECK-GI-NEXT: csel x10, x0, x10, eq +; CHECK-GI-NEXT: cmp x9, #64 +; CHECK-GI-NEXT: lsl x9, x0, x14 +; CHECK-GI-NEXT: lsl x11, x0, x11 +; CHECK-GI-NEXT: csel x12, x12, xzr, lo +; CHECK-GI-NEXT: orr x8, x8, x15 +; CHECK-GI-NEXT: cmp x14, #64 +; CHECK-GI-NEXT: csel x9, x9, xzr, lo +; CHECK-GI-NEXT: csel x8, x8, x11, lo +; CHECK-GI-NEXT: tst x13, #0x7f +; CHECK-GI-NEXT: csel x8, x1, x8, eq +; CHECK-GI-NEXT: orr x0, x10, x9 +; CHECK-GI-NEXT: orr x1, x12, x8 +; CHECK-GI-NEXT: ret +entry: + %d = call i128 @llvm.fshr(i128 %a, i128 %a, i128 %c) + ret i128 %d +} + +define i8 @fshl_i8(i8 %a, i8 %b, i8 %c) { +; CHECK-SD-LABEL: fshl_i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: and w8, w2, #0x7 +; CHECK-SD-NEXT: ubfiz w9, w1, #23, #8 +; CHECK-SD-NEXT: mvn w10, w8 +; CHECK-SD-NEXT: lsl w8, w0, w8 +; CHECK-SD-NEXT: lsr w9, w9, w10 +; CHECK-SD-NEXT: orr w0, w8, w9 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fshl_i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov w8, #7 // =0x7 +; CHECK-GI-NEXT: ubfx w9, w1, #1, #7 +; CHECK-GI-NEXT: and w10, w2, #0x7 +; CHECK-GI-NEXT: bic w8, w8, w2 +; CHECK-GI-NEXT: lsl w10, w0, w10 +; CHECK-GI-NEXT: lsr w8, w9, w8 
+; CHECK-GI-NEXT: orr w0, w10, w8
+; CHECK-GI-NEXT: ret
+entry:
+  %d = call i8 @llvm.fshl(i8 %a, i8 %b, i8 %c)
+  ret i8 %d
+}
+
+define i8 @fshr_i8(i8 %a, i8 %b, i8 %c) {
+; CHECK-SD-LABEL: fshr_i8:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: mov w8, #24 // =0x18
+; CHECK-SD-NEXT: lsl w9, w1, #24
+; CHECK-SD-NEXT: mvn w10, w2
+; CHECK-SD-NEXT: bfxil w8, w2, #0, #3
+; CHECK-SD-NEXT: lsl w11, w0, #1
+; CHECK-SD-NEXT: and x10, x10, #0x7
+; CHECK-SD-NEXT: lsr w8, w9, w8
+; CHECK-SD-NEXT: lsl w9, w11, w10
+; CHECK-SD-NEXT: orr w0, w9, w8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fshr_i8:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov w8, #7 // =0x7
+; CHECK-GI-NEXT: lsl w9, w0, #1
+; CHECK-GI-NEXT: and w10, w2, #0x7
+; CHECK-GI-NEXT: bic w8, w8, w2
+; CHECK-GI-NEXT: and w11, w1, #0xff
+; CHECK-GI-NEXT: lsl w8, w9, w8
+; CHECK-GI-NEXT: lsr w9, w11, w10
+; CHECK-GI-NEXT: orr w0, w8, w9
+; CHECK-GI-NEXT: ret
+entry:
+  %d = call i8 @llvm.fshr(i8 %a, i8 %b, i8 %c)
+  ret i8 %d
+}
+
+define i16 @fshl_i16(i16 %a, i16 %b, i16 %c) {
+; CHECK-SD-LABEL: fshl_i16:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: and w8, w2, #0xf
+; CHECK-SD-NEXT: ubfiz w9, w1, #15, #16
+; CHECK-SD-NEXT: mvn w10, w8
+; CHECK-SD-NEXT: lsl w8, w0, w8
+; CHECK-SD-NEXT: lsr w9, w9, w10
+; CHECK-SD-NEXT: orr w0, w8, w9
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fshl_i16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov w8, #15 // =0xf
+; CHECK-GI-NEXT: ubfx w9, w1, #1, #15
+; CHECK-GI-NEXT: and w10, w2, #0xf
+; CHECK-GI-NEXT: bic w8, w8, w2
+; CHECK-GI-NEXT: lsl w10, w0, w10
+; CHECK-GI-NEXT: lsr w8, w9, w8
+; CHECK-GI-NEXT: orr w0, w10, w8
+; CHECK-GI-NEXT: ret
+entry:
+  %d = call i16 @llvm.fshl(i16 %a, i16 %b, i16 %c)
+  ret i16 %d
+}
+
+define i16 @fshr_i16(i16 %a, i16 %b, i16 %c) {
+; CHECK-SD-LABEL: fshr_i16:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: mov w8, #16 // =0x10
+; CHECK-SD-NEXT: lsl w9, w1, #16
+; CHECK-SD-NEXT: mvn w10, w2
+; CHECK-SD-NEXT: bfxil w8, w2, #0, #4
+; CHECK-SD-NEXT: lsl w11, w0, #1
+; CHECK-SD-NEXT: and x10, x10, #0xf
+; CHECK-SD-NEXT: lsr w8, w9, w8
+; CHECK-SD-NEXT: lsl w9, w11, w10
+; CHECK-SD-NEXT: orr w0, w9, w8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fshr_i16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov w8, #15 // =0xf
+; CHECK-GI-NEXT: lsl w9, w0, #1
+; CHECK-GI-NEXT: and w10, w2, #0xf
+; CHECK-GI-NEXT: bic w8, w8, w2
+; CHECK-GI-NEXT: and w11, w1, #0xffff
+; CHECK-GI-NEXT: lsl w8, w9, w8
+; CHECK-GI-NEXT: lsr w9, w11, w10
+; CHECK-GI-NEXT: orr w0, w8, w9
+; CHECK-GI-NEXT: ret
+entry:
+  %d = call i16 @llvm.fshr(i16 %a, i16 %b, i16 %c)
+  ret i16 %d
+}
+
+define i32 @fshl_i32(i32 %a, i32 %b, i32 %c) {
+; CHECK-SD-LABEL: fshl_i32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: lsr w8, w1, #1
+; CHECK-SD-NEXT: // kill: def $w2 killed $w2 def $x2
+; CHECK-SD-NEXT: mvn w9, w2
+; CHECK-SD-NEXT: lsl w10, w0, w2
+; CHECK-SD-NEXT: lsr w8, w8, w9
+; CHECK-SD-NEXT: orr w0, w10, w8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fshl_i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov w8, #31 // =0x1f
+; CHECK-GI-NEXT: lsr w9, w1, #1
+; CHECK-GI-NEXT: and w10, w2, #0x1f
+; CHECK-GI-NEXT: bic w8, w8, w2
+; CHECK-GI-NEXT: lsl w10, w0, w10
+; CHECK-GI-NEXT: lsr w8, w9, w8
+; CHECK-GI-NEXT: orr w0, w10, w8
+; CHECK-GI-NEXT: ret
+entry:
+  %d = call i32 @llvm.fshl(i32 %a, i32 %b, i32 %c)
+  ret i32 %d
+}
+
+define i32 @fshr_i32(i32 %a, i32 %b, i32 %c) {
+; CHECK-SD-LABEL: fshr_i32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: lsl w8, w0, #1
+; CHECK-SD-NEXT: // kill: def $w2 killed $w2 def $x2
+; CHECK-SD-NEXT: mvn w9, w2
+; CHECK-SD-NEXT: lsr w10, w1, w2
+; CHECK-SD-NEXT: lsl w8, w8, w9
+; CHECK-SD-NEXT: orr w0, w8, w10
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fshr_i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov w8, #31 // =0x1f
+; CHECK-GI-NEXT: lsl w9, w0, #1
+; CHECK-GI-NEXT: and w10, w2, #0x1f
+; CHECK-GI-NEXT: bic w8, w8, w2
+; CHECK-GI-NEXT: lsl w8, w9, w8
+; CHECK-GI-NEXT: lsr w9, w1, w10
+; CHECK-GI-NEXT: orr w0, w8, w9
+; CHECK-GI-NEXT: ret
+entry:
+  %d = call i32 @llvm.fshr(i32 %a, i32 %b, i32 %c)
+  ret i32 %d
+}
+
+define i64 @fshl_i64(i64 %a, i64 %b, i64 %c) {
+; CHECK-SD-LABEL: fshl_i64:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: lsr x8, x1, #1
+; CHECK-SD-NEXT: mvn w9, w2
+; CHECK-SD-NEXT: lsl x10, x0, x2
+; CHECK-SD-NEXT: lsr x8, x8, x9
+; CHECK-SD-NEXT: orr x0, x10, x8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fshl_i64:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov w8, #63 // =0x3f
+; CHECK-GI-NEXT: lsr x9, x1, #1
+; CHECK-GI-NEXT: and x10, x2, #0x3f
+; CHECK-GI-NEXT: bic x8, x8, x2
+; CHECK-GI-NEXT: lsl x10, x0, x10
+; CHECK-GI-NEXT: lsr x8, x9, x8
+; CHECK-GI-NEXT: orr x0, x10, x8
+; CHECK-GI-NEXT: ret
+entry:
+  %d = call i64 @llvm.fshl(i64 %a, i64 %b, i64 %c)
+  ret i64 %d
+}
+
+define i64 @fshr_i64(i64 %a, i64 %b, i64 %c) {
+; CHECK-SD-LABEL: fshr_i64:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: lsl x8, x0, #1
+; CHECK-SD-NEXT: mvn w9, w2
+; CHECK-SD-NEXT: lsr x10, x1, x2
+; CHECK-SD-NEXT: lsl x8, x8, x9
+; CHECK-SD-NEXT: orr x0, x8, x10
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fshr_i64:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov w8, #63 // =0x3f
+; CHECK-GI-NEXT: lsl x9, x0, #1
+; CHECK-GI-NEXT: and x10, x2, #0x3f
+; CHECK-GI-NEXT: bic x8, x8, x2
+; CHECK-GI-NEXT: lsl x8, x9, x8
+; CHECK-GI-NEXT: lsr x9, x1, x10
+; CHECK-GI-NEXT: orr x0, x8, x9
+; CHECK-GI-NEXT: ret
+entry:
+  %d = call i64 @llvm.fshr(i64 %a, i64 %b, i64 %c)
+  ret i64 %d
+}
+
+define i128 @fshl_i128(i128 %a, i128 %b, i128 %c) {
+; CHECK-SD-LABEL: fshl_i128:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: tst x4, #0x40
+; CHECK-SD-NEXT: mvn w11, w4
+; CHECK-SD-NEXT: csel x8, x3, x0, ne
+; CHECK-SD-NEXT: csel x9, x2, x3, ne
+; CHECK-SD-NEXT: csel x12, x0, x1, ne
+; CHECK-SD-NEXT: lsr x9, x9, #1
+; CHECK-SD-NEXT: lsr x10, x8, #1
+; CHECK-SD-NEXT: lsl x8, x8, x4
+; CHECK-SD-NEXT: lsl x12, x12, x4
+; CHECK-SD-NEXT: lsr x9, x9, x11
+; CHECK-SD-NEXT: lsr x10, x10, x11
+; CHECK-SD-NEXT: orr x0, x8, x9
+; CHECK-SD-NEXT: orr x1, x12, x10
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fshl_i128:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: and x9, x4, #0x7f
+; CHECK-GI-NEXT: mov w10, #64 // =0x40
+; CHECK-GI-NEXT: lsl x14, x3, #63
+; CHECK-GI-NEXT: sub x12, x10, x9
+; CHECK-GI-NEXT: lsl x13, x1, x9
+; CHECK-GI-NEXT: mov w8, #127 // =0x7f
+; CHECK-GI-NEXT: lsr x12, x0, x12
+; CHECK-GI-NEXT: bic x8, x8, x4
+; CHECK-GI-NEXT: sub x15, x9, #64
+; CHECK-GI-NEXT: cmp x9, #64
+; CHECK-GI-NEXT: lsl x9, x0, x9
+; CHECK-GI-NEXT: lsl x15, x0, x15
+; CHECK-GI-NEXT: orr x12, x12, x13
+; CHECK-GI-NEXT: orr x13, x14, x2, lsr #1
+; CHECK-GI-NEXT: lsr x14, x3, #1
+; CHECK-GI-NEXT: sub x10, x10, x8
+; CHECK-GI-NEXT: sub x16, x8, #64
+; CHECK-GI-NEXT: csel x9, x9, xzr, lo
+; CHECK-GI-NEXT: lsr x17, x13, x8
+; CHECK-GI-NEXT: lsl x10, x14, x10
+; CHECK-GI-NEXT: csel x12, x12, x15, lo
+; CHECK-GI-NEXT: tst x4, #0x7f
+; CHECK-GI-NEXT: lsr x15, x14, x16
+; CHECK-GI-NEXT: mvn x11, x4
+; CHECK-GI-NEXT: csel x12, x1, x12, eq
+; CHECK-GI-NEXT: orr x10, x17, x10
+; CHECK-GI-NEXT: cmp x8, #64
+; CHECK-GI-NEXT: lsr x14, x14, x8
+; CHECK-GI-NEXT: csel x10, x10, x15, lo
+; CHECK-GI-NEXT: tst x11, #0x7f
+; CHECK-GI-NEXT: csel x10, x13, x10, eq
+; CHECK-GI-NEXT: cmp x8, #64
+; CHECK-GI-NEXT: csel x8, x14, xzr, lo
+; CHECK-GI-NEXT: orr x0, x9, x10
+; CHECK-GI-NEXT: orr x1, x12, x8
+; CHECK-GI-NEXT: ret
+entry:
+  %d = call i128 @llvm.fshl(i128 %a, i128 %b, i128 %c)
+  ret i128 %d
+}
+
+define i128 @fshr_i128(i128 %a, i128 %b, i128 %c) {
+; CHECK-SD-LABEL: fshr_i128:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: tst x4, #0x40
+; CHECK-SD-NEXT: mvn w12, w4
+; CHECK-SD-NEXT: csel x8, x3, x0, eq
+; CHECK-SD-NEXT: csel x9, x0, x1, eq
+; CHECK-SD-NEXT: csel x10, x2, x3, eq
+; CHECK-SD-NEXT: lsl x11, x8, #1
+; CHECK-SD-NEXT: lsl x9, x9, #1
+; CHECK-SD-NEXT: lsr x10, x10, x4
+; CHECK-SD-NEXT: lsr x8, x8, x4
+; CHECK-SD-NEXT: lsl x11, x11, x12
+; CHECK-SD-NEXT: lsl x9, x9, x12
+; CHECK-SD-NEXT: orr x0, x11, x10
+; CHECK-SD-NEXT: orr x1, x9, x8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fshr_i128:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: lsr x8, x0, #63
+; CHECK-GI-NEXT: mov w9, #127 // =0x7f
+; CHECK-GI-NEXT: mov w10, #64 // =0x40
+; CHECK-GI-NEXT: bic x9, x9, x4
+; CHECK-GI-NEXT: lsl x11, x0, #1
+; CHECK-GI-NEXT: and x12, x4, #0x7f
+; CHECK-GI-NEXT: orr x8, x8, x1, lsl #1
+; CHECK-GI-NEXT: sub x14, x10, x9
+; CHECK-GI-NEXT: sub x17, x9, #64
+; CHECK-GI-NEXT: lsl x15, x11, x9
+; CHECK-GI-NEXT: lsr x14, x11, x14
+; CHECK-GI-NEXT: cmp x9, #64
+; CHECK-GI-NEXT: lsl x16, x8, x9
+; CHECK-GI-NEXT: sub x9, x10, x12
+; CHECK-GI-NEXT: lsl x10, x11, x17
+; CHECK-GI-NEXT: mvn x13, x4
+; CHECK-GI-NEXT: csel x11, x15, xzr, lo
+; CHECK-GI-NEXT: sub x15, x12, #64
+; CHECK-GI-NEXT: orr x14, x14, x16
+; CHECK-GI-NEXT: lsr x16, x2, x12
+; CHECK-GI-NEXT: lsl x9, x3, x9
+; CHECK-GI-NEXT: csel x10, x14, x10, lo
+; CHECK-GI-NEXT: tst x13, #0x7f
+; CHECK-GI-NEXT: lsr x13, x3, x15
+; CHECK-GI-NEXT: csel x8, x8, x10, eq
+; CHECK-GI-NEXT: orr x9, x16, x9
+; CHECK-GI-NEXT: cmp x12, #64
+; CHECK-GI-NEXT: lsr x10, x3, x12
+; CHECK-GI-NEXT: csel x9, x9, x13, lo
+; CHECK-GI-NEXT: tst x4, #0x7f
+; CHECK-GI-NEXT: csel x9, x2, x9, eq
+; CHECK-GI-NEXT: cmp x12, #64
+; CHECK-GI-NEXT: csel x10, x10, xzr, lo
+; CHECK-GI-NEXT: orr x0, x11, x9
+; CHECK-GI-NEXT: orr x1, x8, x10
+; CHECK-GI-NEXT: ret
+entry:
+  %d = call i128 @llvm.fshr(i128 %a, i128 %b, i128 %c)
+  ret i128 %d
+}
+
+define i8 @rotl_i8_c(i8 %a) {
+; CHECK-LABEL: rotl_i8_c:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ubfx w8, w0, #5, #3
+; CHECK-NEXT: orr w0, w8, w0, lsl #3
+; CHECK-NEXT: ret
+entry:
+  %d = call i8 @llvm.fshl(i8 %a, i8 %a, i8 3)
+  ret i8 %d
+}
+
+define i8 @rotr_i8_c(i8 %a) {
+; CHECK-SD-LABEL: rotr_i8_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: lsl w8, w0, #5
+; CHECK-SD-NEXT: bfxil w8, w0, #3, #5
+; CHECK-SD-NEXT: mov w0, w8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rotr_i8_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ubfx w8, w0, #3, #5
+; CHECK-GI-NEXT: orr w0, w8, w0, lsl #5
+; CHECK-GI-NEXT: ret
+entry:
+  %d = call i8 @llvm.fshr(i8 %a, i8 %a, i8 3)
+  ret i8 %d
+}
+
+define i16 @rotl_i16_c(i16 %a) {
+; CHECK-LABEL: rotl_i16_c:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ubfx w8, w0, #13, #3
+; CHECK-NEXT: orr w0, w8, w0, lsl #3
+; CHECK-NEXT: ret
+entry:
+  %d = call i16 @llvm.fshl(i16 %a, i16 %a, i16 3)
+  ret i16 %d
+}
+
+define i16 @rotr_i16_c(i16 %a) {
+; CHECK-SD-LABEL: rotr_i16_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: lsl w8, w0, #13
+; CHECK-SD-NEXT: bfxil w8, w0, #3, #13
+; CHECK-SD-NEXT: mov w0, w8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rotr_i16_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ubfx w8, w0, #3, #13
+; CHECK-GI-NEXT: orr w0, w8, w0, lsl #13
+; CHECK-GI-NEXT: ret
+entry:
+  %d = call i16 @llvm.fshr(i16 %a, i16 %a, i16 3)
+  ret i16 %d
+}
+
+define i32 @rotl_i32_c(i32 %a) {
+; CHECK-LABEL: rotl_i32_c:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ror w0, w0, #29
+; CHECK-NEXT: ret
+entry:
+  %d = call i32 @llvm.fshl(i32 %a, i32 %a, i32 3)
+  ret i32 %d
+}
+
+define i32 @rotr_i32_c(i32 %a) {
+; CHECK-LABEL: rotr_i32_c:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ror w0, w0, #3
+; CHECK-NEXT: ret
+entry:
+  %d = call i32 @llvm.fshr(i32 %a, i32 %a, i32 3)
+  ret i32 %d
+}
+
+define i64 @rotl_i64_c(i64 %a) {
+; CHECK-LABEL: rotl_i64_c:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ror x0, x0, #61
+; CHECK-NEXT: ret
+entry:
+  %d = call i64 @llvm.fshl(i64 %a, i64 %a, i64 3)
+  ret i64 %d
+}
+
+define i64 @rotr_i64_c(i64 %a) {
+; CHECK-LABEL: rotr_i64_c:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ror x0, x0, #3
+; CHECK-NEXT: ret
+entry:
+  %d = call i64 @llvm.fshr(i64 %a, i64 %a, i64 3)
+  ret i64 %d
+}
+
+define i128 @rotl_i128_c(i128 %a) {
+; CHECK-SD-LABEL: rotl_i128_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: extr x8, x0, x1, #61
+; CHECK-SD-NEXT: extr x1, x1, x0, #61
+; CHECK-SD-NEXT: mov x0, x8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rotl_i128_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: lsr x8, x0, #61
+; CHECK-GI-NEXT: lsr x9, x1, #61
+; CHECK-GI-NEXT: orr x1, x8, x1, lsl #3
+; CHECK-GI-NEXT: orr x0, x9, x0, lsl #3
+; CHECK-GI-NEXT: ret
+entry:
+  %d = call i128 @llvm.fshl(i128 %a, i128 %a, i128 3)
+  ret i128 %d
+}
+
+define i128 @rotr_i128_c(i128 %a) {
+; CHECK-SD-LABEL: rotr_i128_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: extr x8, x1, x0, #3
+; CHECK-SD-NEXT: extr x1, x0, x1, #3
+; CHECK-SD-NEXT: mov x0, x8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rotr_i128_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: lsl x8, x1, #61
+; CHECK-GI-NEXT: lsl x9, x0, #61
+; CHECK-GI-NEXT: orr x0, x8, x0, lsr #3
+; CHECK-GI-NEXT: orr x1, x9, x1, lsr #3
+; CHECK-GI-NEXT: ret
+entry:
+  %d = call i128 @llvm.fshr(i128 %a, i128 %a, i128 3)
+  ret i128 %d
+}
+
+define i8 @fshl_i8_c(i8 %a, i8 %b) {
+; CHECK-SD-LABEL: fshl_i8_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: lsl w8, w1, #24
+; CHECK-SD-NEXT: extr w0, w0, w8, #29
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fshl_i8_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ubfx w8, w1, #5, #3
+; CHECK-GI-NEXT: orr w0, w8, w0, lsl #3
+; CHECK-GI-NEXT: ret
+entry:
+  %d = call i8 @llvm.fshl(i8 %a, i8 %b, i8 3)
+  ret i8 %d
+}
+
+define i8 @fshr_i8_c(i8 %a, i8 %b) {
+; CHECK-SD-LABEL: fshr_i8_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: lsl w8, w1, #24
+; CHECK-SD-NEXT: extr w0, w0, w8, #27
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fshr_i8_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ubfx w8, w1, #3, #5
+; CHECK-GI-NEXT: orr w0, w8, w0, lsl #5
+; CHECK-GI-NEXT: ret
+entry:
+  %d = call i8 @llvm.fshr(i8 %a, i8 %b, i8 3)
+  ret i8 %d
+}
+
+define i16 @fshl_i16_c(i16 %a, i16 %b) {
+; CHECK-SD-LABEL: fshl_i16_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: lsl w8, w1, #16
+; CHECK-SD-NEXT: extr w0, w0, w8, #29
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fshl_i16_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ubfx w8, w1, #13, #3
+; CHECK-GI-NEXT: orr w0, w8, w0, lsl #3
+; CHECK-GI-NEXT: ret
+entry:
+  %d = call i16 @llvm.fshl(i16 %a, i16 %b, i16 3)
+  ret i16 %d
+}
+
+define i16 @fshr_i16_c(i16 %a, i16 %b) {
+; CHECK-SD-LABEL: fshr_i16_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: lsl w8, w1, #16
+; CHECK-SD-NEXT: extr w0, w0, w8, #19
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fshr_i16_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ubfx w8, w1, #3, #13
+; CHECK-GI-NEXT: orr w0, w8, w0, lsl #13
+; CHECK-GI-NEXT: ret
+entry:
+  %d = call i16 @llvm.fshr(i16 %a, i16 %b, i16 3)
+  ret i16 %d
+}
+
+define i32 @fshl_i32_c(i32 %a, i32 %b) {
+; CHECK-LABEL: fshl_i32_c:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: extr w0, w0, w1, #29
+; CHECK-NEXT: ret
+entry:
+  %d = call i32 @llvm.fshl(i32 %a, i32 %b, i32 3)
+  ret i32 %d
+}
+
+define i32 @fshr_i32_c(i32 %a, i32 %b) {
+; CHECK-LABEL: fshr_i32_c:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: extr w0, w0, w1, #3
+; CHECK-NEXT: ret
+entry:
+  %d = call i32 @llvm.fshr(i32 %a, i32 %b, i32 3)
+  ret i32 %d
+}
+
+define i64 @fshl_i64_c(i64 %a, i64 %b) {
+; CHECK-LABEL: fshl_i64_c:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: extr x0, x0, x1, #61
+; CHECK-NEXT: ret
+entry:
+  %d = call i64 @llvm.fshl(i64 %a, i64 %b, i64 3)
+  ret i64 %d
+}
+
+define i64 @fshr_i64_c(i64 %a, i64 %b) {
+; CHECK-LABEL: fshr_i64_c:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: extr x0, x0, x1, #3
+; CHECK-NEXT: ret
+entry:
+  %d = call i64 @llvm.fshr(i64 %a, i64 %b, i64 3)
+  ret i64 %d
+}
+
+define i128 @fshl_i128_c(i128 %a, i128 %b) {
+; CHECK-SD-LABEL: fshl_i128_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: extr x8, x0, x3, #61
+; CHECK-SD-NEXT: extr x1, x1, x0, #61
+; CHECK-SD-NEXT: mov x0, x8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fshl_i128_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: lsr x8, x0, #61
+; CHECK-GI-NEXT: lsr x9, x3, #61
+; CHECK-GI-NEXT: orr x1, x8, x1, lsl #3
+; CHECK-GI-NEXT: orr x0, x9, x0, lsl #3
+; CHECK-GI-NEXT: ret
+entry:
+  %d = call i128 @llvm.fshl(i128 %a, i128 %b, i128 3)
+  ret i128 %d
+}
+
+define i128 @fshr_i128_c(i128 %a, i128 %b) {
+; CHECK-SD-LABEL: fshr_i128_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: extr x8, x3, x2, #3
+; CHECK-SD-NEXT: extr x1, x0, x3, #3
+; CHECK-SD-NEXT: mov x0, x8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fshr_i128_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: lsl x8, x3, #61
+; CHECK-GI-NEXT: lsr x9, x3, #3
+; CHECK-GI-NEXT: orr x8, x8, x2, lsr #3
+; CHECK-GI-NEXT: orr x1, x9, x0, lsl #61
+; CHECK-GI-NEXT: mov x0, x8
+; CHECK-GI-NEXT: ret
+entry:
+  %d = call i128 @llvm.fshr(i128 %a, i128 %b, i128 3)
+  ret i128 %d
+}
+
+define <8 x i8> @rotl_v8i8(<8 x i8> %a, <8 x i8> %c) {
+; CHECK-LABEL: rotl_v8i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: movi v2.8b, #7
+; CHECK-NEXT: neg v3.8b, v1.8b
+; CHECK-NEXT: and v3.8b, v3.8b, v2.8b
+; CHECK-NEXT: and v1.8b, v1.8b, v2.8b
+; CHECK-NEXT: neg v2.8b, v3.8b
+; CHECK-NEXT: ushl v1.8b, v0.8b, v1.8b
+; CHECK-NEXT: ushl v0.8b, v0.8b, v2.8b
+; CHECK-NEXT: orr v0.8b, v1.8b, v0.8b
+; CHECK-NEXT: ret
+entry:
+  %d = call <8 x i8> @llvm.fshl(<8 x i8> %a, <8 x i8> %a, <8 x i8> %c)
+  ret <8 x i8> %d
+}
+
+define <8 x i8> @rotr_v8i8(<8 x i8> %a, <8 x i8> %c) {
+; CHECK-SD-LABEL: rotr_v8i8:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: movi v2.8b, #7
+; CHECK-SD-NEXT: neg v3.8b, v1.8b
+; CHECK-SD-NEXT: and v1.8b, v1.8b, v2.8b
+; CHECK-SD-NEXT: and v2.8b, v3.8b, v2.8b
+; CHECK-SD-NEXT: neg v1.8b, v1.8b
+; CHECK-SD-NEXT: ushl v2.8b, v0.8b, v2.8b
+; CHECK-SD-NEXT: ushl v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT: orr v0.8b, v0.8b, v2.8b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rotr_v8i8:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: movi v2.8b, #7
+; CHECK-GI-NEXT: neg v3.8b, v1.8b
+; CHECK-GI-NEXT: and v1.8b, v1.8b, v2.8b
+; CHECK-GI-NEXT: and v2.8b, v3.8b, v2.8b
+; CHECK-GI-NEXT: neg v1.8b, v1.8b
+; CHECK-GI-NEXT: ushl v1.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT: ushl v0.8b, v0.8b, v2.8b
+; CHECK-GI-NEXT: orr v0.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT: ret
+entry:
+  %d = call <8 x i8> @llvm.fshr(<8 x i8> %a, <8 x i8> %a, <8 x i8> %c)
+  ret <8 x i8> %d
+}
+
+define <16 x i8> @rotl_v16i8(<16 x i8> %a, <16 x i8> %c) {
+; CHECK-LABEL: rotl_v16i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: movi v2.16b, #7
+; CHECK-NEXT: neg v3.16b, v1.16b
+; CHECK-NEXT: and v3.16b, v3.16b, v2.16b
+; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT: neg v2.16b, v3.16b
+; CHECK-NEXT: ushl v1.16b, v0.16b, v1.16b
+; CHECK-NEXT: ushl v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: ret
+entry:
+  %d = call <16 x i8> @llvm.fshl(<16 x i8> %a, <16 x i8> %a, <16 x i8> %c)
+  ret <16 x i8> %d
+}
+
+define <16 x i8> @rotr_v16i8(<16 x i8> %a, <16 x i8> %c) {
+; CHECK-SD-LABEL: rotr_v16i8:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: movi v2.16b, #7
+; CHECK-SD-NEXT: neg v3.16b, v1.16b
+; CHECK-SD-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT: and v2.16b, v3.16b, v2.16b
+; CHECK-SD-NEXT: neg v1.16b, v1.16b
+; CHECK-SD-NEXT: ushl v2.16b, v0.16b, v2.16b
+; CHECK-SD-NEXT: ushl v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rotr_v16i8:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: movi v2.16b, #7
+; CHECK-GI-NEXT: neg v3.16b, v1.16b
+; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT: and v2.16b, v3.16b, v2.16b
+; CHECK-GI-NEXT: neg v1.16b, v1.16b
+; CHECK-GI-NEXT: ushl v1.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ushl v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT: ret
+entry:
+  %d = call <16 x i8> @llvm.fshr(<16 x i8> %a, <16 x i8> %a, <16 x i8> %c)
+  ret <16 x i8> %d
+}
+
+define <4 x i16> @rotl_v4i16(<4 x i16> %a, <4 x i16> %c) {
+; CHECK-LABEL: rotl_v4i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: movi v2.4h, #15
+; CHECK-NEXT: neg v3.4h, v1.4h
+; CHECK-NEXT: and v3.8b, v3.8b, v2.8b
+; CHECK-NEXT: and v1.8b, v1.8b, v2.8b
+; CHECK-NEXT: neg v2.4h, v3.4h
+; CHECK-NEXT: ushl v1.4h, v0.4h, v1.4h
+; CHECK-NEXT: ushl v0.4h, v0.4h, v2.4h
+; CHECK-NEXT: orr v0.8b, v1.8b, v0.8b
+; CHECK-NEXT: ret
+entry:
+  %d = call <4 x i16> @llvm.fshl(<4 x i16> %a, <4 x i16> %a, <4 x i16> %c)
+  ret <4 x i16> %d
+}
+
+define <4 x i16> @rotr_v4i16(<4 x i16> %a, <4 x i16> %c) {
+; CHECK-SD-LABEL: rotr_v4i16:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: movi v2.4h, #15
+; CHECK-SD-NEXT: neg v3.4h, v1.4h
+; CHECK-SD-NEXT: and v1.8b, v1.8b, v2.8b
+; CHECK-SD-NEXT: and v2.8b, v3.8b, v2.8b
+; CHECK-SD-NEXT: neg v1.4h, v1.4h
+; CHECK-SD-NEXT: ushl v2.4h, v0.4h, v2.4h
+; CHECK-SD-NEXT: ushl v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT: orr v0.8b, v0.8b, v2.8b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rotr_v4i16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: movi v2.4h, #15
+; CHECK-GI-NEXT: neg v3.4h, v1.4h
+; CHECK-GI-NEXT: and v1.8b, v1.8b, v2.8b
+; CHECK-GI-NEXT: and v2.8b, v3.8b, v2.8b
+; CHECK-GI-NEXT: neg v1.4h, v1.4h
+; CHECK-GI-NEXT: ushl v1.4h, v0.4h, v1.4h
+; CHECK-GI-NEXT: ushl v0.4h, v0.4h, v2.4h
+; CHECK-GI-NEXT: orr v0.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT: ret
+entry:
+  %d = call <4 x i16> @llvm.fshr(<4 x i16> %a, <4 x i16> %a, <4 x i16> %c)
+  ret <4 x i16> %d
+}
+
+define <7 x i16> @rotl_v7i16(<7 x i16> %a, <7 x i16> %c) {
+; CHECK-SD-LABEL: rotl_v7i16:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: movi v2.8h, #15
+; CHECK-SD-NEXT: neg v3.8h, v1.8h
+; CHECK-SD-NEXT: and v3.16b, v3.16b, v2.16b
+; CHECK-SD-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT: neg v2.8h, v3.8h
+; CHECK-SD-NEXT: ushl v1.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT: ushl v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rotl_v7i16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov w8, #0 // =0x0
+; CHECK-GI-NEXT: mov w9, #15 // =0xf
+; CHECK-GI-NEXT: fmov s2, w8
+; CHECK-GI-NEXT: fmov s3, w9
+; CHECK-GI-NEXT: mov v2.h[1], w8
+; CHECK-GI-NEXT: mov v3.h[1], w9
+; CHECK-GI-NEXT: mov v2.h[2], w8
+; CHECK-GI-NEXT: mov v3.h[2], w9
+; CHECK-GI-NEXT: mov v2.h[3], w8
+; CHECK-GI-NEXT: mov v3.h[3], w9
+; CHECK-GI-NEXT: mov v2.h[4], w8
+; CHECK-GI-NEXT: mov v3.h[4], w9
+; CHECK-GI-NEXT: mov v2.h[5], w8
+; CHECK-GI-NEXT: mov v3.h[5], w9
+; CHECK-GI-NEXT: mov v2.h[6], w8
+; CHECK-GI-NEXT: mov v3.h[6], w9
+; CHECK-GI-NEXT: sub v2.8h, v2.8h, v1.8h
+; CHECK-GI-NEXT: and v1.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT: and v2.16b, v2.16b, v3.16b
+; CHECK-GI-NEXT: ushl v1.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: neg v2.8h, v2.8h
+; CHECK-GI-NEXT: ushl v0.8h, v0.8h, v2.8h
+; CHECK-GI-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT: ret
+entry:
+  %d = call <7 x i16> @llvm.fshl(<7 x i16> %a, <7 x i16> %a, <7 x i16> %c)
+  ret <7 x i16> %d
+}
+
+define <7 x i16> @rotr_v7i16(<7 x i16> %a, <7 x i16> %c) {
+; CHECK-SD-LABEL: rotr_v7i16:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: movi v2.8h, #15
+; CHECK-SD-NEXT: neg v3.8h, v1.8h
+; CHECK-SD-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT: and v2.16b, v3.16b, v2.16b
+; CHECK-SD-NEXT: neg v1.8h, v1.8h
+; CHECK-SD-NEXT: ushl v2.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT: ushl v0.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rotr_v7i16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov w8, #0 // =0x0
+; CHECK-GI-NEXT: mov w9, #15 // =0xf
+; CHECK-GI-NEXT: fmov s2, w8
+; CHECK-GI-NEXT: fmov s3, w9
+; CHECK-GI-NEXT: mov v2.h[1], w8
+; CHECK-GI-NEXT: mov v3.h[1], w9
+; CHECK-GI-NEXT: mov v2.h[2], w8
+; CHECK-GI-NEXT: mov v3.h[2], w9
+; CHECK-GI-NEXT: mov v2.h[3], w8
+; CHECK-GI-NEXT: mov v3.h[3], w9
+; CHECK-GI-NEXT: mov v2.h[4], w8
+; CHECK-GI-NEXT: mov v3.h[4], w9
+; CHECK-GI-NEXT: mov v2.h[5], w8
+; CHECK-GI-NEXT: mov v3.h[5], w9
+; CHECK-GI-NEXT: mov v2.h[6], w8
+; CHECK-GI-NEXT: mov v3.h[6], w9
+; CHECK-GI-NEXT: sub v2.8h, v2.8h, v1.8h
+; CHECK-GI-NEXT: and v1.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT: neg v1.8h, v1.8h
+; CHECK-GI-NEXT: and v2.16b, v2.16b, v3.16b
+; CHECK-GI-NEXT: ushl v1.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: ushl v0.8h, v0.8h, v2.8h
+; CHECK-GI-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT: ret
+entry:
+  %d = call <7 x i16> @llvm.fshr(<7 x i16> %a, <7 x i16> %a, <7 x i16> %c)
+  ret <7 x i16> %d
+}
+
+define <8 x i16> @rotl_v8i16(<8 x i16> %a, <8 x i16> %c) {
+; CHECK-LABEL: rotl_v8i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: movi v2.8h, #15
+; CHECK-NEXT: neg v3.8h, v1.8h
+; CHECK-NEXT: and v3.16b, v3.16b, v2.16b
+; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT: neg v2.8h, v3.8h
+; CHECK-NEXT: ushl v1.8h, v0.8h, v1.8h
+; CHECK-NEXT: ushl v0.8h, v0.8h, v2.8h
+; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: ret
+entry:
+  %d = call <8 x i16> @llvm.fshl(<8 x i16> %a, <8 x i16> %a, <8 x i16> %c)
+  ret <8 x i16> %d
+}
+
+define <8 x i16> @rotr_v8i16(<8 x i16> %a, <8 x i16> %c) {
+; CHECK-SD-LABEL: rotr_v8i16:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: movi v2.8h, #15
+; CHECK-SD-NEXT: neg v3.8h, v1.8h
+; CHECK-SD-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT: and v2.16b, v3.16b, v2.16b
+; CHECK-SD-NEXT: neg v1.8h, v1.8h
+; CHECK-SD-NEXT: ushl v2.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT: ushl v0.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rotr_v8i16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: movi v2.8h, #15
+; CHECK-GI-NEXT: neg v3.8h, v1.8h
+; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT: and v2.16b, v3.16b, v2.16b
+; CHECK-GI-NEXT: neg v1.8h, v1.8h
+; CHECK-GI-NEXT: ushl v1.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: ushl v0.8h, v0.8h, v2.8h
+; CHECK-GI-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT: ret
+entry:
+  %d = call <8 x i16> @llvm.fshr(<8 x i16> %a, <8 x i16> %a, <8 x i16> %c)
+  ret <8 x i16> %d
+}
+
+define <16 x i16> @rotl_v16i16(<16 x i16> %a, <16 x i16> %c) {
+; CHECK-LABEL: rotl_v16i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: movi v4.8h, #15
+; CHECK-NEXT: neg v5.8h, v2.8h
+; CHECK-NEXT: neg v6.8h, v3.8h
+; CHECK-NEXT: and v5.16b, v5.16b, v4.16b
+; CHECK-NEXT: and v6.16b, v6.16b, v4.16b
+; CHECK-NEXT: and v2.16b, v2.16b, v4.16b
+; CHECK-NEXT: and v3.16b, v3.16b, v4.16b
+; CHECK-NEXT: neg v4.8h, v5.8h
+; CHECK-NEXT: neg v5.8h, v6.8h
+; CHECK-NEXT: ushl v2.8h, v0.8h, v2.8h
+; CHECK-NEXT: ushl v3.8h, v1.8h, v3.8h
+; CHECK-NEXT: ushl v0.8h, v0.8h, v4.8h
+; CHECK-NEXT: ushl v1.8h, v1.8h, v5.8h
+; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b
+; CHECK-NEXT: orr v1.16b, v3.16b, v1.16b
+; CHECK-NEXT: ret
+entry:
+  %d = call <16 x i16> @llvm.fshl(<16 x i16> %a, <16 x i16> %a, <16 x i16> %c)
+  ret <16 x i16> %d
+}
+
+define <16 x i16> @rotr_v16i16(<16 x i16> %a, <16 x i16> %c) {
+; CHECK-SD-LABEL: rotr_v16i16:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: movi v4.8h, #15
+; CHECK-SD-NEXT: neg v5.8h, v2.8h
+; CHECK-SD-NEXT: neg v6.8h, v3.8h
+; CHECK-SD-NEXT: and v2.16b, v2.16b, v4.16b
+; CHECK-SD-NEXT: and v3.16b, v3.16b, v4.16b
+; CHECK-SD-NEXT: and v5.16b, v5.16b, v4.16b
+; CHECK-SD-NEXT: and v4.16b, v6.16b, v4.16b
+; CHECK-SD-NEXT: neg v2.8h, v2.8h
+; CHECK-SD-NEXT: neg v3.8h, v3.8h
+; CHECK-SD-NEXT: ushl v5.8h, v0.8h, v5.8h
+; CHECK-SD-NEXT: ushl v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT: ushl v2.8h, v1.8h, v4.8h
+; CHECK-SD-NEXT: ushl v1.8h, v1.8h, v3.8h
+; CHECK-SD-NEXT: orr v0.16b, v0.16b, v5.16b
+; CHECK-SD-NEXT: orr v1.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rotr_v16i16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: movi v4.8h, #15
+; CHECK-GI-NEXT: neg v6.8h, v3.8h
+; CHECK-GI-NEXT: and v5.16b, v2.16b, v4.16b
+; CHECK-GI-NEXT: neg v2.8h, v2.8h
+; CHECK-GI-NEXT: and v3.16b, v3.16b, v4.16b
+; CHECK-GI-NEXT: neg v5.8h, v5.8h
+; CHECK-GI-NEXT: and v2.16b, v2.16b, v4.16b
+; CHECK-GI-NEXT: and v4.16b, v6.16b, v4.16b
+; CHECK-GI-NEXT: neg v3.8h, v3.8h
+; CHECK-GI-NEXT: ushl v5.8h, v0.8h, v5.8h
+; CHECK-GI-NEXT: ushl v0.8h, v0.8h, v2.8h
+; CHECK-GI-NEXT: ushl v3.8h, v1.8h, v3.8h
+; CHECK-GI-NEXT: ushl v1.8h, v1.8h, v4.8h
+; CHECK-GI-NEXT: orr v0.16b, v5.16b, v0.16b
+; CHECK-GI-NEXT: orr v1.16b, v3.16b, v1.16b
+; CHECK-GI-NEXT: ret
+entry:
+  %d = call <16 x i16> @llvm.fshr(<16 x i16> %a, <16 x i16> %a, <16 x i16> %c)
+  ret <16 x i16> %d
+}
+
+define <2 x i32> @rotl_v2i32(<2 x i32> %a, <2 x i32> %c) {
+; CHECK-LABEL: rotl_v2i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: movi v2.2s, #31
+; CHECK-NEXT: neg v3.2s, v1.2s
+; CHECK-NEXT: and v3.8b, v3.8b, v2.8b
+; CHECK-NEXT: and v1.8b, v1.8b, v2.8b
+; CHECK-NEXT: neg v2.2s, v3.2s
+; CHECK-NEXT: ushl v1.2s, v0.2s, v1.2s
+; CHECK-NEXT: ushl v0.2s, v0.2s, v2.2s
+; CHECK-NEXT: orr v0.8b, v1.8b, v0.8b
+; CHECK-NEXT: ret
+entry:
+  %d = call <2 x i32> @llvm.fshl(<2 x i32> %a, <2 x i32> %a, <2 x i32> %c)
+  ret <2 x i32> %d
+}
+
+define <2 x i32> @rotr_v2i32(<2 x i32> %a, <2 x i32> %c) {
+; CHECK-SD-LABEL: rotr_v2i32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: movi v2.2s, #31
+; CHECK-SD-NEXT: neg v3.2s, v1.2s
+; CHECK-SD-NEXT: and v1.8b, v1.8b, v2.8b
+; CHECK-SD-NEXT: and v2.8b, v3.8b, v2.8b
+; CHECK-SD-NEXT: neg v1.2s, v1.2s
+; CHECK-SD-NEXT: ushl v2.2s, v0.2s, v2.2s
+; CHECK-SD-NEXT: ushl v0.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT: orr v0.8b, v0.8b, v2.8b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rotr_v2i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: movi v2.2s, #31
+; CHECK-GI-NEXT: neg v3.2s, v1.2s
+; CHECK-GI-NEXT: and v1.8b, v1.8b, v2.8b
+; CHECK-GI-NEXT: and v2.8b, v3.8b, v2.8b
+; CHECK-GI-NEXT: neg v1.2s, v1.2s
+; CHECK-GI-NEXT: ushl v1.2s, v0.2s, v1.2s
+; CHECK-GI-NEXT: ushl v0.2s, v0.2s, v2.2s
+; CHECK-GI-NEXT: orr v0.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT: ret
+entry:
+  %d = call <2 x i32> @llvm.fshr(<2 x i32> %a, <2 x i32> %a, <2 x i32> %c)
+  ret <2 x i32> %d
+}
+
+define <4 x i32> @rotl_v4i32(<4 x i32> %a, <4 x i32> %c) {
+; CHECK-LABEL: rotl_v4i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: movi v2.4s, #31
+; CHECK-NEXT: neg v3.4s, v1.4s
+; CHECK-NEXT: and v3.16b, v3.16b, v2.16b
+; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT: neg v2.4s, v3.4s
+; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
+; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: ret
+entry:
+  %d = call <4 x i32> @llvm.fshl(<4 x i32> %a, <4 x i32> %a, <4 x i32> %c)
+  ret <4 x i32> %d
+}
+
+define <4 x i32> @rotr_v4i32(<4 x i32> %a, <4 x i32> %c) {
+; CHECK-SD-LABEL: rotr_v4i32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: movi v2.4s, #31
+; CHECK-SD-NEXT: neg v3.4s, v1.4s
+; CHECK-SD-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT: and v2.16b, v3.16b, v2.16b
+; CHECK-SD-NEXT: neg v1.4s, v1.4s
+; CHECK-SD-NEXT: ushl v2.4s, v0.4s, v2.4s
+; CHECK-SD-NEXT: ushl v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rotr_v4i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: movi v2.4s, #31
+; CHECK-GI-NEXT: neg v3.4s, v1.4s
+; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT: and v2.16b, v3.16b, v2.16b
+; CHECK-GI-NEXT: neg v1.4s, v1.4s
+; CHECK-GI-NEXT: ushl v1.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v2.4s
+; CHECK-GI-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT: ret
+entry:
+  %d = call <4 x i32> @llvm.fshr(<4 x i32> %a, <4 x i32> %a, <4 x i32> %c)
+  ret <4 x i32> %d
+}
+
+define <7 x i32> @rotl_v7i32(<7 x i32> %a, <7 x i32> %c) {
+; CHECK-SD-LABEL: rotl_v7i32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fmov s0, w7
+; CHECK-SD-NEXT: mov x8, sp
+; CHECK-SD-NEXT: ldr s1, [sp, #24]
+; CHECK-SD-NEXT: fmov s2, w0
+; CHECK-SD-NEXT: add x9, sp, #40
+; CHECK-SD-NEXT: fmov s5, w4
+; CHECK-SD-NEXT: movi v3.4s, #31
+; CHECK-SD-NEXT: ld1 { v0.s }[1], [x8]
+; CHECK-SD-NEXT: add x8, sp, #8
+; CHECK-SD-NEXT: mov v2.s[1], w1
+; CHECK-SD-NEXT: mov v5.s[1], w5
+; CHECK-SD-NEXT: ld1 { v0.s }[2], [x8]
+; CHECK-SD-NEXT: add x8, sp, #32
+; CHECK-SD-NEXT: ld1 { v1.s }[1], [x8]
+; CHECK-SD-NEXT: add x8, sp, #16
+; CHECK-SD-NEXT: mov v2.s[2], w2
+; CHECK-SD-NEXT: mov v5.s[2], w6
+; CHECK-SD-NEXT: ld1 { v0.s }[3], [x8]
+; CHECK-SD-NEXT: ld1 { v1.s }[2], [x9]
+; CHECK-SD-NEXT: neg v4.4s, v0.4s
+; CHECK-SD-NEXT: mov v2.s[3], w3
+; CHECK-SD-NEXT: and v0.16b, v0.16b, v3.16b
+; CHECK-SD-NEXT: neg v6.4s, v1.4s
+; CHECK-SD-NEXT: and v1.16b, v1.16b, v3.16b
+; CHECK-SD-NEXT: and v4.16b, v4.16b, v3.16b
+; CHECK-SD-NEXT: and v6.16b, v6.16b, v3.16b
+; CHECK-SD-NEXT: ushl v0.4s, v2.4s, v0.4s
+; CHECK-SD-NEXT: ushl v1.4s, v5.4s, v1.4s
+; CHECK-SD-NEXT: neg v4.4s, v4.4s
+; CHECK-SD-NEXT: neg v3.4s, v6.4s
+; CHECK-SD-NEXT: ushl v2.4s, v2.4s, v4.4s
+; CHECK-SD-NEXT: ushl v3.4s, v5.4s, v3.4s
+; CHECK-SD-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-SD-NEXT: orr v1.16b, v1.16b, v3.16b
+; CHECK-SD-NEXT: mov w1, v0.s[1]
+; CHECK-SD-NEXT: mov w2, v0.s[2]
+; CHECK-SD-NEXT: mov w3, v0.s[3]
+; CHECK-SD-NEXT: mov w5, v1.s[1]
+; CHECK-SD-NEXT: mov w6, v1.s[2]
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: fmov w4, s1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rotl_v7i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v1.s[0], w7
+; CHECK-GI-NEXT: ldr s0, [sp, #24]
+; CHECK-GI-NEXT: mov v2.s[0], wzr
+; CHECK-GI-NEXT: mov v3.s[0], w7
+; CHECK-GI-NEXT: mov x9, sp
+; CHECK-GI-NEXT: ldr s5, [sp, #32]
+; CHECK-GI-NEXT: mov v4.16b, v0.16b
+; CHECK-GI-NEXT: mov w8, #31 // =0x1f
+; CHECK-GI-NEXT: ldr s7, [sp]
+; CHECK-GI-NEXT: mov v16.s[0], w8
+; CHECK-GI-NEXT: mov v6.s[0], w0
+; CHECK-GI-NEXT: ldr s18, [sp, #40]
+; CHECK-GI-NEXT: ld1 { v1.s }[1], [x9]
+; CHECK-GI-NEXT: mov v2.s[1], wzr
+; CHECK-GI-NEXT: add x9, sp, #8
+; CHECK-GI-NEXT: mov v4.s[1], v5.s[0]
+; CHECK-GI-NEXT: mov v3.s[1], v7.s[0]
+; CHECK-GI-NEXT: mov v7.s[0], w0
+; CHECK-GI-NEXT: mov v19.s[0], w8
+; CHECK-GI-NEXT: ldr s17, [sp, #8]
+; CHECK-GI-NEXT: mov v20.s[0], w4
+; CHECK-GI-NEXT: ld1 { v1.s }[2], [x9]
+; CHECK-GI-NEXT: mov v16.s[1], w8
+; CHECK-GI-NEXT: add x9, sp, #16
+; CHECK-GI-NEXT: mov v2.s[2], wzr
+; CHECK-GI-NEXT: mov v6.s[1], w1
+; CHECK-GI-NEXT: mov v21.s[0], w4
+; CHECK-GI-NEXT: mov v4.s[2], v18.s[0]
+; CHECK-GI-NEXT: mov v7.s[1], w1
+; CHECK-GI-NEXT: mov v3.s[2], v17.s[0]
+; CHECK-GI-NEXT: ld1 { v1.s }[3], [x9]
+; CHECK-GI-NEXT: mov v0.s[1], v5.s[0]
+; CHECK-GI-NEXT: mov v19.s[1], w8
+; CHECK-GI-NEXT: movi v5.4s, #31
+; CHECK-GI-NEXT: mov v16.s[2], w8
+; CHECK-GI-NEXT: ldr s17, [sp, #16]
+; CHECK-GI-NEXT: mov v6.s[2], w2
+; CHECK-GI-NEXT: mov v20.s[1], w5
+; CHECK-GI-NEXT: mov v21.s[1], w5
+; CHECK-GI-NEXT: sub v2.4s, v2.4s, v4.4s
+; CHECK-GI-NEXT: neg v1.4s, v1.4s
+; CHECK-GI-NEXT: mov v7.s[2], w2
+; CHECK-GI-NEXT: mov v3.s[3], v17.s[0]
+; CHECK-GI-NEXT: mov v0.s[2], v18.s[0]
+; CHECK-GI-NEXT: mov v19.s[2], w8
+; CHECK-GI-NEXT: and v1.16b, v1.16b, v5.16b
+; CHECK-GI-NEXT: and v2.16b, v2.16b, v16.16b
+; CHECK-GI-NEXT: mov v6.s[3], w3
+; CHECK-GI-NEXT: mov v7.s[3], w3
+; CHECK-GI-NEXT: mov v20.s[2], w6
+; CHECK-GI-NEXT: mov v21.s[2], w6
+; CHECK-GI-NEXT: and v3.16b, v3.16b, v5.16b
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v19.16b
+; CHECK-GI-NEXT: neg v1.4s, v1.4s
+; CHECK-GI-NEXT: neg v2.4s, v2.4s
+; CHECK-GI-NEXT: ushl v3.4s, v6.4s, v3.4s
+; CHECK-GI-NEXT: ushl v0.4s, v20.4s, v0.4s
+; CHECK-GI-NEXT: ushl v1.4s, v7.4s, v1.4s
+; CHECK-GI-NEXT: ushl v2.4s, v21.4s, v2.4s
+; CHECK-GI-NEXT: orr v1.16b, v3.16b, v1.16b
+; CHECK-GI-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT: mov s2, v1.s[1]
+; CHECK-GI-NEXT: mov s3, v1.s[2]
+; CHECK-GI-NEXT: mov s4, v1.s[3]
+; CHECK-GI-NEXT: mov s5, v0.s[1]
+; CHECK-GI-NEXT: mov s6, v0.s[2]
+; CHECK-GI-NEXT: fmov w0, s1
+; CHECK-GI-NEXT: fmov w4, s0
+; CHECK-GI-NEXT: fmov w1, s2
+; CHECK-GI-NEXT: fmov w2, s3
+; CHECK-GI-NEXT: fmov w3, s4
+; CHECK-GI-NEXT: fmov w5, s5
+; CHECK-GI-NEXT: fmov w6, s6
+; CHECK-GI-NEXT: ret
+entry:
+  %d = call <7 x i32> @llvm.fshl(<7 x i32> %a, <7 x i32> %a, <7 x i32> %c)
+  ret <7 x i32> %d
+}
+
+define <7 x i32> @rotr_v7i32(<7 x i32> %a, <7 x i32> %c) {
+; CHECK-SD-LABEL: rotr_v7i32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fmov s0, w7
+; CHECK-SD-NEXT: mov x8, sp
+; CHECK-SD-NEXT: fmov s2, w0
+; CHECK-SD-NEXT: ldr s1, [sp, #24]
+; CHECK-SD-NEXT: add x9, sp, #32
+; CHECK-SD-NEXT: fmov s4, w4
+; CHECK-SD-NEXT: movi v3.4s, #31
+; CHECK-SD-NEXT: ld1 { v0.s }[1], [x8]
+; CHECK-SD-NEXT: add x8, sp, #8
+; CHECK-SD-NEXT: mov v2.s[1], w1
+; CHECK-SD-NEXT: ld1 { v1.s }[1], [x9]
+; CHECK-SD-NEXT: add x9, sp, #40
+; CHECK-SD-NEXT: mov v4.s[1], w5
+; CHECK-SD-NEXT: ld1 { v0.s }[2], [x8]
+; CHECK-SD-NEXT: add x8, sp, #16
+; CHECK-SD-NEXT: ld1 { v1.s }[2], [x9]
+; CHECK-SD-NEXT: mov v2.s[2], w2
+; CHECK-SD-NEXT: mov v4.s[2], w6
+; CHECK-SD-NEXT: ld1 { v0.s }[3], [x8]
+; CHECK-SD-NEXT: neg v6.4s, v1.4s
+; CHECK-SD-NEXT: and v1.16b, v1.16b, v3.16b
+; CHECK-SD-NEXT: mov v2.s[3], w3
+; CHECK-SD-NEXT: neg v5.4s, v0.4s
+; CHECK-SD-NEXT: and v0.16b, v0.16b, v3.16b
+; CHECK-SD-NEXT: neg v1.4s, v1.4s
+; CHECK-SD-NEXT: and v5.16b, v5.16b, v3.16b
+; CHECK-SD-NEXT: neg v0.4s, v0.4s
+; CHECK-SD-NEXT: and v3.16b, v6.16b, v3.16b
+; CHECK-SD-NEXT: ushl v1.4s, v4.4s, v1.4s
+; CHECK-SD-NEXT: ushl v5.4s, v2.4s, v5.4s
+; CHECK-SD-NEXT: ushl v0.4s, v2.4s, v0.4s
+; CHECK-SD-NEXT: ushl v2.4s, v4.4s, v3.4s
+; CHECK-SD-NEXT: orr v0.16b, v0.16b, v5.16b
+; CHECK-SD-NEXT: orr v1.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT: mov w1, v0.s[1]
+; CHECK-SD-NEXT: mov w2, v0.s[2]
+; CHECK-SD-NEXT: mov w3, v0.s[3]
+; CHECK-SD-NEXT: mov w5, v1.s[1]
+; CHECK-SD-NEXT: mov w6, v1.s[2]
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: fmov w4, s1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rotr_v7i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v0.s[0], w7
+; CHECK-GI-NEXT: ldr s2, [sp]
+; CHECK-GI-NEXT: mov v1.s[0], w7
+; CHECK-GI-NEXT: mov w8, #31 // =0x1f
+; CHECK-GI-NEXT: ldr s7, [sp, #8]
+; CHECK-GI-NEXT: mov v3.s[0], wzr
+; CHECK-GI-NEXT: mov v5.s[0], w8
+; CHECK-GI-NEXT: mov x11, sp
+; CHECK-GI-NEXT: ldr s16, [sp, #32]
+; CHECK-GI-NEXT: mov v4.s[0], w0
+; CHECK-GI-NEXT: mov v17.s[0], w0
+; CHECK-GI-NEXT: ldr s6, [sp, #16]
+; CHECK-GI-NEXT: mov v0.s[1], v2.s[0]
+; CHECK-GI-NEXT: ldr s2, [sp, #24]
+; CHECK-GI-NEXT: ld1 { v1.s }[1], [x11]
+; CHECK-GI-NEXT: mov v3.s[1], wzr
+; CHECK-GI-NEXT: add x10, sp, #8
+; CHECK-GI-NEXT: ldr s18, [sp, #40]
+; CHECK-GI-NEXT: mov v19.16b, v2.16b
+; CHECK-GI-NEXT: mov v2.s[1], v16.s[0]
+; CHECK-GI-NEXT: mov v5.s[1], w8
+; CHECK-GI-NEXT: ld1 { v1.s }[2], [x10]
+; CHECK-GI-NEXT: mov v4.s[1], w1
+; CHECK-GI-NEXT: mov v17.s[1], w1
+; CHECK-GI-NEXT: mov v0.s[2], v7.s[0]
+; CHECK-GI-NEXT: mov v7.s[0], w8
+; CHECK-GI-NEXT: mov v20.s[0], w4
+; CHECK-GI-NEXT: mov v19.s[1], v16.s[0]
+; CHECK-GI-NEXT: add x9, sp, #16
+; CHECK-GI-NEXT: movi v16.4s, #31
+; CHECK-GI-NEXT: mov v2.s[2], v18.s[0]
+; CHECK-GI-NEXT: mov v3.s[2], wzr
+; CHECK-GI-NEXT: mov v5.s[2], w8
+; CHECK-GI-NEXT: ld1 { v1.s }[3], [x9]
+; CHECK-GI-NEXT: mov v4.s[2], w2
+; CHECK-GI-NEXT: mov v17.s[2], w2
+; CHECK-GI-NEXT: mov v0.s[3], v6.s[0]
+; CHECK-GI-NEXT: mov v6.s[0], w4
+; CHECK-GI-NEXT: mov v7.s[1], w8
+; CHECK-GI-NEXT: mov v19.s[2], v18.s[0]
+; CHECK-GI-NEXT: mov v20.s[1], w5
+; CHECK-GI-NEXT: neg v1.4s, v1.4s
+; CHECK-GI-NEXT: and v2.16b, v2.16b, v5.16b
+; CHECK-GI-NEXT: mov v4.s[3], w3
+; CHECK-GI-NEXT: mov v17.s[3], w3
+; CHECK-GI-NEXT: mov v6.s[1], w5
+; CHECK-GI-NEXT: mov v7.s[2], w8
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v16.16b
+; CHECK-GI-NEXT: sub v3.4s, v3.4s, v19.4s
+; CHECK-GI-NEXT: mov v20.s[2], w6
+; CHECK-GI-NEXT: and v1.16b, v1.16b, v16.16b
+; CHECK-GI-NEXT: neg v2.4s, v2.4s
+; CHECK-GI-NEXT: neg v0.4s, v0.4s
+; CHECK-GI-NEXT: mov v6.s[2], w6
+; CHECK-GI-NEXT: and v3.16b, v3.16b, v7.16b
+; CHECK-GI-NEXT: ushl v1.4s, v17.4s, v1.4s
+; CHECK-GI-NEXT: ushl v2.4s, v20.4s, v2.4s
+; CHECK-GI-NEXT: ushl v0.4s, v4.4s, v0.4s
+; CHECK-GI-NEXT: ushl v3.4s, v6.4s, v3.4s
+; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: orr v1.16b, v2.16b, v3.16b
+; CHECK-GI-NEXT: mov s2, v0.s[1]
+; CHECK-GI-NEXT: mov s3, v0.s[2]
+; CHECK-GI-NEXT: mov s4, v0.s[3]
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: mov s5, v1.s[1]
+; CHECK-GI-NEXT: mov s6, v1.s[2]
+; CHECK-GI-NEXT: fmov w4, s1
+; CHECK-GI-NEXT: fmov w1, s2
+; CHECK-GI-NEXT: fmov w2, s3
+; CHECK-GI-NEXT: fmov w3, s4
+; CHECK-GI-NEXT: fmov w5, s5
+; CHECK-GI-NEXT: fmov w6, s6
+; CHECK-GI-NEXT: ret
+entry:
+  %d = call <7 x i32> @llvm.fshr(<7 x i32> %a, <7 x i32> %a, <7 x i32> %c)
+  ret <7 x i32> %d
+}
+
+define <8 x i32> @rotl_v8i32(<8 x i32> %a, <8 x i32> %c) {
+; CHECK-LABEL: rotl_v8i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: movi v4.4s, #31
+; CHECK-NEXT: neg v5.4s, v2.4s
+; CHECK-NEXT: neg v6.4s, v3.4s
+; CHECK-NEXT: and v5.16b, v5.16b, v4.16b
+; CHECK-NEXT: and v6.16b, v6.16b, v4.16b
+; CHECK-NEXT: and v2.16b, v2.16b, v4.16b
+; CHECK-NEXT: and v3.16b, v3.16b, v4.16b
+; CHECK-NEXT: neg v4.4s, v5.4s
+; CHECK-NEXT: neg v5.4s, v6.4s
+; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT: ushl v3.4s, v1.4s, v3.4s
+; CHECK-NEXT: ushl v0.4s, v0.4s, v4.4s
+; CHECK-NEXT: ushl v1.4s, v1.4s, v5.4s
+; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b
+; CHECK-NEXT: orr v1.16b, v3.16b, v1.16b
+; CHECK-NEXT: ret
+entry:
+  %d = call <8 x i32> @llvm.fshl(<8 x i32> %a, <8 x i32> %a, <8 x i32> %c)
+  ret <8 x i32> %d
+}
+
+define <8 x i32> @rotr_v8i32(<8 x i32> %a, <8 x i32> %c) {
+; CHECK-SD-LABEL: rotr_v8i32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: movi v4.4s, #31
+; CHECK-SD-NEXT: neg v5.4s, v2.4s
+; CHECK-SD-NEXT: neg v6.4s, v3.4s
+; CHECK-SD-NEXT: and v2.16b, v2.16b, v4.16b
+; CHECK-SD-NEXT: and v3.16b, v3.16b, v4.16b
+; CHECK-SD-NEXT: and v5.16b, v5.16b, v4.16b
+; CHECK-SD-NEXT: and v4.16b, v6.16b, v4.16b
+; CHECK-SD-NEXT: neg v2.4s, v2.4s
+; CHECK-SD-NEXT: neg v3.4s, v3.4s
+; CHECK-SD-NEXT: ushl v5.4s, v0.4s, v5.4s
+; CHECK-SD-NEXT: ushl v0.4s, v0.4s, v2.4s
+; CHECK-SD-NEXT: ushl v2.4s, v1.4s, v4.4s
+; CHECK-SD-NEXT: ushl v1.4s, v1.4s, v3.4s
+; CHECK-SD-NEXT: orr v0.16b, v0.16b, v5.16b
+; CHECK-SD-NEXT: orr v1.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rotr_v8i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: movi v4.4s, #31
+; CHECK-GI-NEXT: neg v6.4s, v3.4s
+; CHECK-GI-NEXT: and v5.16b, v2.16b, v4.16b
+; CHECK-GI-NEXT: neg v2.4s, v2.4s
+; CHECK-GI-NEXT: and v3.16b, v3.16b, v4.16b
+; CHECK-GI-NEXT: neg v5.4s, v5.4s
+; CHECK-GI-NEXT: and v2.16b, v2.16b, v4.16b
+; CHECK-GI-NEXT: and v4.16b, v6.16b, v4.16b
+; CHECK-GI-NEXT: neg v3.4s, v3.4s
+; CHECK-GI-NEXT: ushl v5.4s, v0.4s, v5.4s
+; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v2.4s
+; CHECK-GI-NEXT: ushl v3.4s, v1.4s, v3.4s
+; CHECK-GI-NEXT: ushl v1.4s, v1.4s, v4.4s
+; CHECK-GI-NEXT: orr v0.16b, v5.16b, v0.16b
+; CHECK-GI-NEXT: orr v1.16b, v3.16b, v1.16b
+; CHECK-GI-NEXT: ret
+entry:
+  %d = call <8 x i32> @llvm.fshr(<8 x i32> %a, <8 x i32> %a, <8 x i32> %c)
+  ret <8 x i32> %d
+}
+
+define <2 x i64> @rotl_v2i64(<2 x i64> %a, <2 x i64> %c) {
+; CHECK-SD-LABEL: rotl_v2i64:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: mov w8, #63 // =0x3f
+; CHECK-SD-NEXT: neg v3.2d, v1.2d
+; CHECK-SD-NEXT: dup v2.2d, x8
+; CHECK-SD-NEXT: and v3.16b, v3.16b, v2.16b
+; CHECK-SD-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT: neg v2.2d, v3.2d
+; CHECK-SD-NEXT: ushl v1.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT: ushl v0.2d, v0.2d, v2.2d
+; CHECK-SD-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rotl_v2i64:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: adrp x8, .LCPI60_0
+; CHECK-GI-NEXT: neg v2.2d, v1.2d
+; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI60_0]
+; CHECK-GI-NEXT: and v2.16b, v2.16b, v3.16b
+; CHECK-GI-NEXT: and v1.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT: neg v2.2d, v2.2d
+; CHECK-GI-NEXT: ushl v1.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT: ushl v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT: ret
+entry:
+  %d = call <2 x i64> @llvm.fshl(<2 x i64> %a, <2 x i64> %a, <2 x i64> %c)
+  ret <2 x i64> %d
+}
+
+define <2 x i64> @rotr_v2i64(<2 x i64> %a, <2 x i64> %c) {
+; CHECK-SD-LABEL: rotr_v2i64:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: mov w8, #63 // =0x3f
+; CHECK-SD-NEXT: neg v3.2d, v1.2d
+; CHECK-SD-NEXT: dup v2.2d, x8
+; CHECK-SD-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT: and v2.16b, v3.16b, v2.16b
+; CHECK-SD-NEXT: neg v1.2d, v1.2d
+; CHECK-SD-NEXT: ushl v2.2d, v0.2d, v2.2d
+; CHECK-SD-NEXT: ushl v0.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rotr_v2i64:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: adrp x8, .LCPI61_0
+; CHECK-GI-NEXT: neg v3.2d, v1.2d
+; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI61_0]
+; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT: and v2.16b, v3.16b, v2.16b
+; CHECK-GI-NEXT: neg v1.2d, v1.2d
+; CHECK-GI-NEXT: ushl v1.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT: ushl v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT: ret
+entry:
+  %d = call <2 x i64> @llvm.fshr(<2 x i64> %a, <2 x i64> %a, <2 x i64> %c)
+  ret <2 x i64> %d
+}
+
+define <4 x i64> @rotl_v4i64(<4 x i64> %a, <4 x i64> %c) {
+; CHECK-SD-LABEL: rotl_v4i64:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: mov w8, #63 // =0x3f
+; CHECK-SD-NEXT: neg v5.2d, v2.2d
+; CHECK-SD-NEXT: neg v6.2d, v3.2d
+; CHECK-SD-NEXT: dup v4.2d, x8
+; CHECK-SD-NEXT: and v5.16b, v5.16b, v4.16b
+; CHECK-SD-NEXT: and v6.16b, v6.16b, v4.16b
+; CHECK-SD-NEXT: and v2.16b, v2.16b, v4.16b
+; CHECK-SD-NEXT: and v3.16b, v3.16b, v4.16b
+; CHECK-SD-NEXT: neg v4.2d, v5.2d
+; CHECK-SD-NEXT: neg v5.2d, v6.2d
+; CHECK-SD-NEXT: ushl v2.2d, v0.2d, v2.2d
+; CHECK-SD-NEXT: ushl v3.2d, v1.2d, v3.2d
+; CHECK-SD-NEXT: ushl v0.2d, v0.2d, v4.2d
+; CHECK-SD-NEXT: ushl v1.2d, v1.2d, v5.2d
+; CHECK-SD-NEXT: orr v0.16b, v2.16b, v0.16b
+; CHECK-SD-NEXT: orr v1.16b, v3.16b, v1.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rotl_v4i64:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: adrp x8, .LCPI62_0
+; CHECK-GI-NEXT: neg v4.2d, v2.2d
+; CHECK-GI-NEXT: neg v5.2d, v3.2d
+; CHECK-GI-NEXT: ldr q6, [x8, :lo12:.LCPI62_0]
+; CHECK-GI-NEXT: and v4.16b, v4.16b, v6.16b
+; CHECK-GI-NEXT: and v5.16b, v5.16b, v6.16b
+; CHECK-GI-NEXT: and v2.16b, v2.16b, v6.16b
+; CHECK-GI-NEXT: and v3.16b, v3.16b, v6.16b
+; CHECK-GI-NEXT: neg v4.2d, v4.2d
+; CHECK-GI-NEXT: neg v5.2d, v5.2d
+; CHECK-GI-NEXT: ushl v2.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT: ushl v3.2d, v1.2d, v3.2d
+; CHECK-GI-NEXT: ushl v0.2d, v0.2d, v4.2d
+; CHECK-GI-NEXT: ushl v1.2d, v1.2d, v5.2d
+; CHECK-GI-NEXT: orr v0.16b, v2.16b, v0.16b
+; CHECK-GI-NEXT: orr v1.16b, v3.16b, v1.16b
+; CHECK-GI-NEXT: ret
+entry:
+  %d = call <4 x i64> @llvm.fshl(<4 x i64> %a, <4 x i64> %a, <4 x i64> %c)
+  ret <4 x i64> %d
+}
+
+define <4 x i64> @rotr_v4i64(<4 x i64> %a, <4 x i64> %c) {
+; CHECK-SD-LABEL: rotr_v4i64:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: mov w8, #63 // =0x3f
+; CHECK-SD-NEXT: neg v5.2d, v2.2d
+; CHECK-SD-NEXT: neg v6.2d, v3.2d
+; CHECK-SD-NEXT: dup v4.2d, x8
+; CHECK-SD-NEXT: and v2.16b, v2.16b, v4.16b
+; CHECK-SD-NEXT: and v3.16b, v3.16b, v4.16b
+; CHECK-SD-NEXT: and v5.16b, v5.16b, v4.16b
+; CHECK-SD-NEXT: and v4.16b, v6.16b, v4.16b
+; CHECK-SD-NEXT: neg v2.2d, v2.2d
+; CHECK-SD-NEXT: neg v3.2d, v3.2d
+; CHECK-SD-NEXT: ushl v5.2d, v0.2d, v5.2d
+; CHECK-SD-NEXT: ushl v0.2d, v0.2d, v2.2d
+; CHECK-SD-NEXT: ushl v2.2d, v1.2d, v4.2d
+; CHECK-SD-NEXT: ushl v1.2d, v1.2d, v3.2d
+; CHECK-SD-NEXT: orr v0.16b, v0.16b, v5.16b
+; CHECK-SD-NEXT: orr v1.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rotr_v4i64:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: adrp x8, .LCPI63_0
+; CHECK-GI-NEXT: neg v6.2d, v3.2d
+; CHECK-GI-NEXT: ldr q4, [x8, :lo12:.LCPI63_0]
+; CHECK-GI-NEXT: and v5.16b, v2.16b, v4.16b
+; CHECK-GI-NEXT: neg v2.2d, v2.2d
+; CHECK-GI-NEXT: and v3.16b, v3.16b, v4.16b
+; CHECK-GI-NEXT: neg v5.2d, v5.2d
+; CHECK-GI-NEXT: and v2.16b, v2.16b, v4.16b
+; CHECK-GI-NEXT: and v4.16b, v6.16b, v4.16b
+; CHECK-GI-NEXT: neg v3.2d, v3.2d
+; CHECK-GI-NEXT: ushl v5.2d, v0.2d, v5.2d
+; CHECK-GI-NEXT: ushl v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT: ushl v3.2d, v1.2d, v3.2d
+; CHECK-GI-NEXT: ushl v1.2d, v1.2d, v4.2d
+; CHECK-GI-NEXT: orr v0.16b, v5.16b, v0.16b
+; CHECK-GI-NEXT: orr v1.16b, v3.16b, v1.16b
+; CHECK-GI-NEXT: ret
+entry:
+  %d = call <4 x i64> @llvm.fshr(<4 x i64> %a, <4 x i64> %a, <4 x i64> %c)
+  ret <4 x i64> %d
+}
+
+define <2 x i128> @rotl_v2i128(<2 x i128> %a, <2 x i128> %c) {
+; CHECK-SD-LABEL: rotl_v2i128:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: tst x4, #0x40
+; CHECK-SD-NEXT: mvn w9, w4
+; CHECK-SD-NEXT: csel x8, x1, x0, ne
+; CHECK-SD-NEXT: csel x10, x0, x1, ne
+; CHECK-SD-NEXT: tst x6, #0x40
+; CHECK-SD-NEXT: lsl x11, x8, x4
+; CHECK-SD-NEXT: lsr x12, x10, #1
+; CHECK-SD-NEXT: lsr x8, x8, #1
+; CHECK-SD-NEXT: csel x13, x3, x2, ne
+; CHECK-SD-NEXT: csel x14, x2, x3, ne
+; CHECK-SD-NEXT: lsl x10, x10, x4
+; CHECK-SD-NEXT: lsr x15, x14, #1
+; CHECK-SD-NEXT: lsr x16, x13, #1
+; CHECK-SD-NEXT: lsr x12, x12, x9
+; CHECK-SD-NEXT: lsr x8, x8, x9
+; CHECK-SD-NEXT: lsl x9, x13, x6
+; CHECK-SD-NEXT: mvn w13, w6
+; CHECK-SD-NEXT: lsr x15, x15, x13
+; CHECK-SD-NEXT: lsl x14, x14, x6
+; CHECK-SD-NEXT: lsr x13, x16, x13
+; CHECK-SD-NEXT: orr x0, x11, x12
+; CHECK-SD-NEXT: orr x1, x10, x8
+; CHECK-SD-NEXT: orr x2, x9, x15
+; CHECK-SD-NEXT: orr x3, x14, x13
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rotl_v2i128:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: and x8, x4, #0x7f
+; CHECK-GI-NEXT: mov w9, #64 // =0x40
+; CHECK-GI-NEXT: neg x13, x4
+; CHECK-GI-NEXT: sub x10, x9, x8
+; CHECK-GI-NEXT: lsl x12, x1, x8
+; CHECK-GI-NEXT: sub x11, x8, #64
+; CHECK-GI-NEXT: lsr x10, x0, x10
+; CHECK-GI-NEXT: lsl x14, x0, x8
+; CHECK-GI-NEXT: lsl x11, x0, x11
+; CHECK-GI-NEXT: cmp x8, #64
+; CHECK-GI-NEXT: neg x15, x6
+; CHECK-GI-NEXT: orr x8, x10, x12
+; CHECK-GI-NEXT: and x10, x6, #0x7f
+; CHECK-GI-NEXT: csel x12, x14, xzr, lo
+; CHECK-GI-NEXT: sub x14, x9, x10
+; CHECK-GI-NEXT: csel x8, x8, x11, lo
+; CHECK-GI-NEXT: tst x4, #0x7f
+; CHECK-GI-NEXT: sub x11, x10, #64
+; CHECK-GI-NEXT: lsl x16, x2, x10
+; CHECK-GI-NEXT: lsr x14, x2, x14
+; CHECK-GI-NEXT: lsl x17, x3, x10
+; CHECK-GI-NEXT: csel x8, x1, x8, eq
+; CHECK-GI-NEXT: lsl x11, x2, x11
+; CHECK-GI-NEXT: cmp x10, #64
+; CHECK-GI-NEXT: and x4, x15, #0x7f
+; CHECK-GI-NEXT: orr x10, x14, x17
+; CHECK-GI-NEXT: csel x14, x16, xzr, lo
+; CHECK-GI-NEXT: and x16, x13, #0x7f
+; CHECK-GI-NEXT: csel x10, x10, x11, lo
+; CHECK-GI-NEXT: sub x11, x9, x16
+; CHECK-GI-NEXT: sub x17, x16, #64
+; CHECK-GI-NEXT: lsr x18, x0, x16
+; CHECK-GI-NEXT: lsl x11, x1, x11
+; CHECK-GI-NEXT: tst x6, #0x7f
+; CHECK-GI-NEXT: lsr x17, x1, x17
+; CHECK-GI-NEXT: csel x10, x3, x10, eq
+; CHECK-GI-NEXT: cmp x16, #64
+; CHECK-GI-NEXT: orr x11, x18, x11
+; CHECK-GI-NEXT: sub x9, x9, x4
+; CHECK-GI-NEXT: lsr x1, x1, x16
+; CHECK-GI-NEXT: csel x11, x11, x17, lo
+; CHECK-GI-NEXT: tst x13, #0x7f
+; CHECK-GI-NEXT: sub x13, x4, #64
+; CHECK-GI-NEXT: lsr x17, x2, x4
+; CHECK-GI-NEXT: lsl x9, x3, x9
+; CHECK-GI-NEXT: csel x11, x0, x11, eq
+; CHECK-GI-NEXT: cmp x16, #64
+; CHECK-GI-NEXT: lsr x13, x3, x13
+; CHECK-GI-NEXT: orr x0, x12, x11
+; CHECK-GI-NEXT: csel x16, x1, xzr, lo
+; CHECK-GI-NEXT: orr x9, x17, x9
+; CHECK-GI-NEXT: cmp x4, #64
+; CHECK-GI-NEXT: lsr x17, x3, x4
+; CHECK-GI-NEXT: csel x9, x9, x13, lo
+; CHECK-GI-NEXT: tst x15, #0x7f
+; CHECK-GI-NEXT: csel x9, x2, x9, eq
+; CHECK-GI-NEXT: cmp x4, #64
+; CHECK-GI-NEXT: orr x1, x8, x16
+; CHECK-GI-NEXT: csel x13, x17, xzr, lo
+; CHECK-GI-NEXT: orr x2, x14, x9
+; CHECK-GI-NEXT: orr x3, x10, x13
+; CHECK-GI-NEXT: ret
+entry:
+  %d = call <2 x i128> @llvm.fshl(<2 x i128> %a, <2 x i128> %a, <2 x i128> %c)
+  ret <2 x i128> %d
+}
+
+define <2 x i128> @rotr_v2i128(<2 x i128> %a, <2 x i128> %c) {
+; CHECK-SD-LABEL: rotr_v2i128:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: tst x4, #0x40
+; CHECK-SD-NEXT: mvn w9, w4
+; CHECK-SD-NEXT: csel x8, x0, x1, eq
+; CHECK-SD-NEXT: csel x10, x1, x0, eq
+; CHECK-SD-NEXT: tst x6, #0x40
+; CHECK-SD-NEXT: lsr x11, x8, x4
+; CHECK-SD-NEXT: lsl x12, x10, #1
+; CHECK-SD-NEXT: lsl x8, x8, #1
+; CHECK-SD-NEXT: csel x13, x2, x3, eq
+; CHECK-SD-NEXT: csel x14, x3, x2, eq
+; CHECK-SD-NEXT: lsr x10, x10, x4
+; CHECK-SD-NEXT: lsl x15, x14, #1
+; CHECK-SD-NEXT: lsl x16, x13, #1
+; CHECK-SD-NEXT: lsl x12, x12, x9
+; CHECK-SD-NEXT: lsl x8, x8, x9
+; CHECK-SD-NEXT: lsr x9, x13, x6
+; CHECK-SD-NEXT: mvn w13, w6
+; CHECK-SD-NEXT: lsl x15, x15, x13
+; CHECK-SD-NEXT: lsr x14, x14, x6
+; CHECK-SD-NEXT: lsl x13, x16, x13
+; CHECK-SD-NEXT: orr x0, x12, x11
+; CHECK-SD-NEXT: orr x1, x8, x10
+; CHECK-SD-NEXT: orr x2, x15, x9
+; CHECK-SD-NEXT: orr x3, x13, x14
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rotr_v2i128:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: and x8, x4, #0x7f
+; CHECK-GI-NEXT: mov w9, #64 // =0x40
+; CHECK-GI-NEXT: and x14, x6, #0x7f
+; CHECK-GI-NEXT: sub x10, x9, x8
+; CHECK-GI-NEXT: sub x11, x8, #64
+; CHECK-GI-NEXT: lsr x12, x0, x8
+; CHECK-GI-NEXT: lsl x10, x1, x10
+; CHECK-GI-NEXT: lsr x11, x1, x11
+; CHECK-GI-NEXT: cmp x8, #64
+; CHECK-GI-NEXT: sub x15, x9, x14
+; CHECK-GI-NEXT: neg x13, x4
+; CHECK-GI-NEXT: lsr x17, x3, x14
+; CHECK-GI-NEXT: orr x10, x12, x10
+; CHECK-GI-NEXT: lsr x12, x1, x8
+; CHECK-GI-NEXT: lsl x15, x3, x15
+; CHECK-GI-NEXT: csel x10, x10, x11, lo
+; CHECK-GI-NEXT: tst x4, #0x7f
+; CHECK-GI-NEXT: sub x11, x14, #64
+; CHECK-GI-NEXT: csel x10, x0, x10, eq +; CHECK-GI-NEXT: cmp x8, #64 +; CHECK-GI-NEXT: lsr x8, x2, x14 +; CHECK-GI-NEXT: lsr x11, x3, x11 +; CHECK-GI-NEXT: csel x12, x12, xzr, lo +; CHECK-GI-NEXT: cmp x14, #64 +; CHECK-GI-NEXT: orr x8, x8, x15 +; CHECK-GI-NEXT: neg x16, x6 +; CHECK-GI-NEXT: csel x8, x8, x11, lo +; CHECK-GI-NEXT: tst x6, #0x7f +; CHECK-GI-NEXT: and x11, x13, #0x7f +; CHECK-GI-NEXT: csel x8, x2, x8, eq +; CHECK-GI-NEXT: cmp x14, #64 +; CHECK-GI-NEXT: sub x14, x9, x11 +; CHECK-GI-NEXT: sub x15, x11, #64 +; CHECK-GI-NEXT: lsr x14, x0, x14 +; CHECK-GI-NEXT: lsl x18, x1, x11 +; CHECK-GI-NEXT: lsl x4, x0, x11 +; CHECK-GI-NEXT: lsl x15, x0, x15 +; CHECK-GI-NEXT: and x0, x16, #0x7f +; CHECK-GI-NEXT: csel x17, x17, xzr, lo +; CHECK-GI-NEXT: orr x14, x14, x18 +; CHECK-GI-NEXT: cmp x11, #64 +; CHECK-GI-NEXT: sub x9, x9, x0 +; CHECK-GI-NEXT: csel x14, x14, x15, lo +; CHECK-GI-NEXT: sub x15, x0, #64 +; CHECK-GI-NEXT: lsr x9, x2, x9 +; CHECK-GI-NEXT: lsl x18, x3, x0 +; CHECK-GI-NEXT: csel x11, x4, xzr, lo +; CHECK-GI-NEXT: tst x13, #0x7f +; CHECK-GI-NEXT: lsl x13, x2, x0 +; CHECK-GI-NEXT: lsl x15, x2, x15 +; CHECK-GI-NEXT: csel x14, x1, x14, eq +; CHECK-GI-NEXT: orr x9, x9, x18 +; CHECK-GI-NEXT: cmp x0, #64 +; CHECK-GI-NEXT: csel x13, x13, xzr, lo +; CHECK-GI-NEXT: csel x9, x9, x15, lo +; CHECK-GI-NEXT: tst x16, #0x7f +; CHECK-GI-NEXT: csel x9, x3, x9, eq +; CHECK-GI-NEXT: orr x0, x10, x11 +; CHECK-GI-NEXT: orr x1, x12, x14 +; CHECK-GI-NEXT: orr x2, x8, x13 +; CHECK-GI-NEXT: orr x3, x17, x9 +; CHECK-GI-NEXT: ret +entry: + %d = call <2 x i128> @llvm.fshr(<2 x i128> %a, <2 x i128> %a, <2 x i128> %c) + ret <2 x i128> %d +} + +define <8 x i8> @fshl_v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) { +; CHECK-LABEL: fshl_v8i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v3.8b, #7 +; CHECK-NEXT: ushr v1.8b, v1.8b, #1 +; CHECK-NEXT: bic v4.8b, v3.8b, v2.8b +; CHECK-NEXT: and v2.8b, v2.8b, v3.8b +; CHECK-NEXT: neg v3.8b, v4.8b +; CHECK-NEXT: ushl v0.8b, v0.8b, v2.8b +; CHECK-NEXT: ushl v1.8b, v1.8b, v3.8b +; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ret +entry: + %d = call <8 x i8> @llvm.fshl(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) + ret <8 x i8> %d +} + +define <8 x i8> @fshr_v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) { +; CHECK-SD-LABEL: fshr_v8i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: movi v3.8b, #7 +; CHECK-SD-NEXT: add v0.8b, v0.8b, v0.8b +; CHECK-SD-NEXT: and v4.8b, v2.8b, v3.8b +; CHECK-SD-NEXT: bic v2.8b, v3.8b, v2.8b +; CHECK-SD-NEXT: neg v3.8b, v4.8b +; CHECK-SD-NEXT: ushl v0.8b, v0.8b, v2.8b +; CHECK-SD-NEXT: ushl v1.8b, v1.8b, v3.8b +; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fshr_v8i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: movi v3.8b, #7 +; CHECK-GI-NEXT: shl v0.8b, v0.8b, #1 +; CHECK-GI-NEXT: and v4.8b, v2.8b, v3.8b +; CHECK-GI-NEXT: bic v2.8b, v3.8b, v2.8b +; CHECK-GI-NEXT: neg v3.8b, v4.8b +; CHECK-GI-NEXT: ushl v0.8b, v0.8b, v2.8b +; CHECK-GI-NEXT: ushl v1.8b, v1.8b, v3.8b +; CHECK-GI-NEXT: orr v0.8b, v0.8b, v1.8b +; CHECK-GI-NEXT: ret +entry: + %d = call <8 x i8> @llvm.fshr(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) + ret <8 x i8> %d +} + +define <16 x i8> @fshl_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: fshl_v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v3.16b, #7 +; CHECK-NEXT: ushr v1.16b, v1.16b, #1 +; CHECK-NEXT: bic v4.16b, v3.16b, v2.16b +; CHECK-NEXT: and v2.16b, v2.16b, v3.16b +; CHECK-NEXT: neg v3.16b, v4.16b +; CHECK-NEXT: ushl v0.16b, v0.16b, v2.16b +; 
CHECK-NEXT: ushl v1.16b, v1.16b, v3.16b
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+entry:
+ %d = call <16 x i8> @llvm.fshl(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c)
+ ret <16 x i8> %d
+}
+
+define <16 x i8> @fshr_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) {
+; CHECK-SD-LABEL: fshr_v16i8:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: movi v3.16b, #7
+; CHECK-SD-NEXT: add v0.16b, v0.16b, v0.16b
+; CHECK-SD-NEXT: and v4.16b, v2.16b, v3.16b
+; CHECK-SD-NEXT: bic v2.16b, v3.16b, v2.16b
+; CHECK-SD-NEXT: neg v3.16b, v4.16b
+; CHECK-SD-NEXT: ushl v0.16b, v0.16b, v2.16b
+; CHECK-SD-NEXT: ushl v1.16b, v1.16b, v3.16b
+; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fshr_v16i8:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: movi v3.16b, #7
+; CHECK-GI-NEXT: shl v0.16b, v0.16b, #1
+; CHECK-GI-NEXT: and v4.16b, v2.16b, v3.16b
+; CHECK-GI-NEXT: bic v2.16b, v3.16b, v2.16b
+; CHECK-GI-NEXT: neg v3.16b, v4.16b
+; CHECK-GI-NEXT: ushl v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT: ushl v1.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <16 x i8> @llvm.fshr(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c)
+ ret <16 x i8> %d
+}
+
+define <4 x i16> @fshl_v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) {
+; CHECK-LABEL: fshl_v4i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: movi v3.4h, #15
+; CHECK-NEXT: ushr v1.4h, v1.4h, #1
+; CHECK-NEXT: bic v4.8b, v3.8b, v2.8b
+; CHECK-NEXT: and v2.8b, v2.8b, v3.8b
+; CHECK-NEXT: neg v3.4h, v4.4h
+; CHECK-NEXT: ushl v0.4h, v0.4h, v2.4h
+; CHECK-NEXT: ushl v1.4h, v1.4h, v3.4h
+; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
+entry:
+ %d = call <4 x i16> @llvm.fshl(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c)
+ ret <4 x i16> %d
+}
+
+define <4 x i16> @fshr_v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) {
+; CHECK-SD-LABEL: fshr_v4i16:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: movi v3.4h, #15
+; CHECK-SD-NEXT: add v0.4h, v0.4h, v0.4h
+; CHECK-SD-NEXT: and v4.8b, v2.8b, v3.8b
+; CHECK-SD-NEXT: bic v2.8b, v3.8b, v2.8b
+; CHECK-SD-NEXT: neg v3.4h, v4.4h
+; CHECK-SD-NEXT: ushl v0.4h, v0.4h, v2.4h
+; CHECK-SD-NEXT: ushl v1.4h, v1.4h, v3.4h
+; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fshr_v4i16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: movi v3.4h, #15
+; CHECK-GI-NEXT: shl v0.4h, v0.4h, #1
+; CHECK-GI-NEXT: and v4.8b, v2.8b, v3.8b
+; CHECK-GI-NEXT: bic v2.8b, v3.8b, v2.8b
+; CHECK-GI-NEXT: neg v3.4h, v4.4h
+; CHECK-GI-NEXT: ushl v0.4h, v0.4h, v2.4h
+; CHECK-GI-NEXT: ushl v1.4h, v1.4h, v3.4h
+; CHECK-GI-NEXT: orr v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <4 x i16> @llvm.fshr(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c)
+ ret <4 x i16> %d
+}
+
+define <7 x i16> @fshl_v7i16(<7 x i16> %a, <7 x i16> %b, <7 x i16> %c) {
+; CHECK-SD-LABEL: fshl_v7i16:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: movi v3.8h, #15
+; CHECK-SD-NEXT: ushr v1.8h, v1.8h, #1
+; CHECK-SD-NEXT: bic v4.16b, v3.16b, v2.16b
+; CHECK-SD-NEXT: and v2.16b, v2.16b, v3.16b
+; CHECK-SD-NEXT: neg v3.8h, v4.8h
+; CHECK-SD-NEXT: ushl v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT: ushl v1.8h, v1.8h, v3.8h
+; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fshl_v7i16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov w8, #65535 // =0xffff
+; CHECK-GI-NEXT: mov w9, #15 // =0xf
+; CHECK-GI-NEXT: mov w10, #1 // =0x1
+; CHECK-GI-NEXT: fmov s3, w8
+; CHECK-GI-NEXT: fmov s4, w9
+; CHECK-GI-NEXT: fmov s5, w10
+; CHECK-GI-NEXT: mov v3.h[1], w8
+; CHECK-GI-NEXT: mov v4.h[1], w9
+; CHECK-GI-NEXT: mov v5.h[1], w10
+; CHECK-GI-NEXT: mov v3.h[2], w8
+; CHECK-GI-NEXT: mov v4.h[2], w9
+; CHECK-GI-NEXT: mov v5.h[2], w10
+; CHECK-GI-NEXT: mov v3.h[3], w8
+; CHECK-GI-NEXT: mov v4.h[3], w9
+; CHECK-GI-NEXT: mov v5.h[3], w10
+; CHECK-GI-NEXT: mov v3.h[4], w8
+; CHECK-GI-NEXT: mov v4.h[4], w9
+; CHECK-GI-NEXT: mov v5.h[4], w10
+; CHECK-GI-NEXT: mov v3.h[5], w8
+; CHECK-GI-NEXT: mov v4.h[5], w9
+; CHECK-GI-NEXT: mov v5.h[5], w10
+; CHECK-GI-NEXT: mov v3.h[6], w8
+; CHECK-GI-NEXT: mov v4.h[6], w9
+; CHECK-GI-NEXT: mov v5.h[6], w10
+; CHECK-GI-NEXT: eor v3.16b, v2.16b, v3.16b
+; CHECK-GI-NEXT: neg v5.8h, v5.8h
+; CHECK-GI-NEXT: and v2.16b, v2.16b, v4.16b
+; CHECK-GI-NEXT: and v3.16b, v3.16b, v4.16b
+; CHECK-GI-NEXT: ushl v1.8h, v1.8h, v5.8h
+; CHECK-GI-NEXT: ushl v0.8h, v0.8h, v2.8h
+; CHECK-GI-NEXT: neg v3.8h, v3.8h
+; CHECK-GI-NEXT: ushl v1.8h, v1.8h, v3.8h
+; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <7 x i16> @llvm.fshl(<7 x i16> %a, <7 x i16> %b, <7 x i16> %c)
+ ret <7 x i16> %d
+}
+
+define <7 x i16> @fshr_v7i16(<7 x i16> %a, <7 x i16> %b, <7 x i16> %c) {
+; CHECK-SD-LABEL: fshr_v7i16:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: movi v3.8h, #15
+; CHECK-SD-NEXT: add v0.8h, v0.8h, v0.8h
+; CHECK-SD-NEXT: and v4.16b, v2.16b, v3.16b
+; CHECK-SD-NEXT: bic v2.16b, v3.16b, v2.16b
+; CHECK-SD-NEXT: neg v3.8h, v4.8h
+; CHECK-SD-NEXT: ushl v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT: ushl v1.8h, v1.8h, v3.8h
+; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fshr_v7i16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov w8, #15 // =0xf
+; CHECK-GI-NEXT: mov w9, #65535 // =0xffff
+; CHECK-GI-NEXT: mov w10, #1 // =0x1
+; CHECK-GI-NEXT: fmov s3, w8
+; CHECK-GI-NEXT: fmov s4, w9
+; CHECK-GI-NEXT: fmov s5, w10
+; CHECK-GI-NEXT: mov v3.h[1], w8
+; CHECK-GI-NEXT: mov v4.h[1], w9
+; CHECK-GI-NEXT: mov v5.h[1], w10
+; CHECK-GI-NEXT: mov v3.h[2], w8
+; CHECK-GI-NEXT: mov v4.h[2], w9
+; CHECK-GI-NEXT: mov v5.h[2], w10
+; CHECK-GI-NEXT: mov v3.h[3], w8
+; CHECK-GI-NEXT: mov v4.h[3], w9
+; CHECK-GI-NEXT: mov v5.h[3], w10
+; CHECK-GI-NEXT: mov v3.h[4], w8
+; CHECK-GI-NEXT: mov v4.h[4], w9
+; CHECK-GI-NEXT: mov v5.h[4], w10
+; CHECK-GI-NEXT: mov v3.h[5], w8
+; CHECK-GI-NEXT: mov v4.h[5], w9
+; CHECK-GI-NEXT: mov v5.h[5], w10
+; CHECK-GI-NEXT: mov v3.h[6], w8
+; CHECK-GI-NEXT: mov v4.h[6], w9
+; CHECK-GI-NEXT: mov v5.h[6], w10
+; CHECK-GI-NEXT: eor v4.16b, v2.16b, v4.16b
+; CHECK-GI-NEXT: and v2.16b, v2.16b, v3.16b
+; CHECK-GI-NEXT: ushl v0.8h, v0.8h, v5.8h
+; CHECK-GI-NEXT: and v3.16b, v4.16b, v3.16b
+; CHECK-GI-NEXT: neg v2.8h, v2.8h
+; CHECK-GI-NEXT: ushl v0.8h, v0.8h, v3.8h
+; CHECK-GI-NEXT: ushl v1.8h, v1.8h, v2.8h
+; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <7 x i16> @llvm.fshr(<7 x i16> %a, <7 x i16> %b, <7 x i16> %c)
+ ret <7 x i16> %d
+}
+
+define <8 x i16> @fshl_v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) {
+; CHECK-LABEL: fshl_v8i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: movi v3.8h, #15
+; CHECK-NEXT: ushr v1.8h, v1.8h, #1
+; CHECK-NEXT: bic v4.16b, v3.16b, v2.16b
+; CHECK-NEXT: and v2.16b, v2.16b, v3.16b
+; CHECK-NEXT: neg v3.8h, v4.8h
+; CHECK-NEXT: ushl v0.8h, v0.8h, v2.8h
+; CHECK-NEXT: ushl v1.8h, v1.8h, v3.8h
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+entry:
+ %d = call <8 x i16> @llvm.fshl(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c)
+ ret <8 x i16> %d
+}
+
+define <8 x i16> @fshr_v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) {
+; CHECK-SD-LABEL: fshr_v8i16:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: movi v3.8h, #15
+; CHECK-SD-NEXT: add v0.8h, v0.8h, v0.8h
+; CHECK-SD-NEXT: and v4.16b, v2.16b, v3.16b
+; CHECK-SD-NEXT: bic v2.16b, v3.16b, v2.16b
+; CHECK-SD-NEXT: neg v3.8h, v4.8h
+; CHECK-SD-NEXT: ushl v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT: ushl v1.8h, v1.8h, v3.8h
+; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fshr_v8i16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: movi v3.8h, #15
+; CHECK-GI-NEXT: shl v0.8h, v0.8h, #1
+; CHECK-GI-NEXT: and v4.16b, v2.16b, v3.16b
+; CHECK-GI-NEXT: bic v2.16b, v3.16b, v2.16b
+; CHECK-GI-NEXT: neg v3.8h, v4.8h
+; CHECK-GI-NEXT: ushl v0.8h, v0.8h, v2.8h
+; CHECK-GI-NEXT: ushl v1.8h, v1.8h, v3.8h
+; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <8 x i16> @llvm.fshr(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c)
+ ret <8 x i16> %d
+}
+
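; Editor's note, not part of the generated checks: a minimal sketch of the
; variable-amount expansion that both selectors use above, assuming NEON
; USHL semantics (positive per-lane amounts shift left, negative amounts
; shift right). For an element width of N bits the checks correspond to
;   fshl(a, b, c) -> (a << (c & (N-1))) | ((b >> 1) >> (~c & (N-1)))
;   fshr(a, b, c) -> ((a << 1) << (~c & (N-1))) | (b >> (c & (N-1)))
; hence the recurring (N-1) mask (movi, dup, or a literal-pool load), the
; and/bic (or eor) split of the shift amount, the neg feeding the
; right-shifting ushl, and the final orr of the two halves.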
+define <16 x i16> @fshl_v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c) {
+; CHECK-LABEL: fshl_v16i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: movi v6.8h, #15
+; CHECK-NEXT: ushr v2.8h, v2.8h, #1
+; CHECK-NEXT: ushr v3.8h, v3.8h, #1
+; CHECK-NEXT: bic v7.16b, v6.16b, v4.16b
+; CHECK-NEXT: bic v16.16b, v6.16b, v5.16b
+; CHECK-NEXT: and v4.16b, v4.16b, v6.16b
+; CHECK-NEXT: and v5.16b, v5.16b, v6.16b
+; CHECK-NEXT: neg v6.8h, v7.8h
+; CHECK-NEXT: neg v7.8h, v16.8h
+; CHECK-NEXT: ushl v0.8h, v0.8h, v4.8h
+; CHECK-NEXT: ushl v1.8h, v1.8h, v5.8h
+; CHECK-NEXT: ushl v2.8h, v2.8h, v6.8h
+; CHECK-NEXT: ushl v3.8h, v3.8h, v7.8h
+; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: orr v1.16b, v1.16b, v3.16b
+; CHECK-NEXT: ret
+entry:
+ %d = call <16 x i16> @llvm.fshl(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c)
+ ret <16 x i16> %d
+}
+
+define <16 x i16> @fshr_v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c) {
+; CHECK-SD-LABEL: fshr_v16i16:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: movi v6.8h, #15
+; CHECK-SD-NEXT: add v0.8h, v0.8h, v0.8h
+; CHECK-SD-NEXT: add v1.8h, v1.8h, v1.8h
+; CHECK-SD-NEXT: and v7.16b, v4.16b, v6.16b
+; CHECK-SD-NEXT: and v16.16b, v5.16b, v6.16b
+; CHECK-SD-NEXT: bic v4.16b, v6.16b, v4.16b
+; CHECK-SD-NEXT: bic v5.16b, v6.16b, v5.16b
+; CHECK-SD-NEXT: neg v6.8h, v7.8h
+; CHECK-SD-NEXT: neg v7.8h, v16.8h
+; CHECK-SD-NEXT: ushl v0.8h, v0.8h, v4.8h
+; CHECK-SD-NEXT: ushl v1.8h, v1.8h, v5.8h
+; CHECK-SD-NEXT: ushl v2.8h, v2.8h, v6.8h
+; CHECK-SD-NEXT: ushl v3.8h, v3.8h, v7.8h
+; CHECK-SD-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-SD-NEXT: orr v1.16b, v1.16b, v3.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fshr_v16i16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: movi v6.8h, #15
+; CHECK-GI-NEXT: shl v0.8h, v0.8h, #1
+; CHECK-GI-NEXT: shl v1.8h, v1.8h, #1
+; CHECK-GI-NEXT: and v7.16b, v4.16b, v6.16b
+; CHECK-GI-NEXT: and v16.16b, v5.16b, v6.16b
+; CHECK-GI-NEXT: bic v4.16b, v6.16b, v4.16b
+; CHECK-GI-NEXT: bic v5.16b, v6.16b, v5.16b
+; CHECK-GI-NEXT: neg v6.8h, v7.8h
+; CHECK-GI-NEXT: neg v7.8h, v16.8h
+; CHECK-GI-NEXT: ushl v0.8h, v0.8h, v4.8h
+; CHECK-GI-NEXT: ushl v1.8h, v1.8h, v5.8h
+; CHECK-GI-NEXT: ushl v2.8h, v2.8h, v6.8h
+; CHECK-GI-NEXT: ushl v3.8h, v3.8h, v7.8h
+; CHECK-GI-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT: orr v1.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <16 x i16> @llvm.fshr(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c)
+ ret <16 x i16> %d
+}
+
+define <2 x i32> @fshl_v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) {
+; CHECK-LABEL: fshl_v2i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: movi v3.2s, #31
+; CHECK-NEXT: ushr v1.2s, v1.2s, #1
+; CHECK-NEXT: bic v4.8b, v3.8b, v2.8b
+; CHECK-NEXT: and v2.8b, v2.8b, v3.8b
+; CHECK-NEXT: neg v3.2s, v4.2s
+; CHECK-NEXT: ushl v0.2s, v0.2s, v2.2s
+; CHECK-NEXT: ushl v1.2s, v1.2s, v3.2s
+; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
+entry:
+ %d = call <2 x i32> @llvm.fshl(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c)
+ ret <2 x i32> %d
+}
+
+define <2 x i32> @fshr_v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) {
+; CHECK-SD-LABEL: fshr_v2i32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: movi v3.2s, #31
+; CHECK-SD-NEXT: add v0.2s, v0.2s, v0.2s
+; CHECK-SD-NEXT: and v4.8b, v2.8b, v3.8b
+; CHECK-SD-NEXT: bic v2.8b, v3.8b, v2.8b
+; CHECK-SD-NEXT: neg v3.2s, v4.2s
+; CHECK-SD-NEXT: ushl v0.2s, v0.2s, v2.2s
+; CHECK-SD-NEXT: ushl v1.2s, v1.2s, v3.2s
+; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fshr_v2i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: movi v3.2s, #31
+; CHECK-GI-NEXT: shl v0.2s, v0.2s, #1
+; CHECK-GI-NEXT: and v4.8b, v2.8b, v3.8b
+; CHECK-GI-NEXT: bic v2.8b, v3.8b, v2.8b
+; CHECK-GI-NEXT: neg v3.2s, v4.2s
+; CHECK-GI-NEXT: ushl v0.2s, v0.2s, v2.2s
+; CHECK-GI-NEXT: ushl v1.2s, v1.2s, v3.2s
+; CHECK-GI-NEXT: orr v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <2 x i32> @llvm.fshr(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c)
+ ret <2 x i32> %d
+}
+
+define <4 x i32> @fshl_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK-LABEL: fshl_v4i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: movi v3.4s, #31
+; CHECK-NEXT: ushr v1.4s, v1.4s, #1
+; CHECK-NEXT: bic v4.16b, v3.16b, v2.16b
+; CHECK-NEXT: and v2.16b, v2.16b, v3.16b
+; CHECK-NEXT: neg v3.4s, v4.4s
+; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+entry:
+ %d = call <4 x i32> @llvm.fshl(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c)
+ ret <4 x i32> %d
+}
+
+define <4 x i32> @fshr_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK-SD-LABEL: fshr_v4i32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: movi v3.4s, #31
+; CHECK-SD-NEXT: add v0.4s, v0.4s, v0.4s
+; CHECK-SD-NEXT: and v4.16b, v2.16b, v3.16b
+; CHECK-SD-NEXT: bic v2.16b, v3.16b, v2.16b
+; CHECK-SD-NEXT: neg v3.4s, v4.4s
+; CHECK-SD-NEXT: ushl v0.4s, v0.4s, v2.4s
+; CHECK-SD-NEXT: ushl v1.4s, v1.4s, v3.4s
+; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fshr_v4i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: movi v3.4s, #31
+; CHECK-GI-NEXT: shl v0.4s, v0.4s, #1
+; CHECK-GI-NEXT: and v4.16b, v2.16b, v3.16b
+; CHECK-GI-NEXT: bic v2.16b, v3.16b, v2.16b
+; CHECK-GI-NEXT: neg v3.4s, v4.4s
+; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v2.4s
+; CHECK-GI-NEXT: ushl v1.4s, v1.4s, v3.4s
+; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <4 x i32> @llvm.fshr(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c)
+ ret <4 x i32> %d
+}
+
+define <7 x i32> @fshl_v7i32(<7 x i32> %a, <7 x i32> %b, <7 x i32> %c) {
+; CHECK-SD-LABEL: fshl_v7i32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: ldr s0, [sp, #48]
+; CHECK-SD-NEXT: add x8, sp, #56
+; CHECK-SD-NEXT: fmov s3, w7
+; CHECK-SD-NEXT: fmov s4, w0
+; CHECK-SD-NEXT: ldr s2, [sp, #80]
+; CHECK-SD-NEXT: mov x9, sp
+; CHECK-SD-NEXT: ld1 { v0.s }[1], [x8]
+; CHECK-SD-NEXT: add x10, sp, #88
+; CHECK-SD-NEXT: ldr s1, [sp, #24]
+; CHECK-SD-NEXT: ld1 { v3.s }[1], [x9]
+; CHECK-SD-NEXT: add x9, sp, #64
+; CHECK-SD-NEXT: ld1 { v2.s }[1], [x10]
+; CHECK-SD-NEXT: mov v4.s[1], w1
+; CHECK-SD-NEXT: add x8, sp, #32
+; CHECK-SD-NEXT: fmov s5, w4
+; CHECK-SD-NEXT: ld1 { v0.s }[2], [x9]
+; CHECK-SD-NEXT: ld1 { v1.s }[1], [x8]
+; CHECK-SD-NEXT: add x8, sp, #8
+; CHECK-SD-NEXT: add x9, sp, #72
+; CHECK-SD-NEXT: movi v6.4s, #31
+; CHECK-SD-NEXT: add x10, sp, #96
+; CHECK-SD-NEXT: ld1 { v3.s }[2], [x8]
+; CHECK-SD-NEXT: ld1 { v2.s }[2], [x10]
+; CHECK-SD-NEXT: mov v5.s[1], w5
+; CHECK-SD-NEXT: ld1 { v0.s }[3], [x9]
+; CHECK-SD-NEXT: mov v4.s[2], w2
+; CHECK-SD-NEXT: add x8, sp, #16
+; CHECK-SD-NEXT: add x9, sp, #40
+; CHECK-SD-NEXT: ld1 { v3.s }[3], [x8]
+; CHECK-SD-NEXT: ld1 { v1.s }[2], [x9]
+; CHECK-SD-NEXT: bic v16.16b, v6.16b, v2.16b
+; CHECK-SD-NEXT: bic v7.16b, v6.16b, v0.16b
+; CHECK-SD-NEXT: mov v5.s[2], w6
+; CHECK-SD-NEXT: and v0.16b, v0.16b, v6.16b
+; CHECK-SD-NEXT: mov v4.s[3], w3
+; CHECK-SD-NEXT: and v2.16b, v2.16b, v6.16b
+; CHECK-SD-NEXT: ushr v3.4s, v3.4s, #1
+; CHECK-SD-NEXT: ushr v1.4s, v1.4s, #1
+; CHECK-SD-NEXT: neg v6.4s, v16.4s
+; CHECK-SD-NEXT: neg v7.4s, v7.4s
+; CHECK-SD-NEXT: ushl v2.4s, v5.4s, v2.4s
+; CHECK-SD-NEXT: ushl v0.4s, v4.4s, v0.4s
+; CHECK-SD-NEXT: ushl v1.4s, v1.4s, v6.4s
+; CHECK-SD-NEXT: ushl v3.4s, v3.4s, v7.4s
+; CHECK-SD-NEXT: orr v1.16b, v2.16b, v1.16b
+; CHECK-SD-NEXT: orr v0.16b, v0.16b, v3.16b
+; CHECK-SD-NEXT: mov w5, v1.s[1]
+; CHECK-SD-NEXT: mov w6, v1.s[2]
+; CHECK-SD-NEXT: fmov w4, s1
+; CHECK-SD-NEXT: mov w1, v0.s[1]
+; CHECK-SD-NEXT: mov w2, v0.s[2]
+; CHECK-SD-NEXT: mov w3, v0.s[3]
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fshl_v7i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov w11, #-1 // =0xffffffff
+; CHECK-GI-NEXT: ldr s3, [sp, #48]
+; CHECK-GI-NEXT: ldr s19, [sp, #56]
+; CHECK-GI-NEXT: mov v20.s[0], w11
+; CHECK-GI-NEXT: mov v21.s[0], w7
+; CHECK-GI-NEXT: ldr s16, [sp, #80]
+; CHECK-GI-NEXT: ldr s22, [sp, #88]
+; CHECK-GI-NEXT: mov w12, #31 // =0x1f
+; CHECK-GI-NEXT: mov w13, #1 // =0x1
+; CHECK-GI-NEXT: ldr s6, [sp]
+; CHECK-GI-NEXT: mov v3.s[1], v19.s[0]
+; CHECK-GI-NEXT: mov v19.s[0], w12
+; CHECK-GI-NEXT: mov v23.s[0], w13
+; CHECK-GI-NEXT: mov v16.s[1], v22.s[0]
+; CHECK-GI-NEXT: ldr s7, [sp, #64]
+; CHECK-GI-NEXT: mov v20.s[1], w11
+; CHECK-GI-NEXT: mov v22.s[0], w0
+; CHECK-GI-NEXT: mov v21.s[1], v6.s[0]
+; CHECK-GI-NEXT: ldr s24, [sp, #96]
+; CHECK-GI-NEXT: mov v6.s[0], w12
+; CHECK-GI-NEXT: ldr s5, [sp, #8]
+; CHECK-GI-NEXT: ldr s18, [sp, #48]
+; CHECK-GI-NEXT: mov v3.s[2], v7.s[0]
+; CHECK-GI-NEXT: mov v19.s[1], w12
+; CHECK-GI-NEXT: mov v23.s[1], w13
+; CHECK-GI-NEXT: ldr s0, [sp, #24]
+; CHECK-GI-NEXT: ldr s4, [sp, #32]
+; CHECK-GI-NEXT: add x10, sp, #56
+; CHECK-GI-NEXT: mov v16.s[2], v24.s[0]
+; CHECK-GI-NEXT: mov v20.s[2], w11
+; CHECK-GI-NEXT: ldr s17, [sp, #72]
+; CHECK-GI-NEXT: ld1 { v18.s }[1], [x10]
+; CHECK-GI-NEXT: mov v22.s[1], w1
+; CHECK-GI-NEXT: mov v21.s[2], v5.s[0]
+; CHECK-GI-NEXT: mov v5.s[0], w4
+; CHECK-GI-NEXT: ldr s24, [sp, #80]
+; CHECK-GI-NEXT: mov v6.s[1], w12
+; CHECK-GI-NEXT: mov v0.s[1], v4.s[0]
+; CHECK-GI-NEXT: add x9, sp, #64
+; CHECK-GI-NEXT: add x10, sp, #88
+; CHECK-GI-NEXT: movi v7.4s, #31
+; CHECK-GI-NEXT: mov v3.s[3], v17.s[0]
+; CHECK-GI-NEXT: mov v19.s[2], w12
+; CHECK-GI-NEXT: mov v23.s[2], w13
+; CHECK-GI-NEXT: ldr s2, [sp, #16]
+; CHECK-GI-NEXT: ldr s1, [sp, #40]
+; CHECK-GI-NEXT: ld1 { v18.s }[2], [x9]
+; CHECK-GI-NEXT: ld1 { v24.s }[1], [x10]
+; CHECK-GI-NEXT: eor v4.16b, v16.16b, v20.16b
+; CHECK-GI-NEXT: mov v22.s[2], w2
+; CHECK-GI-NEXT: mov v5.s[1], w5
+; CHECK-GI-NEXT: add x8, sp, #72
+; CHECK-GI-NEXT: add x9, sp, #96
+; CHECK-GI-NEXT: mov v21.s[3], v2.s[0]
+; CHECK-GI-NEXT: mov v6.s[2], w12
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[0]
+; CHECK-GI-NEXT: ld1 { v18.s }[3], [x8]
+; CHECK-GI-NEXT: bic v2.16b, v7.16b, v3.16b
+; CHECK-GI-NEXT: ld1 { v24.s }[2], [x9]
+; CHECK-GI-NEXT: and v1.16b, v4.16b, v19.16b
+; CHECK-GI-NEXT: neg v3.4s, v23.4s
+; CHECK-GI-NEXT: mov v22.s[3], w3
+; CHECK-GI-NEXT: mov v5.s[2], w6
+; CHECK-GI-NEXT: and v4.16b, v18.16b, v7.16b
+; CHECK-GI-NEXT: ushr v7.4s, v21.4s, #1
+; CHECK-GI-NEXT: neg v2.4s, v2.4s
+; CHECK-GI-NEXT: and v6.16b, v24.16b, v6.16b
+; CHECK-GI-NEXT: neg v1.4s, v1.4s
+; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v3.4s
+; CHECK-GI-NEXT: ushl v3.4s, v22.4s, v4.4s
+; CHECK-GI-NEXT: ushl v2.4s, v7.4s, v2.4s
+; CHECK-GI-NEXT: ushl v4.4s, v5.4s, v6.4s
+; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: orr v1.16b, v3.16b, v2.16b
+; CHECK-GI-NEXT: orr v0.16b, v4.16b, v0.16b
+; CHECK-GI-NEXT: mov s2, v1.s[1]
+; CHECK-GI-NEXT: mov s3, v1.s[2]
+; CHECK-GI-NEXT: mov s4, v1.s[3]
+; CHECK-GI-NEXT: mov s5, v0.s[1]
+; CHECK-GI-NEXT: mov s6, v0.s[2]
+; CHECK-GI-NEXT: fmov w0, s1
+; CHECK-GI-NEXT: fmov w4, s0
+; CHECK-GI-NEXT: fmov w1, s2
+; CHECK-GI-NEXT: fmov w2, s3
+; CHECK-GI-NEXT: fmov w3, s4
+; CHECK-GI-NEXT: fmov w5, s5
+; CHECK-GI-NEXT: fmov w6, s6
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <7 x i32> @llvm.fshl(<7 x i32> %a, <7 x i32> %b, <7 x i32> %c)
+ ret <7 x i32> %d
+}
+
+define <7 x i32> @fshr_v7i32(<7 x i32> %a, <7 x i32> %b, <7 x i32> %c) {
+; CHECK-SD-LABEL: fshr_v7i32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fmov s1, w0
+; CHECK-SD-NEXT: ldr s0, [sp, #48]
+; CHECK-SD-NEXT: add x8, sp, #56
+; CHECK-SD-NEXT: ldr s2, [sp, #80]
+; CHECK-SD-NEXT: fmov s3, w4
+; CHECK-SD-NEXT: add x9, sp, #64
+; CHECK-SD-NEXT: ld1 { v0.s }[1], [x8]
+; CHECK-SD-NEXT: add x8, sp, #88
+; CHECK-SD-NEXT: fmov s4, w7
+; CHECK-SD-NEXT: mov v1.s[1], w1
+; CHECK-SD-NEXT: ld1 { v2.s }[1], [x8]
+; CHECK-SD-NEXT: mov x10, sp
+; CHECK-SD-NEXT: mov v3.s[1], w5
+; CHECK-SD-NEXT: add x8, sp, #72
+; CHECK-SD-NEXT: movi v5.4s, #31
+; CHECK-SD-NEXT: ld1 { v0.s }[2], [x9]
+; CHECK-SD-NEXT: add x9, sp, #96
+; CHECK-SD-NEXT: ld1 { v4.s }[1], [x10]
+; CHECK-SD-NEXT: ld1 { v2.s }[2], [x9]
+; CHECK-SD-NEXT: ldr s6, [sp, #24]
+; CHECK-SD-NEXT: add x9, sp, #8
+; CHECK-SD-NEXT: mov v1.s[2], w2
+; CHECK-SD-NEXT: ld1 { v0.s }[3], [x8]
+; CHECK-SD-NEXT: add x8, sp, #32
+; CHECK-SD-NEXT: mov v3.s[2], w6
+; CHECK-SD-NEXT: ld1 { v4.s }[2], [x9]
+; CHECK-SD-NEXT: ld1 { v6.s }[1], [x8]
+; CHECK-SD-NEXT: bic v16.16b, v5.16b, v2.16b
+; CHECK-SD-NEXT: and v2.16b, v2.16b, v5.16b
+; CHECK-SD-NEXT: add x8, sp, #40
+; CHECK-SD-NEXT: add x9, sp, #16
+; CHECK-SD-NEXT: mov v1.s[3], w3
+; CHECK-SD-NEXT: and v7.16b, v0.16b, v5.16b
+; CHECK-SD-NEXT: bic v0.16b, v5.16b, v0.16b
+; CHECK-SD-NEXT: ld1 { v4.s }[3], [x9]
+; CHECK-SD-NEXT: ld1 { v6.s }[2], [x8]
+; CHECK-SD-NEXT: add v3.4s, v3.4s, v3.4s
+; CHECK-SD-NEXT: neg v2.4s, v2.4s
+; CHECK-SD-NEXT: neg v5.4s, v7.4s
+; CHECK-SD-NEXT: add v1.4s, v1.4s, v1.4s
+; CHECK-SD-NEXT: ushl v3.4s, v3.4s, v16.4s
+; CHECK-SD-NEXT: ushl v2.4s, v6.4s, v2.4s
+; CHECK-SD-NEXT: ushl v0.4s, v1.4s, v0.4s
+; CHECK-SD-NEXT: ushl v1.4s, v4.4s, v5.4s
+; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT: orr v1.16b, v3.16b, v2.16b
+; CHECK-SD-NEXT: mov w1, v0.s[1]
+; CHECK-SD-NEXT: mov w2, v0.s[2]
+; CHECK-SD-NEXT: mov w3, v0.s[3]
+; CHECK-SD-NEXT: mov w5, v1.s[1]
+; CHECK-SD-NEXT: mov w6, v1.s[2]
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: fmov w4, s1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fshr_v7i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ldr s2, [sp, #48]
+; CHECK-GI-NEXT: ldr s16, [sp, #56]
+; CHECK-GI-NEXT: add x10, sp, #56
+; CHECK-GI-NEXT: ldr s4, [sp, #80]
+; CHECK-GI-NEXT: ldr s17, [sp, #88]
+; CHECK-GI-NEXT: mov w11, #-1 // =0xffffffff
+; CHECK-GI-NEXT: ldr s7, [sp, #48]
+; CHECK-GI-NEXT: mov v2.s[1], v16.s[0]
+; CHECK-GI-NEXT: mov v16.s[0], w0
+; CHECK-GI-NEXT: mov v4.s[1], v17.s[0]
+; CHECK-GI-NEXT: ldr s19, [sp, #64]
+; CHECK-GI-NEXT: mov v18.s[0], w11
+; CHECK-GI-NEXT: ld1 { v7.s }[1], [x10]
+; CHECK-GI-NEXT: mov w10, #31 // =0x1f
+; CHECK-GI-NEXT: add x9, sp, #64
+; CHECK-GI-NEXT: mov v17.s[0], w10
+; CHECK-GI-NEXT: ldr s21, [sp, #96]
+; CHECK-GI-NEXT: mov v22.s[0], w4
+; CHECK-GI-NEXT: mov v2.s[2], v19.s[0]
+; CHECK-GI-NEXT: mov v19.s[0], w7
+; CHECK-GI-NEXT: mov v16.s[1], w1
+; CHECK-GI-NEXT: ld1 { v7.s }[2], [x9]
+; CHECK-GI-NEXT: mov w9, #1 // =0x1
+; CHECK-GI-NEXT: mov v4.s[2], v21.s[0]
+; CHECK-GI-NEXT: mov v21.s[0], w10
+; CHECK-GI-NEXT: mov v23.s[0], w9
+; CHECK-GI-NEXT: ldr s6, [sp]
+; CHECK-GI-NEXT: ldr s24, [sp, #80]
+; CHECK-GI-NEXT: mov v17.s[1], w10
+; CHECK-GI-NEXT: mov v18.s[1], w11
+; CHECK-GI-NEXT: add x12, sp, #88
+; CHECK-GI-NEXT: mov v19.s[1], v6.s[0]
+; CHECK-GI-NEXT: add x8, sp, #72
+; CHECK-GI-NEXT: ld1 { v24.s }[1], [x12]
+; CHECK-GI-NEXT: mov v16.s[2], w2
+; CHECK-GI-NEXT: mov v22.s[1], w5
+; CHECK-GI-NEXT: mov v21.s[1], w10
+; CHECK-GI-NEXT: mov v23.s[1], w9
+; CHECK-GI-NEXT: ldr s5, [sp, #8]
+; CHECK-GI-NEXT: ldr s0, [sp, #24]
+; CHECK-GI-NEXT: ldr s3, [sp, #32]
+; CHECK-GI-NEXT: ld1 { v7.s }[3], [x8]
+; CHECK-GI-NEXT: movi v6.4s, #31
+; CHECK-GI-NEXT: add x8, sp, #96
+; CHECK-GI-NEXT: mov v17.s[2], w10
+; CHECK-GI-NEXT: mov v18.s[2], w11
+; CHECK-GI-NEXT: ldr s20, [sp, #72]
+; CHECK-GI-NEXT: ld1 { v24.s }[2], [x8]
+; CHECK-GI-NEXT: mov v19.s[2], v5.s[0]
+; CHECK-GI-NEXT: mov v0.s[1], v3.s[0]
+; CHECK-GI-NEXT: mov v16.s[3], w3
+; CHECK-GI-NEXT: mov v2.s[3], v20.s[0]
+; CHECK-GI-NEXT: mov v21.s[2], w10
+; CHECK-GI-NEXT: mov v22.s[2], w6
+; CHECK-GI-NEXT: mov v23.s[2], w9
+; CHECK-GI-NEXT: ldr s1, [sp, #16]
+; CHECK-GI-NEXT: and v5.16b, v7.16b, v6.16b
+; CHECK-GI-NEXT: ldr s3, [sp, #40]
+; CHECK-GI-NEXT: and v7.16b, v24.16b, v17.16b
+; CHECK-GI-NEXT: eor v4.16b, v4.16b, v18.16b
+; CHECK-GI-NEXT: mov v19.s[3], v1.s[0]
+; CHECK-GI-NEXT: shl v1.4s, v16.4s, #1
+; CHECK-GI-NEXT: mov v0.s[2], v3.s[0]
+; CHECK-GI-NEXT: bic v2.16b, v6.16b, v2.16b
+; CHECK-GI-NEXT: neg v5.4s, v5.4s
+; CHECK-GI-NEXT: and v3.16b, v4.16b, v21.16b
+; CHECK-GI-NEXT: ushl v4.4s, v22.4s, v23.4s
+; CHECK-GI-NEXT: neg v6.4s, v7.4s
+; CHECK-GI-NEXT: ushl v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT: ushl v2.4s, v19.4s, v5.4s
+; CHECK-GI-NEXT: ushl v3.4s, v4.4s, v3.4s
+; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v6.4s
+; CHECK-GI-NEXT: orr v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT: orr v0.16b, v3.16b, v0.16b
+; CHECK-GI-NEXT: mov s2, v1.s[1]
+; CHECK-GI-NEXT: mov s3, v1.s[2]
+; CHECK-GI-NEXT: mov s4, v1.s[3]
+; CHECK-GI-NEXT: mov s5, v0.s[1]
+; CHECK-GI-NEXT: mov s6, v0.s[2]
+; CHECK-GI-NEXT: fmov w0, s1
+; CHECK-GI-NEXT: fmov w4, s0
+; CHECK-GI-NEXT: fmov w1, s2
+; CHECK-GI-NEXT: fmov w2, s3
+; CHECK-GI-NEXT: fmov w3, s4
+; CHECK-GI-NEXT: fmov w5, s5
+; CHECK-GI-NEXT: fmov w6, s6
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <7 x i32> @llvm.fshr(<7 x i32> %a, <7 x i32> %b, <7 x i32> %c)
+ ret <7 x i32> %d
+}
+
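; Editor's note, not part of the generated checks: <7 x i32> is not a legal
; AArch64 vector type, so in the v7i32 tests above the operands appear to
; arrive split across w0-w7 and the stack, and the result is returned the
; same way; most of the checked instructions just rebuild and deconstruct
; the vectors, while the funnel-shift core is the same mask/neg/ushl/orr
; pattern. A per-lane scalar equivalent, for reference (assumed value
; names, illustration only):
;   %t  = and i32 %c, 31
;   %hi = shl i32 %a, %t
;   %b1 = lshr i32 %b, 1
;   %nt = xor i32 %t, 31
;   %lo = lshr i32 %b1, %nt
;   %d  = or i32 %hi, %lo        ; == fshl(i32 %a, i32 %b, i32 %c)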
+define <8 x i32> @fshl_v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c) {
+; CHECK-LABEL: fshl_v8i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: movi v6.4s, #31
+; CHECK-NEXT: ushr v2.4s, v2.4s, #1
+; CHECK-NEXT: ushr v3.4s, v3.4s, #1
+; CHECK-NEXT: bic v7.16b, v6.16b, v4.16b
+; CHECK-NEXT: bic v16.16b, v6.16b, v5.16b
+; CHECK-NEXT: and v4.16b, v4.16b, v6.16b
+; CHECK-NEXT: and v5.16b, v5.16b, v6.16b
+; CHECK-NEXT: neg v6.4s, v7.4s
+; CHECK-NEXT: neg v7.4s, v16.4s
+; CHECK-NEXT: ushl v0.4s, v0.4s, v4.4s
+; CHECK-NEXT: ushl v1.4s, v1.4s, v5.4s
+; CHECK-NEXT: ushl v2.4s, v2.4s, v6.4s
+; CHECK-NEXT: ushl v3.4s, v3.4s, v7.4s
+; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: orr v1.16b, v1.16b, v3.16b
+; CHECK-NEXT: ret
+entry:
+ %d = call <8 x i32> @llvm.fshl(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c)
+ ret <8 x i32> %d
+}
+
+define <8 x i32> @fshr_v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c) {
+; CHECK-SD-LABEL: fshr_v8i32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: movi v6.4s, #31
+; CHECK-SD-NEXT: add v0.4s, v0.4s, v0.4s
+; CHECK-SD-NEXT: add v1.4s, v1.4s, v1.4s
+; CHECK-SD-NEXT: and v7.16b, v4.16b, v6.16b
+; CHECK-SD-NEXT: and v16.16b, v5.16b, v6.16b
+; CHECK-SD-NEXT: bic v4.16b, v6.16b, v4.16b
+; CHECK-SD-NEXT: bic v5.16b, v6.16b, v5.16b
+; CHECK-SD-NEXT: neg v6.4s, v7.4s
+; CHECK-SD-NEXT: neg v7.4s, v16.4s
+; CHECK-SD-NEXT: ushl v0.4s, v0.4s, v4.4s
+; CHECK-SD-NEXT: ushl v1.4s, v1.4s, v5.4s
+; CHECK-SD-NEXT: ushl v2.4s, v2.4s, v6.4s
+; CHECK-SD-NEXT: ushl v3.4s, v3.4s, v7.4s
+; CHECK-SD-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-SD-NEXT: orr v1.16b, v1.16b, v3.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fshr_v8i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: movi v6.4s, #31
+; CHECK-GI-NEXT: shl v0.4s, v0.4s, #1
+; CHECK-GI-NEXT: shl v1.4s, v1.4s, #1
+; CHECK-GI-NEXT: and v7.16b, v4.16b, v6.16b
+; CHECK-GI-NEXT: and v16.16b, v5.16b, v6.16b
+; CHECK-GI-NEXT: bic v4.16b, v6.16b, v4.16b
+; CHECK-GI-NEXT: bic v5.16b, v6.16b, v5.16b
+; CHECK-GI-NEXT: neg v6.4s, v7.4s
+; CHECK-GI-NEXT: neg v7.4s, v16.4s
+; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v4.4s
+; CHECK-GI-NEXT: ushl v1.4s, v1.4s, v5.4s
+; CHECK-GI-NEXT: ushl v2.4s, v2.4s, v6.4s
+; CHECK-GI-NEXT: ushl v3.4s, v3.4s, v7.4s
+; CHECK-GI-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT: orr v1.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <8 x i32> @llvm.fshr(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c)
+ ret <8 x i32> %d
+}
+
+define <2 x i64> @fshl_v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) {
+; CHECK-SD-LABEL: fshl_v2i64:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: mov w8, #63 // =0x3f
+; CHECK-SD-NEXT: ushr v1.2d, v1.2d, #1
+; CHECK-SD-NEXT: dup v3.2d, x8
+; CHECK-SD-NEXT: bic v4.16b, v3.16b, v2.16b
+; CHECK-SD-NEXT: and v2.16b, v2.16b, v3.16b
+; CHECK-SD-NEXT: neg v3.2d, v4.2d
+; CHECK-SD-NEXT: ushl v0.2d, v0.2d, v2.2d
+; CHECK-SD-NEXT: ushl v1.2d, v1.2d, v3.2d
+; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fshl_v2i64:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: adrp x8, .LCPI86_0
+; CHECK-GI-NEXT: ushr v1.2d, v1.2d, #1
+; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI86_0]
+; CHECK-GI-NEXT: bic v4.16b, v3.16b, v2.16b
+; CHECK-GI-NEXT: and v2.16b, v2.16b, v3.16b
+; CHECK-GI-NEXT: neg v3.2d, v4.2d
+; CHECK-GI-NEXT: ushl v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT: ushl v1.2d, v1.2d, v3.2d
+; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <2 x i64> @llvm.fshl(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c)
+ ret <2 x i64> %d
+}
+
+define <2 x i64> @fshr_v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) {
+; CHECK-SD-LABEL: fshr_v2i64:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: mov w8, #63 // =0x3f
+; CHECK-SD-NEXT: add v0.2d, v0.2d, v0.2d
+; CHECK-SD-NEXT: dup v3.2d, x8
+; CHECK-SD-NEXT: and v4.16b, v2.16b, v3.16b
+; CHECK-SD-NEXT: bic v2.16b, v3.16b, v2.16b
+; CHECK-SD-NEXT: neg v3.2d, v4.2d
+; CHECK-SD-NEXT: ushl v0.2d, v0.2d, v2.2d
+; CHECK-SD-NEXT: ushl v1.2d, v1.2d, v3.2d
+; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fshr_v2i64:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: adrp x8, .LCPI87_0
+; CHECK-GI-NEXT: shl v0.2d, v0.2d, #1
+; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI87_0]
+; CHECK-GI-NEXT: and v4.16b, v2.16b, v3.16b
+; CHECK-GI-NEXT: bic v2.16b, v3.16b, v2.16b
+; CHECK-GI-NEXT: neg v3.2d, v4.2d
+; CHECK-GI-NEXT: ushl v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT: ushl v1.2d, v1.2d, v3.2d
+; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <2 x i64> @llvm.fshr(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c)
+ ret <2 x i64> %d
+}
+
+define <4 x i64> @fshl_v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c) {
+; CHECK-SD-LABEL: fshl_v4i64:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: mov w8, #63 // =0x3f
+; CHECK-SD-NEXT: ushr v2.2d, v2.2d, #1
+; CHECK-SD-NEXT: ushr v3.2d, v3.2d, #1
+; CHECK-SD-NEXT: dup v6.2d, x8
+; CHECK-SD-NEXT: bic v7.16b, v6.16b, v4.16b
+; CHECK-SD-NEXT: bic v16.16b, v6.16b, v5.16b
+; CHECK-SD-NEXT: and v4.16b, v4.16b, v6.16b
+; CHECK-SD-NEXT: and v5.16b, v5.16b, v6.16b
+; CHECK-SD-NEXT: neg v6.2d, v7.2d
+; CHECK-SD-NEXT: neg v7.2d, v16.2d
+; CHECK-SD-NEXT: ushl v0.2d, v0.2d, v4.2d
+; CHECK-SD-NEXT: ushl v1.2d, v1.2d, v5.2d
+; CHECK-SD-NEXT: ushl v2.2d, v2.2d, v6.2d
+; CHECK-SD-NEXT: ushl v3.2d, v3.2d, v7.2d
+; CHECK-SD-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-SD-NEXT: orr v1.16b, v1.16b, v3.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fshl_v4i64:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: adrp x8, .LCPI88_0
+; CHECK-GI-NEXT: ushr v2.2d, v2.2d, #1
+; CHECK-GI-NEXT: ushr v3.2d, v3.2d, #1
+; CHECK-GI-NEXT: ldr q6, [x8, :lo12:.LCPI88_0]
+; CHECK-GI-NEXT: bic v7.16b, v6.16b, v4.16b
+; CHECK-GI-NEXT: bic v16.16b, v6.16b, v5.16b
+; CHECK-GI-NEXT: and v4.16b, v4.16b, v6.16b
+; CHECK-GI-NEXT: and v5.16b, v5.16b, v6.16b
+; CHECK-GI-NEXT: neg v6.2d, v7.2d
+; CHECK-GI-NEXT: neg v7.2d, v16.2d
+; CHECK-GI-NEXT: ushl v0.2d, v0.2d, v4.2d
+; CHECK-GI-NEXT: ushl v1.2d, v1.2d, v5.2d
+; CHECK-GI-NEXT: ushl v2.2d, v2.2d, v6.2d
+; CHECK-GI-NEXT: ushl v3.2d, v3.2d, v7.2d
+; CHECK-GI-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT: orr v1.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <4 x i64> @llvm.fshl(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c)
+ ret <4 x i64> %d
+}
+
+define <4 x i64> @fshr_v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c) {
+; CHECK-SD-LABEL: fshr_v4i64:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: mov w8, #63 // =0x3f
+; CHECK-SD-NEXT: add v0.2d, v0.2d, v0.2d
+; CHECK-SD-NEXT: add v1.2d, v1.2d, v1.2d
+; CHECK-SD-NEXT: dup v6.2d, x8
+; CHECK-SD-NEXT: and v7.16b, v4.16b, v6.16b
+; CHECK-SD-NEXT: and v16.16b, v5.16b, v6.16b
+; CHECK-SD-NEXT: bic v4.16b, v6.16b, v4.16b
+; CHECK-SD-NEXT: bic v5.16b, v6.16b, v5.16b
+; CHECK-SD-NEXT: neg v6.2d, v7.2d
+; CHECK-SD-NEXT: neg v7.2d, v16.2d
+; CHECK-SD-NEXT: ushl v0.2d, v0.2d, v4.2d
+; CHECK-SD-NEXT: ushl v1.2d, v1.2d, v5.2d
+; CHECK-SD-NEXT: ushl v2.2d, v2.2d, v6.2d
+; CHECK-SD-NEXT: ushl v3.2d, v3.2d, v7.2d
+; CHECK-SD-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-SD-NEXT: orr v1.16b, v1.16b, v3.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fshr_v4i64:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: adrp x8, .LCPI89_0
+; CHECK-GI-NEXT: shl v0.2d, v0.2d, #1
+; CHECK-GI-NEXT: shl v1.2d, v1.2d, #1
+; CHECK-GI-NEXT: ldr q6, [x8, :lo12:.LCPI89_0]
+; CHECK-GI-NEXT: and v7.16b, v4.16b, v6.16b
+; CHECK-GI-NEXT: and v16.16b, v5.16b, v6.16b
+; CHECK-GI-NEXT: bic v4.16b, v6.16b, v4.16b
+; CHECK-GI-NEXT: bic v5.16b, v6.16b, v5.16b
+; CHECK-GI-NEXT: neg v6.2d, v7.2d
+; CHECK-GI-NEXT: neg v7.2d, v16.2d
+; CHECK-GI-NEXT: ushl v0.2d, v0.2d, v4.2d
+; CHECK-GI-NEXT: ushl v1.2d, v1.2d, v5.2d
+; CHECK-GI-NEXT: ushl v2.2d, v2.2d, v6.2d
+; CHECK-GI-NEXT: ushl v3.2d, v3.2d, v7.2d
+; CHECK-GI-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT: orr v1.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <4 x i64> @llvm.fshr(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c)
+ ret <4 x i64> %d
+}
+
+define <2 x i128> @fshl_v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> %c) {
+; CHECK-SD-LABEL: fshl_v2i128:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: ldr x8, [sp]
+; CHECK-SD-NEXT: ldr x10, [sp, #16]
+; CHECK-SD-NEXT: tst x8, #0x40
+; CHECK-SD-NEXT: mvn w13, w8
+; CHECK-SD-NEXT: mvn w16, w10
+; CHECK-SD-NEXT: csel x9, x5, x0, ne
+; CHECK-SD-NEXT: csel x11, x4, x5, ne
+; CHECK-SD-NEXT: csel x14, x0, x1, ne
+; CHECK-SD-NEXT: lsl x12, x9, x8
+; CHECK-SD-NEXT: lsr x11, x11, #1
+; CHECK-SD-NEXT: lsr x9, x9, #1
+; CHECK-SD-NEXT: tst x10, #0x40
+; CHECK-SD-NEXT: lsl x8, x14, x8
+; CHECK-SD-NEXT: csel x14, x7, x2, ne
+; CHECK-SD-NEXT: csel x15, x6, x7, ne
+; CHECK-SD-NEXT: lsr x11, x11, x13
+; CHECK-SD-NEXT: lsr x9, x9, x13
+; CHECK-SD-NEXT: lsr x13, x15, #1
+; CHECK-SD-NEXT: lsr x15, x14, #1
+; CHECK-SD-NEXT: csel x17, x2, x3, ne
+; CHECK-SD-NEXT: lsl x14, x14, x10
+; CHECK-SD-NEXT: orr x0, x12, x11
+; CHECK-SD-NEXT: lsr x13, x13, x16
+; CHECK-SD-NEXT: lsl x10, x17, x10
+; CHECK-SD-NEXT: lsr x15, x15, x16
+; CHECK-SD-NEXT: orr x1, x8, x9
+; CHECK-SD-NEXT: orr x2, x14, x13
+; CHECK-SD-NEXT: orr x3, x10, x15
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fshl_v2i128:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: str x19, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT: .cfi_offset w19, -16
+; CHECK-GI-NEXT: ldr x11, [sp, #16]
+; CHECK-GI-NEXT: mov w10, #64 // =0x40
+; CHECK-GI-NEXT: ldr x12, [sp, #32]
+; CHECK-GI-NEXT: mov w13, #127 // =0x7f
+; CHECK-GI-NEXT: and x9, x11, #0x7f
+; CHECK-GI-NEXT: and x14, x12, #0x7f
+; CHECK-GI-NEXT: mvn x15, x11
+; CHECK-GI-NEXT: sub x8, x10, x9
+; CHECK-GI-NEXT: sub x16, x9, #64
+; CHECK-GI-NEXT: lsl x19, x1, x9
+; CHECK-GI-NEXT: lsr x18, x0, x8
+; CHECK-GI-NEXT: lsl x17, x0, x9
+; CHECK-GI-NEXT: lsl x16, x0, x16
+; CHECK-GI-NEXT: cmp x9, #64
+; CHECK-GI-NEXT: bic x0, x13, x11
+; CHECK-GI-NEXT: mvn x8, x12
+; CHECK-GI-NEXT: orr x18, x18, x19
+; CHECK-GI-NEXT: csel x9, x17, xzr, lo
+; CHECK-GI-NEXT: sub x17, x14, #64
+; CHECK-GI-NEXT: csel x16, x18, x16, lo
+; CHECK-GI-NEXT: tst x11, #0x7f
+; CHECK-GI-NEXT: sub x11, x10, x14
+; CHECK-GI-NEXT: lsr x11, x2, x11
+; CHECK-GI-NEXT: lsl x18, x3, x14
+; CHECK-GI-NEXT: csel x16, x1, x16, eq
+; CHECK-GI-NEXT: lsl x1, x2, x14
+; CHECK-GI-NEXT: lsl x17, x2, x17
+; CHECK-GI-NEXT: cmp x14, #64
+; CHECK-GI-NEXT: lsl x14, x5, #63
+; CHECK-GI-NEXT: orr x11, x11, x18
+; CHECK-GI-NEXT: bic x13, x13, x12
+; CHECK-GI-NEXT: csel x18, x1, xzr, lo
+; CHECK-GI-NEXT: csel x11, x11, x17, lo
+; CHECK-GI-NEXT: tst x12, #0x7f
+; CHECK-GI-NEXT: lsr x12, x5, #1
+; CHECK-GI-NEXT: orr x14, x14, x4, lsr #1
+; CHECK-GI-NEXT: lsl x17, x7, #63
+; CHECK-GI-NEXT: sub x1, x10, x0
+; CHECK-GI-NEXT: csel x11, x3, x11, eq
+; CHECK-GI-NEXT: sub x2, x0, #64
+; CHECK-GI-NEXT: lsr x3, x14, x0
+; CHECK-GI-NEXT: lsl x1, x12, x1
+; CHECK-GI-NEXT: lsr x4, x7, #1
+; CHECK-GI-NEXT: orr x17, x17, x6, lsr #1
+; CHECK-GI-NEXT: lsr x2, x12, x2
+; CHECK-GI-NEXT: cmp x0, #64
+; CHECK-GI-NEXT: orr x1, x3, x1
+; CHECK-GI-NEXT: sub x10, x10, x13
+; CHECK-GI-NEXT: lsr x12, x12, x0
+; CHECK-GI-NEXT: csel x1, x1, x2, lo
+; CHECK-GI-NEXT: tst x15, #0x7f
+; CHECK-GI-NEXT: sub x15, x13, #64
+; CHECK-GI-NEXT: lsr x2, x17, x13
+; CHECK-GI-NEXT: lsl x10, x4, x10
+; CHECK-GI-NEXT: csel x14, x14, x1, eq
+; CHECK-GI-NEXT: cmp x0, #64
+; CHECK-GI-NEXT: lsr x15, x4, x15
+; CHECK-GI-NEXT: lsr x0, x4, x13
+; CHECK-GI-NEXT: csel x12, x12, xzr, lo
+; CHECK-GI-NEXT: orr x10, x2, x10
+; CHECK-GI-NEXT: cmp x13, #64
+; CHECK-GI-NEXT: csel x10, x10, x15, lo
+; CHECK-GI-NEXT: tst x8, #0x7f
+; CHECK-GI-NEXT: orr x1, x16, x12
+; CHECK-GI-NEXT: csel x8, x17, x10, eq
+; CHECK-GI-NEXT: cmp x13, #64
+; CHECK-GI-NEXT: csel x10, x0, xzr, lo
+; CHECK-GI-NEXT: orr x0, x9, x14
+; CHECK-GI-NEXT: orr x2, x18, x8
+; CHECK-GI-NEXT: orr x3, x11, x10
+; CHECK-GI-NEXT: ldr x19, [sp], #16 // 8-byte Folded Reload
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <2 x i128> @llvm.fshl(<2 x i128> %a, <2 x i128> %b, <2 x i128> %c)
+ ret <2 x i128> %d
+}
+
+define <2 x i128> @fshr_v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> %c) {
+; CHECK-SD-LABEL: fshr_v2i128:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: ldr x8, [sp]
+; CHECK-SD-NEXT: ldr x10, [sp, #16]
+; CHECK-SD-NEXT: tst x8, #0x40
+; CHECK-SD-NEXT: mvn w13, w8
+; CHECK-SD-NEXT: csel x9, x4, x5, eq
+; CHECK-SD-NEXT: csel x11, x5, x0, eq
+; CHECK-SD-NEXT: csel x14, x0, x1, eq
+; CHECK-SD-NEXT: tst x10, #0x40
+; CHECK-SD-NEXT: lsr x9, x9, x8
+; CHECK-SD-NEXT: lsl x12, x11, #1
+; CHECK-SD-NEXT: lsr x8, x11, x8
+; CHECK-SD-NEXT: lsl x11, x14, #1
+; CHECK-SD-NEXT: csel x14, x7, x2, eq
+; CHECK-SD-NEXT: csel x15, x2, x3, eq
+; CHECK-SD-NEXT: csel x16, x6, x7, eq
+; CHECK-SD-NEXT: lsl x17, x14, #1
+; CHECK-SD-NEXT: lsl x15, x15, #1
+; CHECK-SD-NEXT: lsl x12, x12, x13
+; CHECK-SD-NEXT: lsl x11, x11, x13
+; CHECK-SD-NEXT: lsr x13, x16, x10
+; CHECK-SD-NEXT: mvn w16, w10
+; CHECK-SD-NEXT: lsr x10, x14, x10
+; CHECK-SD-NEXT: lsl x17, x17, x16
+; CHECK-SD-NEXT: lsl x14, x15, x16
+; CHECK-SD-NEXT: orr x0, x12, x9
+; CHECK-SD-NEXT: orr x1, x11, x8
+; CHECK-SD-NEXT: orr x2, x17, x13
+; CHECK-SD-NEXT: orr x3, x14, x10
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fshr_v2i128:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ldr x9, [sp]
+; CHECK-GI-NEXT: lsl x12, x1, #1
+; CHECK-GI-NEXT: mov w11, #127 // =0x7f
+; CHECK-GI-NEXT: mov w14, #64 // =0x40
+; CHECK-GI-NEXT: lsl x15, x0, #1
+; CHECK-GI-NEXT: ldr x8, [sp, #16]
+; CHECK-GI-NEXT: bic x13, x11, x9
+; CHECK-GI-NEXT: orr x12, x12, x0, lsr #63
+; CHECK-GI-NEXT: lsl x1, x3, #1
+; CHECK-GI-NEXT: sub x17, x14, x13
+; CHECK-GI-NEXT: sub x18, x13, #64
+; CHECK-GI-NEXT: lsl x3, x15, x13
+; CHECK-GI-NEXT: lsr x17, x15, x17
+; CHECK-GI-NEXT: lsl x0, x12, x13
+; CHECK-GI-NEXT: lsl x15, x15, x18
+; CHECK-GI-NEXT: bic x11, x11, x8
+; CHECK-GI-NEXT: lsl x18, x2, #1
+; CHECK-GI-NEXT: cmp x13, #64
+; CHECK-GI-NEXT: orr x17, x17, x0
+; CHECK-GI-NEXT: orr x13, x1, x2, lsr #63
+; CHECK-GI-NEXT: mvn x16, x9
+; CHECK-GI-NEXT: csel x15, x17, x15, lo
+; CHECK-GI-NEXT: sub x17, x14, x11
+; CHECK-GI-NEXT: csel x0, x3, xzr, lo
+; CHECK-GI-NEXT: tst x16, #0x7f
+; CHECK-GI-NEXT: sub x16, x11, #64
+; CHECK-GI-NEXT: lsr x17, x18, x17
+; CHECK-GI-NEXT: lsl x2, x13, x11
+; CHECK-GI-NEXT: lsl x1, x18, x11
+; CHECK-GI-NEXT: csel x12, x12, x15, eq
+; CHECK-GI-NEXT: lsl x15, x18, x16
+; CHECK-GI-NEXT: and x10, x9, #0x7f
+; CHECK-GI-NEXT: cmp x11, #64
+; CHECK-GI-NEXT: mvn x11, x8
+; CHECK-GI-NEXT: orr x16, x17, x2
+; CHECK-GI-NEXT: csel x17, x1, xzr, lo
+; CHECK-GI-NEXT: csel x15, x16, x15, lo
+; CHECK-GI-NEXT: tst x11, #0x7f
+; CHECK-GI-NEXT: sub x11, x14, x10
+; CHECK-GI-NEXT: sub x16, x10, #64
+; CHECK-GI-NEXT: lsr x18, x4, x10
+; CHECK-GI-NEXT: lsl x11, x5, x11
+; CHECK-GI-NEXT: csel x13, x13, x15, eq
+; CHECK-GI-NEXT: lsr x15, x5, x16
+; CHECK-GI-NEXT: and x1, x8, #0x7f
+; CHECK-GI-NEXT: orr x11, x18, x11
+; CHECK-GI-NEXT: cmp x10, #64
+; CHECK-GI-NEXT: lsr x16, x5, x10
+; CHECK-GI-NEXT: csel x11, x11, x15, lo
+; CHECK-GI-NEXT: tst x9, #0x7f
+; CHECK-GI-NEXT: sub x9, x14, x1
+; CHECK-GI-NEXT: sub x14, x1, #64
+; CHECK-GI-NEXT: lsr x15, x6, x1
+; CHECK-GI-NEXT: lsl x9, x7, x9
+; CHECK-GI-NEXT: csel x11, x4, x11, eq
+; CHECK-GI-NEXT: cmp x10, #64
+; CHECK-GI-NEXT: lsr x10, x7, x14
+; CHECK-GI-NEXT: csel x14, x16, xzr, lo
+; CHECK-GI-NEXT: orr x9, x15, x9
+; CHECK-GI-NEXT: cmp x1, #64
+; CHECK-GI-NEXT: lsr x15, x7, x1
+; CHECK-GI-NEXT: csel x9, x9, x10, lo
+; CHECK-GI-NEXT: tst x8, #0x7f
+; CHECK-GI-NEXT: csel x8, x6, x9, eq
+; CHECK-GI-NEXT: cmp x1, #64
+; CHECK-GI-NEXT: orr x0, x0, x11
+; CHECK-GI-NEXT: csel x9, x15, xzr, lo
+; CHECK-GI-NEXT: orr x1, x12, x14
+; CHECK-GI-NEXT: orr x2, x17, x8
+; CHECK-GI-NEXT: orr x3, x13, x9
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <2 x i128> @llvm.fshr(<2 x i128> %a, <2 x i128> %b, <2 x i128> %c)
+ ret <2 x i128> %d
+}
+
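; Editor's note, not part of the generated checks: from here on both value
; operands are the same register, and per the LangRef fshl(x, x, c) is a
; rotate left while fshr(x, x, c) is a rotate right, which the _c tests
; below exercise with constant amounts, e.g.
;   %d = call <8 x i8> @llvm.fshl(<8 x i8> %a, <8 x i8> %a,
;                                 <8 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
; For these, SelectionDAG merges the ushr and orr into a single usra (the
; two halves cannot overlap, so accumulate equals or), while GlobalISel
; keeps the separate shl/ushr/orr sequence seen in the checks.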
+define <8 x i8> @rotl_v8i8_c(<8 x i8> %a) {
+; CHECK-SD-LABEL: rotl_v8i8_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: shl v1.8b, v0.8b, #3
+; CHECK-SD-NEXT: usra v1.8b, v0.8b, #5
+; CHECK-SD-NEXT: fmov d0, d1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rotl_v8i8_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: shl v1.8b, v0.8b, #3
+; CHECK-GI-NEXT: ushr v0.8b, v0.8b, #5
+; CHECK-GI-NEXT: orr v0.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <8 x i8> @llvm.fshl(<8 x i8> %a, <8 x i8> %a, <8 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+ ret <8 x i8> %d
+}
+
+define <8 x i8> @rotr_v8i8_c(<8 x i8> %a) {
+; CHECK-SD-LABEL: rotr_v8i8_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: shl v1.8b, v0.8b, #5
+; CHECK-SD-NEXT: usra v1.8b, v0.8b, #3
+; CHECK-SD-NEXT: fmov d0, d1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rotr_v8i8_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ushr v1.8b, v0.8b, #3
+; CHECK-GI-NEXT: shl v0.8b, v0.8b, #5
+; CHECK-GI-NEXT: orr v0.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <8 x i8> @llvm.fshr(<8 x i8> %a, <8 x i8> %a, <8 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+ ret <8 x i8> %d
+}
+
+define <16 x i8> @rotl_v16i8_c(<16 x i8> %a) {
+; CHECK-SD-LABEL: rotl_v16i8_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: shl v1.16b, v0.16b, #3
+; CHECK-SD-NEXT: usra v1.16b, v0.16b, #5
+; CHECK-SD-NEXT: mov v0.16b, v1.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rotl_v16i8_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: shl v1.16b, v0.16b, #3
+; CHECK-GI-NEXT: ushr v0.16b, v0.16b, #5
+; CHECK-GI-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <16 x i8> @llvm.fshl(<16 x i8> %a, <16 x i8> %a, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+ ret <16 x i8> %d
+}
+
+define <16 x i8> @rotr_v16i8_c(<16 x i8> %a) {
+; CHECK-SD-LABEL: rotr_v16i8_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: shl v1.16b, v0.16b, #5
+; CHECK-SD-NEXT: usra v1.16b, v0.16b, #3
+; CHECK-SD-NEXT: mov v0.16b, v1.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rotr_v16i8_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ushr v1.16b, v0.16b, #3
+; CHECK-GI-NEXT: shl v0.16b, v0.16b, #5
+; CHECK-GI-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <16 x i8> @llvm.fshr(<16 x i8> %a, <16 x i8> %a, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+ ret <16 x i8> %d
+}
+
+define <4 x i16> @rotl_v4i16_c(<4 x i16> %a) {
+; CHECK-SD-LABEL: rotl_v4i16_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: shl v1.4h, v0.4h, #3
+; CHECK-SD-NEXT: usra v1.4h, v0.4h, #13
+; CHECK-SD-NEXT: fmov d0, d1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rotl_v4i16_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: shl v1.4h, v0.4h, #3
+; CHECK-GI-NEXT: ushr v0.4h, v0.4h, #13
+; CHECK-GI-NEXT: orr v0.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <4 x i16> @llvm.fshl(<4 x i16> %a, <4 x i16> %a, <4 x i16> <i16 3, i16 3, i16 3, i16 3>)
+ ret <4 x i16> %d
+}
+
+define <4 x i16> @rotr_v4i16_c(<4 x i16> %a) {
+; CHECK-SD-LABEL: rotr_v4i16_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: shl v1.4h, v0.4h, #13
+; CHECK-SD-NEXT: usra v1.4h, v0.4h, #3
+; CHECK-SD-NEXT: fmov d0, d1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rotr_v4i16_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ushr v1.4h, v0.4h, #3
+; CHECK-GI-NEXT: shl v0.4h, v0.4h, #13
+; CHECK-GI-NEXT: orr v0.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <4 x i16> @llvm.fshr(<4 x i16> %a, <4 x i16> %a, <4 x i16> <i16 3, i16 3, i16 3, i16 3>)
+ ret <4 x i16> %d
+}
+
+define <7 x i16> @rotl_v7i16_c(<7 x i16> %a) {
+; CHECK-SD-LABEL: rotl_v7i16_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: adrp x8, .LCPI98_0
+; CHECK-SD-NEXT: adrp x9, .LCPI98_1
+; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI98_0]
+; CHECK-SD-NEXT: ldr q2, [x9, :lo12:.LCPI98_1]
+; CHECK-SD-NEXT: ushl v1.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT: ushl v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rotl_v7i16_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov w8, #13 // =0xd
+; CHECK-GI-NEXT: mov w9, #3 // =0x3
+; CHECK-GI-NEXT: fmov s1, w8
+; CHECK-GI-NEXT: fmov s2, w9
+; CHECK-GI-NEXT: mov v1.h[1], w8
+; CHECK-GI-NEXT: mov v2.h[1], w9
+; CHECK-GI-NEXT: mov v1.h[2], w8
+; CHECK-GI-NEXT: mov v2.h[2], w9
+; CHECK-GI-NEXT: mov v1.h[3], w8
+; CHECK-GI-NEXT: mov v2.h[3], w9
+; CHECK-GI-NEXT: mov v1.h[4], w8
+; CHECK-GI-NEXT: mov v2.h[4], w9
+; CHECK-GI-NEXT: mov v1.h[5], w8
+; CHECK-GI-NEXT: mov v2.h[5], w9
+; CHECK-GI-NEXT: mov v1.h[6], w8
+; CHECK-GI-NEXT: mov v2.h[6], w9
+; CHECK-GI-NEXT: neg v1.8h, v1.8h
+; CHECK-GI-NEXT: ushl v2.8h, v0.8h, v2.8h
+; CHECK-GI-NEXT: ushl v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: orr v0.16b, v2.16b, v0.16b
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <7 x i16> @llvm.fshl(<7 x i16> %a, <7 x i16> %a, <7 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+ ret <7 x i16> %d
+}
+
+define <7 x i16> @rotr_v7i16_c(<7 x i16> %a) {
+; CHECK-SD-LABEL: rotr_v7i16_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: adrp x8, .LCPI99_0
+; CHECK-SD-NEXT: adrp x9, .LCPI99_1
+; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI99_0]
+; CHECK-SD-NEXT: ldr q2, [x9, :lo12:.LCPI99_1]
+; CHECK-SD-NEXT: ushl v1.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT: ushl v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rotr_v7i16_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov w8, #3 // =0x3
+; CHECK-GI-NEXT: mov w9, #13 // =0xd
+; CHECK-GI-NEXT: fmov s1, w8
+; CHECK-GI-NEXT: fmov s2, w9
+; CHECK-GI-NEXT: mov v1.h[1], w8
+; CHECK-GI-NEXT: mov v2.h[1], w9
+; CHECK-GI-NEXT: mov v1.h[2], w8
+; CHECK-GI-NEXT: mov v2.h[2], w9
+; CHECK-GI-NEXT: mov v1.h[3], w8
+; CHECK-GI-NEXT: mov v2.h[3], w9
+; CHECK-GI-NEXT: mov v1.h[4], w8
+; CHECK-GI-NEXT: mov v2.h[4], w9
+; CHECK-GI-NEXT: mov v1.h[5], w8
+; CHECK-GI-NEXT: mov v2.h[5], w9
+; CHECK-GI-NEXT: mov v1.h[6], w8
+; CHECK-GI-NEXT: mov v2.h[6], w9
+; CHECK-GI-NEXT: neg v1.8h, v1.8h
+; CHECK-GI-NEXT: ushl v1.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: ushl v0.8h, v0.8h, v2.8h
+; CHECK-GI-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <7 x i16> @llvm.fshr(<7 x i16> %a, <7 x i16> %a, <7 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+ ret <7 x i16> %d
+}
+
+define <8 x i16> @rotl_v8i16_c(<8 x i16> %a) {
+; CHECK-SD-LABEL: rotl_v8i16_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: shl v1.8h, v0.8h, #3
+; CHECK-SD-NEXT: usra v1.8h, v0.8h, #13
+; CHECK-SD-NEXT: mov v0.16b, v1.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rotl_v8i16_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: shl v1.8h, v0.8h, #3
+; CHECK-GI-NEXT: ushr v0.8h, v0.8h, #13
+; CHECK-GI-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <8 x i16> @llvm.fshl(<8 x i16> %a, <8 x i16> %a, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+ ret <8 x i16> %d
+}
+
+define <8 x i16> @rotr_v8i16_c(<8 x i16> %a) {
+; CHECK-SD-LABEL: rotr_v8i16_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: shl v1.8h, v0.8h, #13
+; CHECK-SD-NEXT: usra v1.8h, v0.8h, #3
+; CHECK-SD-NEXT: mov v0.16b, v1.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rotr_v8i16_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ushr v1.8h, v0.8h, #3
+; CHECK-GI-NEXT: shl v0.8h, v0.8h, #13
+; CHECK-GI-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <8 x i16> @llvm.fshr(<8 x i16> %a, <8 x i16> %a, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+ ret <8 x i16> %d
+}
+
+define <16 x i16> @rotl_v16i16_c(<16 x i16> %a) {
+; CHECK-SD-LABEL: rotl_v16i16_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: shl v2.8h, v0.8h, #3
+; CHECK-SD-NEXT: shl v3.8h, v1.8h, #3
+; CHECK-SD-NEXT: usra v2.8h, v0.8h, #13
+; CHECK-SD-NEXT: usra v3.8h, v1.8h, #13
+; CHECK-SD-NEXT: mov v0.16b, v2.16b
+; CHECK-SD-NEXT: mov v1.16b, v3.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rotl_v16i16_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: shl v2.8h, v0.8h, #3
+; CHECK-GI-NEXT: shl v3.8h, v1.8h, #3
+; CHECK-GI-NEXT: ushr v0.8h, v0.8h, #13
+; CHECK-GI-NEXT: ushr v1.8h, v1.8h, #13
+; CHECK-GI-NEXT: orr v0.16b, v2.16b, v0.16b
+; CHECK-GI-NEXT: orr v1.16b, v3.16b, v1.16b
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <16 x i16> @llvm.fshl(<16 x i16> %a, <16 x i16> %a, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+ ret <16 x i16> %d
+}
+
+define <16 x i16> @rotr_v16i16_c(<16 x i16> %a) {
+; CHECK-SD-LABEL: rotr_v16i16_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: shl v2.8h, v0.8h, #13
+; CHECK-SD-NEXT: shl v3.8h, v1.8h, #13
+; CHECK-SD-NEXT: usra v2.8h, v0.8h, #3
+; CHECK-SD-NEXT: usra v3.8h, v1.8h, #3
+; CHECK-SD-NEXT: mov v0.16b, v2.16b
+; CHECK-SD-NEXT: mov v1.16b, v3.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rotr_v16i16_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ushr v2.8h, v0.8h, #3
+; CHECK-GI-NEXT: ushr v3.8h, v1.8h, #3
+; CHECK-GI-NEXT: shl v0.8h, v0.8h, #13
+; CHECK-GI-NEXT: shl v1.8h, v1.8h, #13
+; CHECK-GI-NEXT: orr v0.16b, v2.16b, v0.16b
+; CHECK-GI-NEXT: orr v1.16b, v3.16b, v1.16b
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <16 x i16> @llvm.fshr(<16 x i16> %a, <16 x i16> %a, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+ ret <16 x i16> %d
+}
+
+define <2 x i32> @rotl_v2i32_c(<2 x i32> %a) {
+; CHECK-SD-LABEL: rotl_v2i32_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: shl v1.2s, v0.2s, #3
+; CHECK-SD-NEXT: usra v1.2s, v0.2s, #29
+; CHECK-SD-NEXT: fmov d0, d1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rotl_v2i32_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: shl v1.2s, v0.2s, #3
+; CHECK-GI-NEXT: ushr v0.2s, v0.2s, #29
+; CHECK-GI-NEXT: orr v0.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <2 x i32> @llvm.fshl(<2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 3, i32 3>)
+ ret <2 x i32> %d
+}
+
+define <2 x i32> @rotr_v2i32_c(<2 x i32> %a) {
+; CHECK-SD-LABEL: rotr_v2i32_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: shl v1.2s, v0.2s, #29
+; CHECK-SD-NEXT: usra v1.2s, v0.2s, #3
+; CHECK-SD-NEXT: fmov d0, d1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rotr_v2i32_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ushr v1.2s, v0.2s, #3
+; CHECK-GI-NEXT: shl v0.2s, v0.2s, #29
+; CHECK-GI-NEXT: orr v0.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <2 x i32> @llvm.fshr(<2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 3, i32 3>)
+ ret <2 x i32> %d
+}
+
+define <4 x i32> @rotl_v4i32_c(<4 x i32> %a) {
+; CHECK-SD-LABEL: rotl_v4i32_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: shl v1.4s, v0.4s, #3
+; CHECK-SD-NEXT: usra v1.4s, v0.4s, #29
+; CHECK-SD-NEXT: mov v0.16b, v1.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rotl_v4i32_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: shl v1.4s, v0.4s, #3
+; CHECK-GI-NEXT: ushr v0.4s, v0.4s, #29
+; CHECK-GI-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <4 x i32> @llvm.fshl(<4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3>)
+ ret <4 x i32> %d
+}
+
+define <4 x i32> @rotr_v4i32_c(<4 x i32> %a) {
+; CHECK-SD-LABEL: rotr_v4i32_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: shl v1.4s, v0.4s, #29
+; CHECK-SD-NEXT: usra v1.4s, v0.4s, #3
+; CHECK-SD-NEXT: mov v0.16b, v1.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rotr_v4i32_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ushr v1.4s, v0.4s, #3
+; CHECK-GI-NEXT: shl v0.4s, v0.4s, #29
+; CHECK-GI-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <4 x i32> @llvm.fshr(<4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3>)
+ ret <4 x i32> %d
+}
+
+define <7 x i32> @rotl_v7i32_c(<7 x i32> %a) {
+; CHECK-SD-LABEL: rotl_v7i32_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fmov s0, w0
+; CHECK-SD-NEXT: fmov s1, w4
+; CHECK-SD-NEXT: adrp x8, .LCPI108_0
+; CHECK-SD-NEXT: adrp x9, .LCPI108_1
+; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI108_0]
+; CHECK-SD-NEXT: ldr q3, [x9, :lo12:.LCPI108_1]
+; CHECK-SD-NEXT: mov v0.s[1], w1
+; CHECK-SD-NEXT: mov v1.s[1], w5
+; CHECK-SD-NEXT: mov v0.s[2], w2
+; CHECK-SD-NEXT: mov v1.s[2], w6
+; CHECK-SD-NEXT: mov v0.s[3], w3
+; CHECK-SD-NEXT: ushl v2.4s, v1.4s, v2.4s
+; CHECK-SD-NEXT: ushl v1.4s, v1.4s, v3.4s
+; CHECK-SD-NEXT: shl v4.4s, v0.4s, #3
+; CHECK-SD-NEXT: usra v4.4s, v0.4s, #29
+; CHECK-SD-NEXT: orr v0.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT: mov w1, v4.s[1]
+; CHECK-SD-NEXT: mov w2, v4.s[2]
+; CHECK-SD-NEXT: mov w3, v4.s[3]
+; CHECK-SD-NEXT: mov w5, v0.s[1]
+; CHECK-SD-NEXT: mov w6, v0.s[2]
+; CHECK-SD-NEXT: fmov w0, s4
+; CHECK-SD-NEXT: fmov w4, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rotl_v7i32_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v0.s[0], w0
+; CHECK-GI-NEXT: mov v1.s[0], w0
+; CHECK-GI-NEXT: mov w8, #29 // =0x1d
+; CHECK-GI-NEXT: mov v2.s[0], w8
+; CHECK-GI-NEXT: mov w9, #3 // =0x3
+; CHECK-GI-NEXT: mov v3.s[0], w4
+; CHECK-GI-NEXT: mov v4.s[0], w9
+; CHECK-GI-NEXT: mov v5.s[0], w4
+; CHECK-GI-NEXT: mov v0.s[1], w1
+; CHECK-GI-NEXT: mov v1.s[1], w1
+; CHECK-GI-NEXT: mov v2.s[1], w8
+; CHECK-GI-NEXT: mov v3.s[1], w5
+; CHECK-GI-NEXT: mov v4.s[1], w9
+; CHECK-GI-NEXT: mov v5.s[1], w5
+; CHECK-GI-NEXT: mov v0.s[2], w2
+; CHECK-GI-NEXT: mov v1.s[2], w2
+; CHECK-GI-NEXT: mov v2.s[2], w8
+; CHECK-GI-NEXT: mov v3.s[2], w6
+; CHECK-GI-NEXT: mov v4.s[2], w9
+; CHECK-GI-NEXT: mov v5.s[2], w6
+; CHECK-GI-NEXT: mov v0.s[3], w3
+; CHECK-GI-NEXT: mov v1.s[3], w3
+; CHECK-GI-NEXT: neg v2.4s, v2.4s
+; CHECK-GI-NEXT: ushl v3.4s, v3.4s, v4.4s
+; CHECK-GI-NEXT: shl v0.4s, v0.4s, #3
+; CHECK-GI-NEXT: ushr v1.4s, v1.4s, #29
+; CHECK-GI-NEXT: ushl v2.4s, v5.4s, v2.4s
+; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: orr v1.16b, v3.16b, v2.16b
+; CHECK-GI-NEXT: mov s2, v0.s[1]
+; CHECK-GI-NEXT: mov s3, v0.s[2]
+; CHECK-GI-NEXT: mov s4, v0.s[3]
+; CHECK-GI-NEXT: mov s5, v1.s[1]
+; CHECK-GI-NEXT: mov s6, v1.s[2]
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: fmov w4, s1
+; CHECK-GI-NEXT: fmov w1, s2
+; CHECK-GI-NEXT: fmov w2, s3
+; CHECK-GI-NEXT: fmov w3, s4
+; CHECK-GI-NEXT: fmov w5, s5
+; CHECK-GI-NEXT: fmov w6, s6
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <7 x i32> @llvm.fshl(<7 x i32> %a, <7 x i32> %a, <7 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>)
+ ret <7 x i32> %d
+}
+
+define <7 x i32> @rotr_v7i32_c(<7 x i32> %a) {
+; CHECK-SD-LABEL: rotr_v7i32_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fmov s0, w0
+; CHECK-SD-NEXT: fmov s1, w4
+; CHECK-SD-NEXT: adrp x8, .LCPI109_0
+; CHECK-SD-NEXT: adrp x9, .LCPI109_1
+; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI109_0]
+; CHECK-SD-NEXT: ldr q3, [x9, :lo12:.LCPI109_1]
+; CHECK-SD-NEXT: mov v0.s[1], w1
+; CHECK-SD-NEXT: mov v1.s[1], w5
+; CHECK-SD-NEXT: mov v0.s[2], w2
+; CHECK-SD-NEXT: mov v1.s[2], w6
+; CHECK-SD-NEXT: mov v0.s[3], w3
+; CHECK-SD-NEXT: ushl v2.4s, v1.4s, v2.4s
+; CHECK-SD-NEXT: ushl v1.4s, v1.4s, v3.4s
+; CHECK-SD-NEXT: shl v4.4s, v0.4s, #29
+; CHECK-SD-NEXT: usra v4.4s, v0.4s, #3
+; CHECK-SD-NEXT: orr v0.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT: mov w1, v4.s[1]
+; CHECK-SD-NEXT: mov w2, v4.s[2]
+; CHECK-SD-NEXT: mov w3, v4.s[3]
+; CHECK-SD-NEXT: mov w5, v0.s[1]
+; CHECK-SD-NEXT: mov w6, v0.s[2]
+; CHECK-SD-NEXT: fmov w0, s4
+; CHECK-SD-NEXT: fmov w4, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rotr_v7i32_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v0.s[0], w0
+; CHECK-GI-NEXT: mov v1.s[0], w0
+; CHECK-GI-NEXT: mov w8, #3 // =0x3
+; CHECK-GI-NEXT: mov v2.s[0], w8
+; CHECK-GI-NEXT: mov w9, #29 // =0x1d
+; CHECK-GI-NEXT: mov v3.s[0], w4
+; CHECK-GI-NEXT: mov v4.s[0], w4
+; CHECK-GI-NEXT: mov v5.s[0], w9
+; CHECK-GI-NEXT: mov v0.s[1], w1
+; CHECK-GI-NEXT: mov v1.s[1], w1
+; CHECK-GI-NEXT: mov v2.s[1], w8
+; CHECK-GI-NEXT: mov v3.s[1], w5
+; CHECK-GI-NEXT: mov v4.s[1], w5
+; CHECK-GI-NEXT: mov v5.s[1], w9
+; CHECK-GI-NEXT: mov v0.s[2], w2
+; CHECK-GI-NEXT: mov v1.s[2], w2
+; CHECK-GI-NEXT: mov v2.s[2], w8
+; CHECK-GI-NEXT: mov v3.s[2], w6
+; CHECK-GI-NEXT: mov v4.s[2], w6
+; CHECK-GI-NEXT: mov v5.s[2], w9
+; CHECK-GI-NEXT: mov v0.s[3], w3
+; CHECK-GI-NEXT: mov v1.s[3], w3
+; CHECK-GI-NEXT: neg v2.4s, v2.4s
+; CHECK-GI-NEXT: ushl v4.4s, v4.4s, v5.4s
+; CHECK-GI-NEXT: ushr v0.4s, v0.4s, #3
+; CHECK-GI-NEXT: shl v1.4s, v1.4s, #29
+; CHECK-GI-NEXT: ushl v2.4s, v3.4s, v2.4s
+; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: orr v1.16b, v2.16b, v4.16b
+; CHECK-GI-NEXT: mov s2, v0.s[1]
+; CHECK-GI-NEXT: mov s3, v0.s[2]
+; CHECK-GI-NEXT: mov s4, v0.s[3]
+; CHECK-GI-NEXT: mov s5, v1.s[1]
+; CHECK-GI-NEXT: mov s6, v1.s[2]
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: fmov w4, s1
+; CHECK-GI-NEXT: fmov w1, s2
+; CHECK-GI-NEXT: fmov w2, s3
+; CHECK-GI-NEXT: fmov w3, s4
+; CHECK-GI-NEXT: fmov w5, s5
+; CHECK-GI-NEXT: fmov w6, s6
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <7 x i32> @llvm.fshr(<7 x i32> %a, <7 x i32> %a, <7 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>)
+ ret <7 x i32> %d
+}
+
+define <8 x i32> @rotl_v8i32_c(<8 x i32> %a) {
+; CHECK-SD-LABEL: rotl_v8i32_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: shl v2.4s, v0.4s, #3
+; CHECK-SD-NEXT: shl v3.4s, v1.4s, #3
+; CHECK-SD-NEXT: usra v2.4s, v0.4s, #29
+; CHECK-SD-NEXT: usra v3.4s, v1.4s, #29
+; CHECK-SD-NEXT: mov v0.16b, v2.16b
+; CHECK-SD-NEXT: mov v1.16b, v3.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rotl_v8i32_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: shl v2.4s, v0.4s, #3
+; CHECK-GI-NEXT: shl v3.4s, v1.4s, #3
+; CHECK-GI-NEXT: ushr v0.4s, v0.4s, #29
+; CHECK-GI-NEXT: ushr v1.4s, v1.4s, #29
+; CHECK-GI-NEXT: orr v0.16b, v2.16b, v0.16b
+; CHECK-GI-NEXT: orr v1.16b, v3.16b, v1.16b
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <8 x i32> @llvm.fshl(<8 x i32> %a, <8 x i32> %a, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>)
+ ret <8 x i32> %d
+}
+
+define <8 x i32> @rotr_v8i32_c(<8 x i32> %a) {
+; CHECK-SD-LABEL: rotr_v8i32_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: shl v2.4s, v0.4s, #29
+; CHECK-SD-NEXT: shl v3.4s, v1.4s, #29
+; CHECK-SD-NEXT: usra v2.4s, v0.4s, #3
+; CHECK-SD-NEXT: usra v3.4s, v1.4s, #3
+; CHECK-SD-NEXT: mov v0.16b, v2.16b
+; CHECK-SD-NEXT: mov v1.16b, v3.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rotr_v8i32_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ushr v2.4s, v0.4s, #3
+; CHECK-GI-NEXT: ushr v3.4s, v1.4s, #3
+; CHECK-GI-NEXT: shl v0.4s, v0.4s, #29
+; CHECK-GI-NEXT: shl v1.4s, v1.4s, #29
+; CHECK-GI-NEXT: orr v0.16b, v2.16b, v0.16b
+; CHECK-GI-NEXT: orr v1.16b, v3.16b, v1.16b
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <8 x i32> @llvm.fshr(<8 x i32> %a, <8 x i32> %a, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>)
+ ret <8 x i32> %d
+}
+
+define <2 x i64> @rotl_v2i64_c(<2 x i64> %a) {
+; CHECK-SD-LABEL: rotl_v2i64_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: shl v1.2d, v0.2d, #3
+; CHECK-SD-NEXT: usra v1.2d, v0.2d, #61
+; CHECK-SD-NEXT: mov v0.16b, v1.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rotl_v2i64_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: shl v1.2d, v0.2d, #3
+; CHECK-GI-NEXT: ushr v0.2d, v0.2d, #61
+; CHECK-GI-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <2 x i64> @llvm.fshl(<2 x i64> %a, <2 x i64> %a, <2 x i64> <i64 3, i64 3>)
+ ret <2 x i64> %d
+}
+
+define <2 x i64> @rotr_v2i64_c(<2 x i64> %a) {
+; CHECK-SD-LABEL: rotr_v2i64_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: shl v1.2d, v0.2d, #61
+; CHECK-SD-NEXT: usra v1.2d, v0.2d, #3
+; CHECK-SD-NEXT: mov v0.16b, v1.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rotr_v2i64_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ushr v1.2d, v0.2d, #3
+; CHECK-GI-NEXT: shl v0.2d, v0.2d, #61
+; CHECK-GI-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <2 x i64> @llvm.fshr(<2 x i64> %a, <2 x i64> %a, <2 x i64> <i64 3, i64 3>)
+ ret <2 x i64> %d
+}
+
+define <4 x i64> @rotl_v4i64_c(<4 x i64> %a) {
+; CHECK-SD-LABEL: rotl_v4i64_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: shl v2.2d, v0.2d, #3
+; CHECK-SD-NEXT: shl v3.2d, v1.2d, #3
+; CHECK-SD-NEXT: usra v2.2d, v0.2d, #61
+; CHECK-SD-NEXT: usra v3.2d, v1.2d, #61
+; CHECK-SD-NEXT: mov v0.16b, v2.16b
+; CHECK-SD-NEXT: mov v1.16b, v3.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rotl_v4i64_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: shl v2.2d, v0.2d, #3
+; CHECK-GI-NEXT: shl v3.2d, v1.2d, #3
+; CHECK-GI-NEXT: ushr v0.2d, v0.2d, #61
+; CHECK-GI-NEXT: ushr v1.2d, v1.2d, #61
+; CHECK-GI-NEXT: orr v0.16b, v2.16b, v0.16b
+; CHECK-GI-NEXT: orr v1.16b, v3.16b, v1.16b
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <4 x i64> @llvm.fshl(<4 x i64> %a, <4 x i64> %a, <4 x i64> <i64 3, i64 3, i64 3, i64 3>)
+ ret <4 x i64> %d
+}
+
+define <4 x i64> @rotr_v4i64_c(<4 x i64> %a) {
+; CHECK-SD-LABEL: rotr_v4i64_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: shl v2.2d, v0.2d, #61
+; CHECK-SD-NEXT: shl v3.2d, v1.2d, #61
+; CHECK-SD-NEXT: usra v2.2d, v0.2d, #3
+; CHECK-SD-NEXT: usra v3.2d, v1.2d, #3
+; CHECK-SD-NEXT: mov v0.16b, v2.16b
+; CHECK-SD-NEXT: mov v1.16b, v3.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rotr_v4i64_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ushr v2.2d, v0.2d, #3
+; CHECK-GI-NEXT: ushr v3.2d, v1.2d, #3
+; CHECK-GI-NEXT: shl v0.2d, v0.2d, #61
+; CHECK-GI-NEXT: shl v1.2d, v1.2d, #61
+; CHECK-GI-NEXT: orr v0.16b, v2.16b, v0.16b
+; CHECK-GI-NEXT: orr v1.16b, v3.16b, v1.16b
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <4 x i64> @llvm.fshr(<4 x i64> %a, <4 x i64> %a, <4 x i64> <i64 3, i64 3, i64 3, i64 3>)
+ ret <4 x i64> %d
+}
+
+define <2 x i128> @rotl_v2i128_c(<2 x i128> %a) {
+; CHECK-SD-LABEL: rotl_v2i128_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: extr x8, x2, x3, #61
+; CHECK-SD-NEXT: extr x9, x0, x1, #61
+; CHECK-SD-NEXT: extr x1, x1, x0, #61
+; CHECK-SD-NEXT: extr x3, x3, x2, #61
+; CHECK-SD-NEXT: mov x0, x9
+; CHECK-SD-NEXT: mov x2, x8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rotl_v2i128_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: lsr x8, x1, #61
+; CHECK-GI-NEXT: lsl x9, x1, #3
+; CHECK-GI-NEXT: lsl x10, x3, #3
+; CHECK-GI-NEXT: lsr x11, x3, #61
+; CHECK-GI-NEXT: orr x8, x8, x0, lsl #3
+; CHECK-GI-NEXT: orr x1, x9, x0, lsr #61
+; CHECK-GI-NEXT: orr x3, x10, x2, lsr #61
+; CHECK-GI-NEXT: orr x2, x11, x2, lsl #3
+; CHECK-GI-NEXT: mov x0, x8
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <2 x i128> @llvm.fshl(<2 x i128> %a, <2 x i128> %a, <2 x i128> <i128 3, i128 3>)
+ ret <2 x i128> %d
+}
+
+define <2 x i128> @rotr_v2i128_c(<2 x i128> %a) {
+; CHECK-SD-LABEL: rotr_v2i128_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: extr x8, x3, x2, #3
+; CHECK-SD-NEXT: extr x9, x1, x0, #3
+; CHECK-SD-NEXT: extr x1, x0, x1, #3
+; CHECK-SD-NEXT: extr x3, x2, x3, #3
+; CHECK-SD-NEXT: mov x0, x9
+; CHECK-SD-NEXT: mov x2, x8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: rotr_v2i128_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: lsl x8, x1, #61
+; CHECK-GI-NEXT: lsl x9, x3, #61
+; CHECK-GI-NEXT: lsl x10, x0, #61
+; CHECK-GI-NEXT: lsl x11, x2, #61
+; CHECK-GI-NEXT: orr x0, x8, x0, lsr #3
+; CHECK-GI-NEXT: orr x2, x9, x2, lsr #3
+; CHECK-GI-NEXT: orr x1, x10, x1, lsr #3
+; CHECK-GI-NEXT: orr x3, x11, x3, lsr #3
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <2 x i128> @llvm.fshr(<2 x i128> %a, <2 x i128> %a, <2 x i128> <i128 3, i128 3>)
+ ret <2 x i128> %d
+}
+
+define <8 x i8> @fshl_v8i8_c(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-SD-LABEL: fshl_v8i8_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: ushr v1.8b, v1.8b, #1
+; CHECK-SD-NEXT: shl v0.8b, v0.8b, #3
+; CHECK-SD-NEXT: usra v0.8b, v1.8b, #4
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fshl_v8i8_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: shl v0.8b, v0.8b, #3
+; CHECK-GI-NEXT: ushr v1.8b, v1.8b, #5
+; CHECK-GI-NEXT: orr v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <8 x i8> @llvm.fshl(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+ ret <8 x i8> %d
+}
+
+define <8 x i8> @fshr_v8i8_c(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-SD-LABEL: fshr_v8i8_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: add v0.8b, v0.8b, v0.8b
+; CHECK-SD-NEXT: shl v0.8b, v0.8b, #4
+; CHECK-SD-NEXT: usra v0.8b, v1.8b, #3
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fshr_v8i8_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: shl v0.8b, v0.8b, #5
+; CHECK-GI-NEXT: ushr v1.8b, v1.8b, #3
+; CHECK-GI-NEXT: orr v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <8 x i8> @llvm.fshr(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+ ret <8 x i8> %d
+}
+
+define <16 x i8> @fshl_v16i8_c(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-SD-LABEL: fshl_v16i8_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: ushr v1.16b, v1.16b, #1
+; CHECK-SD-NEXT: shl v0.16b, v0.16b, #3
+; CHECK-SD-NEXT: usra v0.16b, v1.16b, #4
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fshl_v16i8_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: shl v0.16b, v0.16b, #3
+; CHECK-GI-NEXT: ushr v1.16b, v1.16b, #5
+; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <16 x i8> @llvm.fshl(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+ ret <16 x i8> %d
+}
+
+define <16 x i8> @fshr_v16i8_c(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-SD-LABEL: fshr_v16i8_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: add v0.16b, v0.16b, v0.16b
+; CHECK-SD-NEXT: shl v0.16b, v0.16b, #4
+; CHECK-SD-NEXT: usra v0.16b, v1.16b, #3
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fshr_v16i8_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: shl v0.16b, v0.16b, #5
+; CHECK-GI-NEXT: ushr v1.16b, v1.16b, #3
+; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <16 x i8> @llvm.fshr(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+ ret <16 x i8> %d
+}
+
+define <4 x i16> @fshl_v4i16_c(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-SD-LABEL: fshl_v4i16_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: ushr v1.4h, v1.4h, #1
+; CHECK-SD-NEXT: shl v0.4h, v0.4h, #3
+; CHECK-SD-NEXT: usra v0.4h, v1.4h, #12
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fshl_v4i16_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: shl v0.4h, v0.4h, #3
+; CHECK-GI-NEXT: ushr v1.4h, v1.4h, #13
+; CHECK-GI-NEXT: orr v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <4 x i16> @llvm.fshl(<4 x i16> %a, <4 x i16> %b, <4 x i16> <i16 3, i16 3, i16 3, i16 3>)
+ ret <4 x i16> %d
+}
+
+define <4 x i16> @fshr_v4i16_c(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-SD-LABEL: fshr_v4i16_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: add v0.4h, v0.4h, v0.4h
+; CHECK-SD-NEXT: shl v0.4h, v0.4h, #12
+; CHECK-SD-NEXT: usra v0.4h, v1.4h, #3
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fshr_v4i16_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: shl v0.4h, v0.4h, #13
+; CHECK-GI-NEXT: ushr v1.4h, v1.4h, #3
+; CHECK-GI-NEXT: orr v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT: ret
+entry:
+ %d = call <4 x i16> @llvm.fshr(<4 x i16> %a, <4 x i16> %b, <4 x i16> <i16 3, i16 3, i16 3, i16 3>)
+ ret <4 x i16> %d
+}
+
+define <7 x i16> @fshl_v7i16_c(<7 x i16> %a, <7 x i16> %b) {
+; CHECK-SD-LABEL: fshl_v7i16_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: adrp x8, .LCPI124_0
+; CHECK-SD-NEXT: ushr v1.8h, v1.8h, #1
+; CHECK-SD-NEXT: adrp x9, .LCPI124_1
+; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI124_0]
+; CHECK-SD-NEXT: ldr q3, [x9, :lo12:.LCPI124_1]
+; CHECK-SD-NEXT: ushl v1.8h, v1.8h, v2.8h
+; CHECK-SD-NEXT: ushl v0.8h, v0.8h, v3.8h
+; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fshl_v7i16_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov w8, #13 // =0xd
+; CHECK-GI-NEXT: mov w9, #3 // =0x3
+; CHECK-GI-NEXT: fmov s2, w8
+; CHECK-GI-NEXT: fmov s3, w9
+; CHECK-GI-NEXT: mov v2.h[1], w8
+; CHECK-GI-NEXT: mov v3.h[1], w9
+; CHECK-GI-NEXT: mov v2.h[2], w8
+; CHECK-GI-NEXT: mov v3.h[2], w9
+; CHECK-GI-NEXT: mov v2.h[3], w8
+; CHECK-GI-NEXT: mov v3.h[3], w9
+; CHECK-GI-NEXT: mov v2.h[4], w8
+; CHECK-GI-NEXT: mov v3.h[4], w9
+; CHECK-GI-NEXT: mov v2.h[5], w8
+; CHECK-GI-NEXT: mov v3.h[5], w9
+; CHECK-GI-NEXT: mov v2.h[6], w8
+; CHECK-GI-NEXT: mov v3.h[6], w9
+; CHECK-GI-NEXT: neg v2.8h, v2.8h
+; CHECK-GI-NEXT: ushl v0.8h, v0.8h, v3.8h
+; CHECK-GI-NEXT: ushl v1.8h, v1.8h, v2.8h
+; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
+entry: + %d = call <7 x i16> @llvm.fshl(<7 x i16> %a, <7 x i16> %b, <7 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>) + ret <7 x i16> %d +} + +define <7 x i16> @fshr_v7i16_c(<7 x i16> %a, <7 x i16> %b) { +; CHECK-SD-LABEL: fshr_v7i16_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: adrp x8, .LCPI125_0 +; CHECK-SD-NEXT: adrp x9, .LCPI125_1 +; CHECK-SD-NEXT: add v0.8h, v0.8h, v0.8h +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI125_0] +; CHECK-SD-NEXT: ldr q3, [x9, :lo12:.LCPI125_1] +; CHECK-SD-NEXT: ushl v1.8h, v1.8h, v2.8h +; CHECK-SD-NEXT: ushl v0.8h, v0.8h, v3.8h +; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fshr_v7i16_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov w8, #3 // =0x3 +; CHECK-GI-NEXT: mov w9, #13 // =0xd +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: fmov s3, w9 +; CHECK-GI-NEXT: mov v2.h[1], w8 +; CHECK-GI-NEXT: mov v3.h[1], w9 +; CHECK-GI-NEXT: mov v2.h[2], w8 +; CHECK-GI-NEXT: mov v3.h[2], w9 +; CHECK-GI-NEXT: mov v2.h[3], w8 +; CHECK-GI-NEXT: mov v3.h[3], w9 +; CHECK-GI-NEXT: mov v2.h[4], w8 +; CHECK-GI-NEXT: mov v3.h[4], w9 +; CHECK-GI-NEXT: mov v2.h[5], w8 +; CHECK-GI-NEXT: mov v3.h[5], w9 +; CHECK-GI-NEXT: mov v2.h[6], w8 +; CHECK-GI-NEXT: mov v3.h[6], w9 +; CHECK-GI-NEXT: neg v2.8h, v2.8h +; CHECK-GI-NEXT: ushl v0.8h, v0.8h, v3.8h +; CHECK-GI-NEXT: ushl v1.8h, v1.8h, v2.8h +; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: ret +entry: + %d = call <7 x i16> @llvm.fshr(<7 x i16> %a, <7 x i16> %b, <7 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>) + ret <7 x i16> %d +} + +define <8 x i16> @fshl_v8i16_c(<8 x i16> %a, <8 x i16> %b) { +; CHECK-SD-LABEL: fshl_v8i16_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ushr v1.8h, v1.8h, #1 +; CHECK-SD-NEXT: shl v0.8h, v0.8h, #3 +; CHECK-SD-NEXT: usra v0.8h, v1.8h, #12 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fshl_v8i16_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: shl v0.8h, v0.8h, #3 +; CHECK-GI-NEXT: ushr v1.8h, v1.8h, #13 +; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: ret +entry: + %d = call <8 x i16> @llvm.fshl(<8 x i16> %a, <8 x i16> %b, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>) + ret <8 x i16> %d +} + +define <8 x i16> @fshr_v8i16_c(<8 x i16> %a, <8 x i16> %b) { +; CHECK-SD-LABEL: fshr_v8i16_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: add v0.8h, v0.8h, v0.8h +; CHECK-SD-NEXT: shl v0.8h, v0.8h, #12 +; CHECK-SD-NEXT: usra v0.8h, v1.8h, #3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fshr_v8i16_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: shl v0.8h, v0.8h, #13 +; CHECK-GI-NEXT: ushr v1.8h, v1.8h, #3 +; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: ret +entry: + %d = call <8 x i16> @llvm.fshr(<8 x i16> %a, <8 x i16> %b, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>) + ret <8 x i16> %d +} + +define <16 x i16> @fshl_v16i16_c(<16 x i16> %a, <16 x i16> %b) { +; CHECK-SD-LABEL: fshl_v16i16_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ushr v2.8h, v2.8h, #1 +; CHECK-SD-NEXT: shl v0.8h, v0.8h, #3 +; CHECK-SD-NEXT: ushr v3.8h, v3.8h, #1 +; CHECK-SD-NEXT: shl v1.8h, v1.8h, #3 +; CHECK-SD-NEXT: usra v0.8h, v2.8h, #12 +; CHECK-SD-NEXT: usra v1.8h, v3.8h, #12 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fshl_v16i16_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: shl v0.8h, v0.8h, #3 +; CHECK-GI-NEXT: shl v1.8h, v1.8h, #3 +; CHECK-GI-NEXT: ushr v2.8h, v2.8h, #13 +; CHECK-GI-NEXT: ushr v3.8h, v3.8h, #13 +; CHECK-GI-NEXT: orr 
v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: orr v1.16b, v1.16b, v3.16b +; CHECK-GI-NEXT: ret +entry: + %d = call <16 x i16> @llvm.fshl(<16 x i16> %a, <16 x i16> %b, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>) + ret <16 x i16> %d +} + +define <16 x i16> @fshr_v16i16_c(<16 x i16> %a, <16 x i16> %b) { +; CHECK-SD-LABEL: fshr_v16i16_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: add v1.8h, v1.8h, v1.8h +; CHECK-SD-NEXT: add v0.8h, v0.8h, v0.8h +; CHECK-SD-NEXT: shl v1.8h, v1.8h, #12 +; CHECK-SD-NEXT: shl v0.8h, v0.8h, #12 +; CHECK-SD-NEXT: usra v1.8h, v3.8h, #3 +; CHECK-SD-NEXT: usra v0.8h, v2.8h, #3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fshr_v16i16_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: shl v0.8h, v0.8h, #13 +; CHECK-GI-NEXT: shl v1.8h, v1.8h, #13 +; CHECK-GI-NEXT: ushr v2.8h, v2.8h, #3 +; CHECK-GI-NEXT: ushr v3.8h, v3.8h, #3 +; CHECK-GI-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: orr v1.16b, v1.16b, v3.16b +; CHECK-GI-NEXT: ret +entry: + %d = call <16 x i16> @llvm.fshr(<16 x i16> %a, <16 x i16> %b, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>) + ret <16 x i16> %d +} + +define <2 x i32> @fshl_v2i32_c(<2 x i32> %a, <2 x i32> %b) { +; CHECK-SD-LABEL: fshl_v2i32_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: shl v0.2s, v0.2s, #3 +; CHECK-SD-NEXT: usra v0.2s, v1.2s, #29 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fshl_v2i32_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: shl v0.2s, v0.2s, #3 +; CHECK-GI-NEXT: ushr v1.2s, v1.2s, #29 +; CHECK-GI-NEXT: orr v0.8b, v0.8b, v1.8b +; CHECK-GI-NEXT: ret +entry: + %d = call <2 x i32> @llvm.fshl(<2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 3, i32 3>) + ret <2 x i32> %d +} + +define <2 x i32> @fshr_v2i32_c(<2 x i32> %a, <2 x i32> %b) { +; CHECK-SD-LABEL: fshr_v2i32_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: shl v0.2s, v0.2s, #29 +; CHECK-SD-NEXT: usra v0.2s, v1.2s, #3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fshr_v2i32_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: shl v0.2s, v0.2s, #29 +; CHECK-GI-NEXT: ushr v1.2s, v1.2s, #3 +; CHECK-GI-NEXT: orr v0.8b, v0.8b, v1.8b +; CHECK-GI-NEXT: ret +entry: + %d = call <2 x i32> @llvm.fshr(<2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 3, i32 3>) + ret <2 x i32> %d +} + +define <4 x i32> @fshl_v4i32_c(<4 x i32> %a, <4 x i32> %b) { +; CHECK-SD-LABEL: fshl_v4i32_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: shl v0.4s, v0.4s, #3 +; CHECK-SD-NEXT: usra v0.4s, v1.4s, #29 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fshl_v4i32_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: shl v0.4s, v0.4s, #3 +; CHECK-GI-NEXT: ushr v1.4s, v1.4s, #29 +; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: ret +entry: + %d = call <4 x i32> @llvm.fshl(<4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>) + ret <4 x i32> %d +} + +define <4 x i32> @fshr_v4i32_c(<4 x i32> %a, <4 x i32> %b) { +; CHECK-SD-LABEL: fshr_v4i32_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: shl v0.4s, v0.4s, #29 +; CHECK-SD-NEXT: usra v0.4s, v1.4s, #3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fshr_v4i32_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: shl v0.4s, v0.4s, #29 +; CHECK-GI-NEXT: ushr v1.4s, v1.4s, #3 +; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: ret +entry: + %d = call <4 x i32> @llvm.fshr(<4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>) + ret <4 x i32> %d +} 
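(All of the `_c` tests in this file call `llvm.fshl`/`llvm.fshr` with a constant per-lane shift amount of 3, and the `rot*_c` variants pass the same value for both data operands, which turns the funnel shift into a rotate. As a reading aid for the CHECK lines, below is a minimal C model of the LangRef semantics of these intrinsics for one 32-bit lane; the helper names `fshl32`/`fshr32` and the `main` checks are illustrative only and are not part of the test or of any LLVM API.

#include <assert.h>
#include <stdint.h>

/* Reference model for llvm.fshl / llvm.fshr on a single 32-bit lane.
   Conceptually the concatenation a:b is shifted by the amount modulo
   the bit width; fshl keeps the high 32 bits, fshr keeps the low 32. */
static uint32_t fshl32(uint32_t a, uint32_t b, uint32_t c) {
  c &= 31; /* shift amount is taken modulo the element width */
  return c ? (a << c) | (b >> (32 - c)) : a;
}

static uint32_t fshr32(uint32_t a, uint32_t b, uint32_t c) {
  c &= 31;
  return c ? (b >> c) | (a << (32 - c)) : b;
}

int main(void) {
  uint32_t x = 0x12345678u;
  /* A rotate is a funnel shift with both data operands equal, which is
     why the rot*_c tests lower to a shl/ushr (or shl+usra) pair:
     rotl(x, 3) == (x << 3) | (x >> 29). */
  assert(fshl32(x, x, 3) == ((x << 3) | (x >> 29)));
  assert(fshr32(x, x, 3) == ((x >> 3) | (x << 29)));
  return 0;
}

This also accounts for the difference between the two check prefixes visible above: both lower the constant-amount cases to a pair of shifts, but the SelectionDAG output (CHECK-SD) folds the final `orr` into `usra` (unsigned shift right and accumulate), which is valid because the two shifted bit ranges are disjoint, while the GlobalISel output (CHECK-GI) keeps an explicit `shl`/`ushr`/`orr` sequence.)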
+ +define <7 x i32> @fshl_v7i32_c(<7 x i32> %a, <7 x i32> %b) { +; CHECK-SD-LABEL: fshl_v7i32_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: fmov s2, w4 +; CHECK-SD-NEXT: ldr s1, [sp, #24] +; CHECK-SD-NEXT: fmov s3, w7 +; CHECK-SD-NEXT: mov x8, sp +; CHECK-SD-NEXT: add x9, sp, #32 +; CHECK-SD-NEXT: ld1 { v1.s }[1], [x9] +; CHECK-SD-NEXT: add x9, sp, #40 +; CHECK-SD-NEXT: adrp x10, .LCPI134_1 +; CHECK-SD-NEXT: mov v0.s[1], w1 +; CHECK-SD-NEXT: mov v2.s[1], w5 +; CHECK-SD-NEXT: ldr q5, [x10, :lo12:.LCPI134_1] +; CHECK-SD-NEXT: ld1 { v3.s }[1], [x8] +; CHECK-SD-NEXT: add x8, sp, #8 +; CHECK-SD-NEXT: ld1 { v1.s }[2], [x9] +; CHECK-SD-NEXT: add x9, sp, #16 +; CHECK-SD-NEXT: mov v0.s[2], w2 +; CHECK-SD-NEXT: mov v2.s[2], w6 +; CHECK-SD-NEXT: ld1 { v3.s }[2], [x8] +; CHECK-SD-NEXT: adrp x8, .LCPI134_0 +; CHECK-SD-NEXT: ldr q4, [x8, :lo12:.LCPI134_0] +; CHECK-SD-NEXT: ld1 { v3.s }[3], [x9] +; CHECK-SD-NEXT: mov v0.s[3], w3 +; CHECK-SD-NEXT: ushl v1.4s, v1.4s, v4.4s +; CHECK-SD-NEXT: ushl v2.4s, v2.4s, v5.4s +; CHECK-SD-NEXT: orr v1.16b, v2.16b, v1.16b +; CHECK-SD-NEXT: shl v0.4s, v0.4s, #3 +; CHECK-SD-NEXT: mov w5, v1.s[1] +; CHECK-SD-NEXT: mov w6, v1.s[2] +; CHECK-SD-NEXT: fmov w4, s1 +; CHECK-SD-NEXT: usra v0.4s, v3.4s, #29 +; CHECK-SD-NEXT: mov w1, v0.s[1] +; CHECK-SD-NEXT: mov w2, v0.s[2] +; CHECK-SD-NEXT: mov w3, v0.s[3] +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fshl_v7i32_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v0.s[0], w0 +; CHECK-GI-NEXT: mov v1.s[0], w7 +; CHECK-GI-NEXT: mov w8, #29 // =0x1d +; CHECK-GI-NEXT: mov v3.s[0], w8 +; CHECK-GI-NEXT: ldr s2, [sp] +; CHECK-GI-NEXT: mov w9, #3 // =0x3 +; CHECK-GI-NEXT: mov v4.s[0], w9 +; CHECK-GI-NEXT: ldr s5, [sp, #8] +; CHECK-GI-NEXT: ldr s6, [sp, #24] +; CHECK-GI-NEXT: ldr s7, [sp, #32] +; CHECK-GI-NEXT: mov v0.s[1], w1 +; CHECK-GI-NEXT: mov v1.s[1], v2.s[0] +; CHECK-GI-NEXT: mov v2.s[0], w4 +; CHECK-GI-NEXT: mov v3.s[1], w8 +; CHECK-GI-NEXT: mov v6.s[1], v7.s[0] +; CHECK-GI-NEXT: ldr s7, [sp, #40] +; CHECK-GI-NEXT: mov v4.s[1], w9 +; CHECK-GI-NEXT: mov v0.s[2], w2 +; CHECK-GI-NEXT: mov v1.s[2], v5.s[0] +; CHECK-GI-NEXT: mov v2.s[1], w5 +; CHECK-GI-NEXT: mov v3.s[2], w8 +; CHECK-GI-NEXT: ldr s5, [sp, #16] +; CHECK-GI-NEXT: mov v6.s[2], v7.s[0] +; CHECK-GI-NEXT: mov v4.s[2], w9 +; CHECK-GI-NEXT: mov v0.s[3], w3 +; CHECK-GI-NEXT: mov v1.s[3], v5.s[0] +; CHECK-GI-NEXT: mov v2.s[2], w6 +; CHECK-GI-NEXT: neg v3.4s, v3.4s +; CHECK-GI-NEXT: shl v0.4s, v0.4s, #3 +; CHECK-GI-NEXT: ushr v1.4s, v1.4s, #29 +; CHECK-GI-NEXT: ushl v2.4s, v2.4s, v4.4s +; CHECK-GI-NEXT: ushl v3.4s, v6.4s, v3.4s +; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: orr v1.16b, v2.16b, v3.16b +; CHECK-GI-NEXT: mov s2, v0.s[1] +; CHECK-GI-NEXT: mov s3, v0.s[2] +; CHECK-GI-NEXT: mov s4, v0.s[3] +; CHECK-GI-NEXT: mov s5, v1.s[1] +; CHECK-GI-NEXT: mov s6, v1.s[2] +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: fmov w4, s1 +; CHECK-GI-NEXT: fmov w1, s2 +; CHECK-GI-NEXT: fmov w2, s3 +; CHECK-GI-NEXT: fmov w3, s4 +; CHECK-GI-NEXT: fmov w5, s5 +; CHECK-GI-NEXT: fmov w6, s6 +; CHECK-GI-NEXT: ret +entry: + %d = call <7 x i32> @llvm.fshl(<7 x i32> %a, <7 x i32> %b, <7 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>) + ret <7 x i32> %d +} + +define <7 x i32> @fshr_v7i32_c(<7 x i32> %a, <7 x i32> %b) { +; CHECK-SD-LABEL: fshr_v7i32_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: fmov s2, w4 +; CHECK-SD-NEXT: ldr s1, [sp, #24] +; CHECK-SD-NEXT: fmov s3, 
w7 +; CHECK-SD-NEXT: mov x8, sp +; CHECK-SD-NEXT: add x9, sp, #32 +; CHECK-SD-NEXT: ld1 { v1.s }[1], [x9] +; CHECK-SD-NEXT: add x9, sp, #40 +; CHECK-SD-NEXT: adrp x10, .LCPI135_1 +; CHECK-SD-NEXT: mov v0.s[1], w1 +; CHECK-SD-NEXT: mov v2.s[1], w5 +; CHECK-SD-NEXT: ldr q5, [x10, :lo12:.LCPI135_1] +; CHECK-SD-NEXT: ld1 { v3.s }[1], [x8] +; CHECK-SD-NEXT: add x8, sp, #8 +; CHECK-SD-NEXT: ld1 { v1.s }[2], [x9] +; CHECK-SD-NEXT: add x9, sp, #16 +; CHECK-SD-NEXT: mov v0.s[2], w2 +; CHECK-SD-NEXT: mov v2.s[2], w6 +; CHECK-SD-NEXT: ld1 { v3.s }[2], [x8] +; CHECK-SD-NEXT: adrp x8, .LCPI135_0 +; CHECK-SD-NEXT: ldr q4, [x8, :lo12:.LCPI135_0] +; CHECK-SD-NEXT: ld1 { v3.s }[3], [x9] +; CHECK-SD-NEXT: mov v0.s[3], w3 +; CHECK-SD-NEXT: ushl v1.4s, v1.4s, v4.4s +; CHECK-SD-NEXT: ushl v2.4s, v2.4s, v5.4s +; CHECK-SD-NEXT: orr v1.16b, v2.16b, v1.16b +; CHECK-SD-NEXT: shl v0.4s, v0.4s, #29 +; CHECK-SD-NEXT: mov w5, v1.s[1] +; CHECK-SD-NEXT: mov w6, v1.s[2] +; CHECK-SD-NEXT: fmov w4, s1 +; CHECK-SD-NEXT: usra v0.4s, v3.4s, #3 +; CHECK-SD-NEXT: mov w1, v0.s[1] +; CHECK-SD-NEXT: mov w2, v0.s[2] +; CHECK-SD-NEXT: mov w3, v0.s[3] +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fshr_v7i32_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v0.s[0], w0 +; CHECK-GI-NEXT: mov v1.s[0], w7 +; CHECK-GI-NEXT: mov w8, #3 // =0x3 +; CHECK-GI-NEXT: mov v3.s[0], w8 +; CHECK-GI-NEXT: ldr s2, [sp] +; CHECK-GI-NEXT: mov w9, #29 // =0x1d +; CHECK-GI-NEXT: mov v4.s[0], w9 +; CHECK-GI-NEXT: ldr s5, [sp, #8] +; CHECK-GI-NEXT: ldr s6, [sp, #24] +; CHECK-GI-NEXT: ldr s7, [sp, #32] +; CHECK-GI-NEXT: mov v0.s[1], w1 +; CHECK-GI-NEXT: mov v1.s[1], v2.s[0] +; CHECK-GI-NEXT: mov v2.s[0], w4 +; CHECK-GI-NEXT: mov v3.s[1], w8 +; CHECK-GI-NEXT: mov v6.s[1], v7.s[0] +; CHECK-GI-NEXT: ldr s7, [sp, #40] +; CHECK-GI-NEXT: mov v4.s[1], w9 +; CHECK-GI-NEXT: mov v0.s[2], w2 +; CHECK-GI-NEXT: mov v1.s[2], v5.s[0] +; CHECK-GI-NEXT: mov v2.s[1], w5 +; CHECK-GI-NEXT: mov v3.s[2], w8 +; CHECK-GI-NEXT: ldr s5, [sp, #16] +; CHECK-GI-NEXT: mov v6.s[2], v7.s[0] +; CHECK-GI-NEXT: mov v4.s[2], w9 +; CHECK-GI-NEXT: mov v0.s[3], w3 +; CHECK-GI-NEXT: mov v1.s[3], v5.s[0] +; CHECK-GI-NEXT: mov v2.s[2], w6 +; CHECK-GI-NEXT: neg v3.4s, v3.4s +; CHECK-GI-NEXT: shl v0.4s, v0.4s, #29 +; CHECK-GI-NEXT: ushr v1.4s, v1.4s, #3 +; CHECK-GI-NEXT: ushl v2.4s, v2.4s, v4.4s +; CHECK-GI-NEXT: ushl v3.4s, v6.4s, v3.4s +; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: orr v1.16b, v2.16b, v3.16b +; CHECK-GI-NEXT: mov s2, v0.s[1] +; CHECK-GI-NEXT: mov s3, v0.s[2] +; CHECK-GI-NEXT: mov s4, v0.s[3] +; CHECK-GI-NEXT: mov s5, v1.s[1] +; CHECK-GI-NEXT: mov s6, v1.s[2] +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: fmov w4, s1 +; CHECK-GI-NEXT: fmov w1, s2 +; CHECK-GI-NEXT: fmov w2, s3 +; CHECK-GI-NEXT: fmov w3, s4 +; CHECK-GI-NEXT: fmov w5, s5 +; CHECK-GI-NEXT: fmov w6, s6 +; CHECK-GI-NEXT: ret +entry: + %d = call <7 x i32> @llvm.fshr(<7 x i32> %a, <7 x i32> %b, <7 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>) + ret <7 x i32> %d +} + +define <8 x i32> @fshl_v8i32_c(<8 x i32> %a, <8 x i32> %b) { +; CHECK-SD-LABEL: fshl_v8i32_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: shl v1.4s, v1.4s, #3 +; CHECK-SD-NEXT: shl v0.4s, v0.4s, #3 +; CHECK-SD-NEXT: usra v1.4s, v3.4s, #29 +; CHECK-SD-NEXT: usra v0.4s, v2.4s, #29 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fshl_v8i32_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: shl v0.4s, v0.4s, #3 +; CHECK-GI-NEXT: shl v1.4s, v1.4s, #3 +; CHECK-GI-NEXT: ushr v2.4s, v2.4s, #29 +; 
CHECK-GI-NEXT: ushr v3.4s, v3.4s, #29 +; CHECK-GI-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: orr v1.16b, v1.16b, v3.16b +; CHECK-GI-NEXT: ret +entry: + %d = call <8 x i32> @llvm.fshl(<8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>) + ret <8 x i32> %d +} + +define <8 x i32> @fshr_v8i32_c(<8 x i32> %a, <8 x i32> %b) { +; CHECK-SD-LABEL: fshr_v8i32_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: shl v1.4s, v1.4s, #29 +; CHECK-SD-NEXT: shl v0.4s, v0.4s, #29 +; CHECK-SD-NEXT: usra v1.4s, v3.4s, #3 +; CHECK-SD-NEXT: usra v0.4s, v2.4s, #3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fshr_v8i32_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: shl v0.4s, v0.4s, #29 +; CHECK-GI-NEXT: shl v1.4s, v1.4s, #29 +; CHECK-GI-NEXT: ushr v2.4s, v2.4s, #3 +; CHECK-GI-NEXT: ushr v3.4s, v3.4s, #3 +; CHECK-GI-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: orr v1.16b, v1.16b, v3.16b +; CHECK-GI-NEXT: ret +entry: + %d = call <8 x i32> @llvm.fshr(<8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>) + ret <8 x i32> %d +} + +define <2 x i64> @fshl_v2i64_c(<2 x i64> %a, <2 x i64> %b) { +; CHECK-SD-LABEL: fshl_v2i64_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: shl v0.2d, v0.2d, #3 +; CHECK-SD-NEXT: usra v0.2d, v1.2d, #61 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fshl_v2i64_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: shl v0.2d, v0.2d, #3 +; CHECK-GI-NEXT: ushr v1.2d, v1.2d, #61 +; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: ret +entry: + %d = call <2 x i64> @llvm.fshl(<2 x i64> %a, <2 x i64> %b, <2 x i64> <i64 3, i64 3>) + ret <2 x i64> %d +} + +define <2 x i64> @fshr_v2i64_c(<2 x i64> %a, <2 x i64> %b) { +; CHECK-SD-LABEL: fshr_v2i64_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: shl v0.2d, v0.2d, #61 +; CHECK-SD-NEXT: usra v0.2d, v1.2d, #3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fshr_v2i64_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: shl v0.2d, v0.2d, #61 +; CHECK-GI-NEXT: ushr v1.2d, v1.2d, #3 +; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: ret +entry: + %d = call <2 x i64> @llvm.fshr(<2 x i64> %a, <2 x i64> %b, <2 x i64> <i64 3, i64 3>) + ret <2 x i64> %d +} + +define <4 x i64> @fshl_v4i64_c(<4 x i64> %a, <4 x i64> %b) { +; CHECK-SD-LABEL: fshl_v4i64_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: shl v1.2d, v1.2d, #3 +; CHECK-SD-NEXT: shl v0.2d, v0.2d, #3 +; CHECK-SD-NEXT: usra v1.2d, v3.2d, #61 +; CHECK-SD-NEXT: usra v0.2d, v2.2d, #61 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fshl_v4i64_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: shl v0.2d, v0.2d, #3 +; CHECK-GI-NEXT: shl v1.2d, v1.2d, #3 +; CHECK-GI-NEXT: ushr v2.2d, v2.2d, #61 +; CHECK-GI-NEXT: ushr v3.2d, v3.2d, #61 +; CHECK-GI-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: orr v1.16b, v1.16b, v3.16b +; CHECK-GI-NEXT: ret +entry: + %d = call <4 x i64> @llvm.fshl(<4 x i64> %a, <4 x i64> %b, <4 x i64> <i64 3, i64 3, i64 3, i64 3>) + ret <4 x i64> %d +} + +define <4 x i64> @fshr_v4i64_c(<4 x i64> %a, <4 x i64> %b) { +; CHECK-SD-LABEL: fshr_v4i64_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: shl v1.2d, v1.2d, #61 +; CHECK-SD-NEXT: shl v0.2d, v0.2d, #61 +; CHECK-SD-NEXT: usra v1.2d, v3.2d, #3 +; CHECK-SD-NEXT: usra v0.2d, v2.2d, #3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fshr_v4i64_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: shl v0.2d, v0.2d, #61 +; CHECK-GI-NEXT: shl v1.2d, v1.2d, #61 +; CHECK-GI-NEXT: ushr v2.2d, v2.2d, #3 +; CHECK-GI-NEXT: 
ushr v3.2d, v3.2d, #3 +; CHECK-GI-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: orr v1.16b, v1.16b, v3.16b +; CHECK-GI-NEXT: ret +entry: + %d = call <4 x i64> @llvm.fshr(<4 x i64> %a, <4 x i64> %b, <4 x i64> <i64 3, i64 3, i64 3, i64 3>) + ret <4 x i64> %d +} + +define <2 x i128> @fshl_v2i128_c(<2 x i128> %a, <2 x i128> %b) { +; CHECK-SD-LABEL: fshl_v2i128_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: extr x8, x0, x5, #61 +; CHECK-SD-NEXT: extr x9, x2, x7, #61 +; CHECK-SD-NEXT: extr x1, x1, x0, #61 +; CHECK-SD-NEXT: extr x3, x3, x2, #61 +; CHECK-SD-NEXT: mov x0, x8 +; CHECK-SD-NEXT: mov x2, x9 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fshl_v2i128_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: lsr x8, x5, #61 +; CHECK-GI-NEXT: lsl x9, x1, #3 +; CHECK-GI-NEXT: lsl x10, x3, #3 +; CHECK-GI-NEXT: lsr x11, x7, #61 +; CHECK-GI-NEXT: orr x8, x8, x0, lsl #3 +; CHECK-GI-NEXT: orr x1, x9, x0, lsr #61 +; CHECK-GI-NEXT: orr x3, x10, x2, lsr #61 +; CHECK-GI-NEXT: orr x2, x11, x2, lsl #3 +; CHECK-GI-NEXT: mov x0, x8 +; CHECK-GI-NEXT: ret +entry: + %d = call <2 x i128> @llvm.fshl(<2 x i128> %a, <2 x i128> %b, <2 x i128> <i128 3, i128 3>) + ret <2 x i128> %d +} + +define <2 x i128> @fshr_v2i128_c(<2 x i128> %a, <2 x i128> %b) { +; CHECK-SD-LABEL: fshr_v2i128_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: extr x8, x5, x4, #3 +; CHECK-SD-NEXT: extr x9, x7, x6, #3 +; CHECK-SD-NEXT: extr x1, x0, x5, #3 +; CHECK-SD-NEXT: extr x3, x2, x7, #3 +; CHECK-SD-NEXT: mov x0, x8 +; CHECK-SD-NEXT: mov x2, x9 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fshr_v2i128_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: lsl x8, x5, #61 +; CHECK-GI-NEXT: lsl x9, x7, #61 +; CHECK-GI-NEXT: lsr x10, x5, #3 +; CHECK-GI-NEXT: lsr x11, x7, #3 +; CHECK-GI-NEXT: orr x8, x8, x4, lsr #3 +; CHECK-GI-NEXT: orr x9, x9, x6, lsr #3 +; CHECK-GI-NEXT: orr x1, x10, x0, lsl #61 +; CHECK-GI-NEXT: orr x3, x11, x2, lsl #61 +; CHECK-GI-NEXT: mov x0, x8 +; CHECK-GI-NEXT: mov x2, x9 +; CHECK-GI-NEXT: ret +entry: + %d = call <2 x i128> @llvm.fshr(<2 x i128> %a, <2 x i128> %b, <2 x i128> <i128 3, i128 3>) + ret <2 x i128> %d +} diff --git llvm/test/CodeGen/AArch64/vector-extract-last-active.ll llvm/test/CodeGen/AArch64/vector-extract-last-active.ll index 5212acc6fca0..3b11e67d072e 100644 --- llvm/test/CodeGen/AArch64/vector-extract-last-active.ll +++ llvm/test/CodeGen/AArch64/vector-extract-last-active.ll @@ -318,7 +318,7 @@ define i16 @extract_last_i16_scalable(<vscale x 8 x i16> %data, <vscale x 8 x i1 ; CHECK-NEXT: sel z1.h, p0, z1.h, z2.h ; CHECK-NEXT: umaxv h1, p1, z1.h ; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: and x8, x8, #0xff +; CHECK-NEXT: and x8, x8, #0xffff ; CHECK-NEXT: whilels p2.h, xzr, x8 ; CHECK-NEXT: ptest p1, p0.b ; CHECK-NEXT: lastb w8, p2, z0.h @@ -337,7 +337,7 @@ define i32 @extract_last_i32_scalable(<vscale x 4 x i32> %data, <vscale x 4 x i1 ; CHECK-NEXT: sel z1.s, p0, z1.s, z2.s ; CHECK-NEXT: umaxv s1, p1, z1.s ; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: and x8, x8, #0xff +; CHECK-NEXT: mov w8, w8 ; CHECK-NEXT: whilels p2.s, xzr, x8 ; CHECK-NEXT: ptest p1, p0.b ; CHECK-NEXT: lastb w8, p2, z0.s @@ -356,7 +356,6 @@ define i64 @extract_last_i64_scalable(<vscale x 2 x i64> %data, <vscale x 2 x i1 ; CHECK-NEXT: sel z1.d, p0, z1.d, z2.d ; CHECK-NEXT: umaxv d1, p1, z1.d ; CHECK-NEXT: fmov x8, d1 -; CHECK-NEXT: and x8, x8, #0xff ; CHECK-NEXT: whilels p2.d, xzr, x8 ; CHECK-NEXT: ptest p1, p0.b ; CHECK-NEXT: lastb x8, p2, z0.d @@ -375,7 +374,7 @@ define float @extract_last_float_scalable(<vscale x 4 x float> %data, 
<vscale x ; CHECK-NEXT: sel z2.s, p0, z2.s, z3.s ; CHECK-NEXT: umaxv s2, p1, z2.s ; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: and x8, x8, #0xff +; CHECK-NEXT: mov w8, w8 ; CHECK-NEXT: whilels p2.s, xzr, x8 ; CHECK-NEXT: ptest p1, p0.b ; CHECK-NEXT: lastb s0, p2, z0.s @@ -394,7 +393,6 @@ define double @extract_last_double_scalable(<vscale x 2 x double> %data, <vscale ; CHECK-NEXT: sel z2.d, p0, z2.d, z3.d ; CHECK-NEXT: umaxv d2, p1, z2.d ; CHECK-NEXT: fmov x8, d2 -; CHECK-NEXT: and x8, x8, #0xff ; CHECK-NEXT: whilels p2.d, xzr, x8 ; CHECK-NEXT: ptest p1, p0.b ; CHECK-NEXT: lastb d0, p2, z0.d @@ -404,6 +402,24 @@ define double @extract_last_double_scalable(<vscale x 2 x double> %data, <vscale ret double %res } +;; If the passthru parameter is poison, we shouldn't see a select at the end. +define i8 @extract_last_i8_scalable_poison_passthru(<vscale x 16 x i8> %data, <vscale x 16 x i1> %mask) #0 { +; CHECK-LABEL: extract_last_i8_scalable_poison_passthru: +; CHECK: // %bb.0: +; CHECK-NEXT: index z1.b, #0, #1 +; CHECK-NEXT: mov z2.b, #0 // =0x0 +; CHECK-NEXT: sel z1.b, p0, z1.b, z2.b +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: umaxv b1, p0, z1.b +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: and x8, x8, #0xff +; CHECK-NEXT: whilels p0.b, xzr, x8 +; CHECK-NEXT: lastb w0, p0, z0.b +; CHECK-NEXT: ret + %res = call i8 @llvm.experimental.vector.extract.last.active.nxv16i8(<vscale x 16 x i8> %data, <vscale x 16 x i1> %mask, i8 poison) + ret i8 %res +} + declare i8 @llvm.experimental.vector.extract.last.active.v16i8(<16 x i8>, <16 x i1>, i8) declare i16 @llvm.experimental.vector.extract.last.active.v8i16(<8 x i16>, <8 x i1>, i16) declare i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32>, <4 x i1>, i32) diff --git llvm/test/CodeGen/AArch64/zeroing-forms-counts-not.ll llvm/test/CodeGen/AArch64/zeroing-forms-counts-not.ll new file mode 100644 index 000000000000..f7970ca81f60 --- /dev/null +++ llvm/test/CodeGen/AArch64/zeroing-forms-counts-not.ll @@ -0,0 +1,2048 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mattr=+bf16,+sve < %s | FileCheck %s +; RUN: llc -mattr=+bf16,+sve2p2 < %s | FileCheck %s -check-prefix CHECK-2p2 + +; RUN: llc -mattr=+bf16,+sme -force-streaming < %s | FileCheck %s +; RUN: llc -mattr=+bf16,+sme2p2 -force-streaming < %s | FileCheck %s -check-prefix CHECK-2p2 + +target triple = "aarch64-linux" + +define <vscale x 16 x i8> @test_svcls_s8_x_1(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %x) { +; CHECK-LABEL: test_svcls_s8_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cls z0.b, p0/m, z0.b +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svcls_s8_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: cls z0.b, p0/z, z0.b +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.cls.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %x) + ret <vscale x 16 x i8> %0 +} + +define <vscale x 16 x i8> @test_svcls_s8_x_2(<vscale x 16 x i1> %pg, double %z0, <vscale x 16 x i8> %x) { +; CHECK-LABEL: test_svcls_s8_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: cls z0.b, p0/m, z1.b +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svcls_s8_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: cls z0.b, p0/z, z1.b +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.cls.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %x) + ret <vscale x 16 x i8> %0 +} + +define <vscale x 16 x i8> 
@test_svcls_s8_z(<vscale x 16 x i1> %pg, double %z0, <vscale x 16 x i8> %x) { +; CHECK-LABEL: test_svcls_s8_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.b, #0 // =0x0 +; CHECK-NEXT: cls z0.b, p0/m, z1.b +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svcls_s8_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: cls z0.b, p0/z, z1.b +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.cls.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %x) + ret <vscale x 16 x i8> %0 +} + +define <vscale x 8 x i16> @test_svcls_s16_x_1(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %x) { +; CHECK-LABEL: test_svcls_s16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cls z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svcls_s16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: cls z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.cls.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %x) + ret <vscale x 8 x i16> %0 +} + +define <vscale x 8 x i16> @test_svcls_s16_x_2(<vscale x 8 x i1> %pg, double %z0, <vscale x 8 x i16> %x) { +; CHECK-LABEL: test_svcls_s16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: cls z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svcls_s16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: cls z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.cls.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %x) + ret <vscale x 8 x i16> %0 +} + +define <vscale x 8 x i16> @test_svcls_s16_z(<vscale x 8 x i1> %pg, double %z0, <vscale x 8 x i16> %x) { +; CHECK-LABEL: test_svcls_s16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: cls z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svcls_s16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: cls z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.cls.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %x) + ret <vscale x 8 x i16> %0 +} + +define <vscale x 4 x i32> @test_svcls_s32_x_1(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %x) { +; CHECK-LABEL: test_svcls_s32_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cls z0.s, p0/m, z0.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svcls_s32_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: cls z0.s, p0/z, z0.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.cls.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %x) + ret <vscale x 4 x i32> %0 +} + +define <vscale x 4 x i32> @test_svcls_s32_x_2(<vscale x 4 x i1> %pg, double %z0, <vscale x 4 x i32> %x) { +; CHECK-LABEL: test_svcls_s32_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: cls z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svcls_s32_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: cls z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.cls.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %x) + ret <vscale x 4 x i32> %0 +} + +define <vscale x 4 x i32> @test_svcls_s32_z(<vscale x 4 x i1> %pg, double %z0, <vscale x 4 x i32> %x) { +; CHECK-LABEL: test_svcls_s32_z: +; CHECK: // %bb.0: // %entry +; 
CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: cls z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svcls_s32_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: cls z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.cls.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %x) + ret <vscale x 4 x i32> %0 +} + +define <vscale x 2 x i64> @test_svcls_s64_x_1(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %x) { +; CHECK-LABEL: test_svcls_s64_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cls z0.d, p0/m, z0.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svcls_s64_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: cls z0.d, p0/z, z0.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.cls.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %x) + ret <vscale x 2 x i64> %0 +} + +define <vscale x 2 x i64> @test_svcls_s64_x_2(<vscale x 2 x i1> %pg, double %z0, <vscale x 2 x i64> %x) { +; CHECK-LABEL: test_svcls_s64_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: cls z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svcls_s64_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: cls z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.cls.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %x) + ret <vscale x 2 x i64> %0 +} + +define <vscale x 2 x i64> @test_svcls_s64_z(<vscale x 2 x i1> %pg, double %z0, <vscale x 2 x i64> %x) { +; CHECK-LABEL: test_svcls_s64_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, #0 // =0x0 +; CHECK-NEXT: cls z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svcls_s64_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: cls z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.cls.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %x) + ret <vscale x 2 x i64> %0 +} + +define <vscale x 16 x i8> @test_svclz_s8_x_1(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %x) { +; CHECK-LABEL: test_svclz_s8_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: clz z0.b, p0/m, z0.b +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svclz_s8_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: clz z0.b, p0/z, z0.b +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.clz.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %x) + ret <vscale x 16 x i8> %0 +} + +define <vscale x 16 x i8> @test_svclz_s8_x_2(<vscale x 16 x i1> %pg, double %z0, <vscale x 16 x i8> %x) { +; CHECK-LABEL: test_svclz_s8_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: clz z0.b, p0/m, z1.b +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svclz_s8_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: clz z0.b, p0/z, z1.b +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.clz.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %x) + ret <vscale x 16 x i8> %0 +} + +define <vscale x 16 x i8> @test_svclz_s8_z(<vscale x 16 x i1> %pg, double %z0, <vscale x 16 x i8> %x) { +; CHECK-LABEL: test_svclz_s8_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.b, #0 // =0x0 +; CHECK-NEXT: clz z0.b, p0/m, z1.b +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svclz_s8_z: +; CHECK-2p2: // %bb.0: // 
%entry +; CHECK-2p2-NEXT: clz z0.b, p0/z, z1.b +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.clz.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %x) + ret <vscale x 16 x i8> %0 +} + +define <vscale x 8 x i16> @test_svclz_s16_x_1(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %x) { +; CHECK-LABEL: test_svclz_s16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: clz z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svclz_s16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: clz z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.clz.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %x) + ret <vscale x 8 x i16> %0 +} + +define <vscale x 8 x i16> @test_svclz_s16_x_2(<vscale x 8 x i1> %pg, double %z0, <vscale x 8 x i16> %x) { +; CHECK-LABEL: test_svclz_s16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: clz z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svclz_s16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: clz z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.clz.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %x) + ret <vscale x 8 x i16> %0 +} + +define <vscale x 8 x i16> @test_svclz_s16_z(<vscale x 8 x i1> %pg, double %z0, <vscale x 8 x i16> %x) { +; CHECK-LABEL: test_svclz_s16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: clz z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svclz_s16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: clz z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.clz.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %x) + ret <vscale x 8 x i16> %0 +} + +define <vscale x 4 x i32> @test_svclz_s32_x_1(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %x) { +; CHECK-LABEL: test_svclz_s32_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: clz z0.s, p0/m, z0.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svclz_s32_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: clz z0.s, p0/z, z0.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.clz.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %x) + ret <vscale x 4 x i32> %0 +} + +define <vscale x 4 x i32> @test_svclz_s32_x_2(<vscale x 4 x i1> %pg, double %z0, <vscale x 4 x i32> %x) { +; CHECK-LABEL: test_svclz_s32_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: clz z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svclz_s32_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: clz z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.clz.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %x) + ret <vscale x 4 x i32> %0 +} + +define <vscale x 4 x i32> @test_svclz_s32_z(<vscale x 4 x i1> %pg, double %z0, <vscale x 4 x i32> %x) { +; CHECK-LABEL: test_svclz_s32_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: clz z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svclz_s32_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: clz z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.clz.nxv4i32(<vscale 
x 4 x i32> zeroinitializer, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %x) + ret <vscale x 4 x i32> %0 +} + +define <vscale x 2 x i64> @test_svclz_s64_x_1(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %x) { +; CHECK-LABEL: test_svclz_s64_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: clz z0.d, p0/m, z0.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svclz_s64_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: clz z0.d, p0/z, z0.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.clz.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %x) + ret <vscale x 2 x i64> %0 +} + +define <vscale x 2 x i64> @test_svclz_s64_x_2(<vscale x 2 x i1> %pg, double %z0, <vscale x 2 x i64> %x) { +; CHECK-LABEL: test_svclz_s64_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: clz z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svclz_s64_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: clz z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.clz.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %x) + ret <vscale x 2 x i64> %0 +} + +define <vscale x 2 x i64> @test_svclz_s64_z(<vscale x 2 x i1> %pg, double %z0, <vscale x 2 x i64> %x) { +; CHECK-LABEL: test_svclz_s64_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, #0 // =0x0 +; CHECK-NEXT: clz z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svclz_s64_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: clz z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.clz.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %x) + ret <vscale x 2 x i64> %0 +} + +define <vscale x 16 x i8> @test_svcnt_s8_x_1(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %x) { +; CHECK-LABEL: test_svcnt_s8_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cnt z0.b, p0/m, z0.b +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svcnt_s8_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: cnt z0.b, p0/z, z0.b +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.cnt.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %x) + ret <vscale x 16 x i8> %0 +} + +define <vscale x 16 x i8> @test_svcnt_s8_x_2(<vscale x 16 x i1> %pg, double %z0, <vscale x 16 x i8> %x) { +; CHECK-LABEL: test_svcnt_s8_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: cnt z0.b, p0/m, z1.b +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svcnt_s8_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: cnt z0.b, p0/z, z1.b +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.cnt.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %x) + ret <vscale x 16 x i8> %0 +} + +define <vscale x 16 x i8> @test_svcnt_s8_z(<vscale x 16 x i1> %pg, double %z0, <vscale x 16 x i8> %x) { +; CHECK-LABEL: test_svcnt_s8_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.b, #0 // =0x0 +; CHECK-NEXT: cnt z0.b, p0/m, z1.b +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svcnt_s8_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: cnt z0.b, p0/z, z1.b +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.cnt.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %x) + ret <vscale x 16 x i8> %0 +} + +define <vscale x 8 x i16> 
@test_svcnt_s16_x_1(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %x) { +; CHECK-LABEL: test_svcnt_s16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cnt z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svcnt_s16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: cnt z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.cnt.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %x) + ret <vscale x 8 x i16> %0 +} + +define <vscale x 8 x i16> @test_svcnt_s16_x_2(<vscale x 8 x i1> %pg, double %z0, <vscale x 8 x i16> %x) { +; CHECK-LABEL: test_svcnt_s16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: cnt z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svcnt_s16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: cnt z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.cnt.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %x) + ret <vscale x 8 x i16> %0 +} + +define <vscale x 8 x i16> @test_svcnt_s16_z(<vscale x 8 x i1> %pg, double %z0, <vscale x 8 x i16> %x) { +; CHECK-LABEL: test_svcnt_s16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: cnt z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svcnt_s16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: cnt z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.cnt.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %x) + ret <vscale x 8 x i16> %0 +} + +define <vscale x 4 x i32> @test_svcnt_s32_x_1(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %x) { +; CHECK-LABEL: test_svcnt_s32_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cnt z0.s, p0/m, z0.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svcnt_s32_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: cnt z0.s, p0/z, z0.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.cnt.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %x) + ret <vscale x 4 x i32> %0 +} + +define <vscale x 4 x i32> @test_svcnt_s32_x_2(<vscale x 4 x i1> %pg, double %z0, <vscale x 4 x i32> %x) { +; CHECK-LABEL: test_svcnt_s32_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: cnt z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svcnt_s32_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: cnt z0.s, p0/z, z1.s ########## TRUNCATED ###########