diff --git .github/new-prs-labeler.yml .github/new-prs-labeler.yml index 566308bb3df8..9863ff087ca8 100644 --- .github/new-prs-labeler.yml +++ .github/new-prs-labeler.yml @@ -730,6 +730,9 @@ llvm:regalloc: lldb: - lldb/** +lldb-dap: + - lldb/tools/lldb-dap/** + backend:AMDGPU: - '**/*amdgpu*' - '**/*AMDGPU*' diff --git .github/workflows/build-ci-container.yml .github/workflows/build-ci-container.yml index 8a81d4718646..c419986da79f 100644 --- .github/workflows/build-ci-container.yml +++ .github/workflows/build-ci-container.yml @@ -20,24 +20,31 @@ on: jobs: build-ci-container: if: github.repository_owner == 'llvm' - runs-on: depot-ubuntu-22.04-16 - outputs: - container-name: ${{ steps.vars.outputs.container-name }} - container-name-agent: ${{ steps.vars.outputs.container-name-agent }} - container-name-tag: ${{ steps.vars.outputs.container-name-tag }} - container-name-agent-tag: ${{ steps.vars.outputs.container-name-agent-tag }} - container-filename: ${{ steps.vars.outputs.container-filename }} - container-agent-filename: ${{ steps.vars.outputs.container-agent-filename }} + runs-on: ${{ matrix.runs-on }} + strategy: + matrix: + include: + # The arch names should match the names used on dockerhub. + # See https://github.com/docker-library/official-images#architectures-other-than-amd64 + - arch: amd64 + runs-on: depot-ubuntu-22.04-16 + - arch: arm64v8 + runs-on: depot-ubuntu-22.04-arm-16 steps: - name: Checkout LLVM uses: actions/checkout@v4 with: sparse-checkout: .github/workflows/containers/github-action-ci/ + # podman is not installed by default on the ARM64 images. + - name: Install Podman + if: runner.arch == 'ARM64' + run: | + sudo apt-get install podman - name: Write Variables id: vars run: | - tag=`date +%s` - container_name="ghcr.io/$GITHUB_REPOSITORY_OWNER/ci-ubuntu-22.04" + tag=$(git rev-parse --short=12 HEAD) + container_name="ghcr.io/$GITHUB_REPOSITORY_OWNER/${{ matrix.arch }}/ci-ubuntu-22.04" echo "container-name=$container_name" >> $GITHUB_OUTPUT echo "container-name-agent=$container_name-agent" >> $GITHUB_OUTPUT echo "container-name-tag=$container_name:$tag" >> $GITHUB_OUTPUT @@ -61,7 +68,7 @@ jobs: - name: Upload container image uses: actions/upload-artifact@v4 with: - name: container + name: container-${{ matrix.arch }} path: "*.tar" retention-days: 14 @@ -84,18 +91,29 @@ jobs: steps: - name: Download container uses: actions/download-artifact@v4 - with: - name: container - name: Push Container run: | - podman load -i ${{ needs.build-ci-container.outputs.container-filename }} - podman tag ${{ needs.build-ci-container.outputs.container-name-tag }} ${{ needs.build-ci-container.outputs.container-name }}:latest + function push_container { + image_name=$1 + latest_name=$(echo $image_name | sed 's/:[a-f0-9]\+$/:latest/g') + podman tag $image_name $latest_name + echo "Pushing $image_name ..." + podman push $image_name + echo "Pushing $latest_name ..." + podman push $latest_name + } + podman login -u ${{ github.actor }} -p $GITHUB_TOKEN ghcr.io - podman push ${{ needs.build-ci-container.outputs.container-name-tag }} - podman push ${{ needs.build-ci-container.outputs.container-name }}:latest + for f in $(find . 
-iname *.tar); do + image_name=$(podman load -q -i $f | sed 's/Loaded image: //g') + push_container $image_name - podman load -i ${{ needs.build-ci-container.outputs.container-agent-filename }} - podman tag ${{ needs.build-ci-container.outputs.container-name-agent-tag }} ${{ needs.build-ci-container.outputs.container-name-agent }}:latest - podman push ${{ needs.build-ci-container.outputs.container-name-agent-tag }} - podman push ${{ needs.build-ci-container.outputs.container-name-agent }}:latest + if echo $image_name | grep '/amd64/'; then + # For amd64, create an alias with the arch component removed. + # This matches the convention used on dockerhub. + default_image_name=$(echo $(dirname $(dirname $image_name))/$(basename $image_name)) + podman tag $image_name $default_image_name + push_container $default_image_name + fi + done diff --git .github/workflows/clang-tests.yml .github/workflows/clang-tests.yml deleted file mode 100644 index 2569ce19518e..000000000000 --- .github/workflows/clang-tests.yml +++ /dev/null @@ -1,38 +0,0 @@ -name: Clang Tests - -permissions: - contents: read - -on: - workflow_dispatch: - push: - branches: - - 'release/**' - paths: - - 'clang/**' - - '.github/workflows/clang-tests.yml' - - '.github/workflows/llvm-project-tests.yml' - - '!llvm/**' - pull_request: - branches: - - 'release/**' - paths: - - 'clang/**' - - '.github/workflows/clang-tests.yml' - - '.github/workflows/llvm-project-tests.yml' - - '!llvm/**' - -concurrency: - # Skip intermediate builds: always. - # Cancel intermediate builds: only if it is a pull request build. - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} - -jobs: - check_clang: - if: github.repository_owner == 'llvm' - name: Test clang,lldb,libclc - uses: ./.github/workflows/llvm-project-tests.yml - with: - build_target: check-clang - projects: clang;lldb;libclc diff --git .github/workflows/commit-access-greeter.yml .github/workflows/commit-access-greeter.yml new file mode 100644 index 000000000000..29a1b578f8af --- /dev/null +++ .github/workflows/commit-access-greeter.yml @@ -0,0 +1,39 @@ +name: Commit Access Greeter + +on: + issues: + types: + - labeled + +permissions: + contents: read + +jobs: + commit-access-greeter: + permissions: + issues: write + if: >- + github.repository_owner == 'llvm' && + github.event.label.name == 'infra:commit-access-request' + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 + with: + sparse-checkout: llvm/utils/git/ + + - name: Setup Automation Script + working-directory: ./llvm/utils/git/ + run: | + pip install --require-hashes -r requirements.txt + + - name: Add comments to issue + working-directory: ./llvm/utils/git/ + env: + LABEL_NAME: ${{ github.event.label.name }} + GITHUB_TOKEN: ${{ github.token }} + ISSUE_NUMBER: ${{ github.event.issue.number }} + run: | + python3 ./github-automation.py \ + --token $GITHUB_TOKEN \ + commit-request-greeter \ + --issue-number $ISSUE_NUMBER diff --git .github/workflows/libclc-tests.yml .github/workflows/libclc-tests.yml deleted file mode 100644 index 23192f776a98..000000000000 --- .github/workflows/libclc-tests.yml +++ /dev/null @@ -1,39 +0,0 @@ -name: libclc Tests - -permissions: - contents: read - -on: - workflow_dispatch: - push: - branches: - - 'release/**' - paths: - - 'libclc/**' - - '.github/workflows/libclc-tests.yml' - - '.github/workflows/llvm-project-tests.yml' - - '!clang/**' - - '!llvm/**' - pull_request: - branches: - - 'release/**' - 
paths: - - 'libclc/**' - - '.github/workflows/libclc-tests.yml' - - '.github/workflows/llvm-project-tests.yml' - - '!clang/**' - - '!llvm/**' - -concurrency: - # Skip intermediate builds: always. - # Cancel intermediate builds: only if it is a pull request build. - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} - -jobs: - check_libclc: - if: github.repository_owner == 'llvm' - name: Test libclc - uses: ./.github/workflows/llvm-project-tests.yml - with: - projects: clang;libclc diff --git .github/workflows/lld-tests.yml .github/workflows/lld-tests.yml deleted file mode 100644 index 599c0975fa68..000000000000 --- .github/workflows/lld-tests.yml +++ /dev/null @@ -1,38 +0,0 @@ -name: LLD Tests - -permissions: - contents: read - -on: - workflow_dispatch: - push: - branches: - - 'release/**' - paths: - - 'lld/**' - - '.github/workflows/lld-tests.yml' - - '.github/workflows/llvm-project-tests.yml' - - '!llvm/**' - pull_request: - branches: - - 'release/**' - paths: - - 'lld/**' - - '.github/workflows/lld-tests.yml' - - '.github/workflows/llvm-project-tests.yml' - - '!llvm/**' - -concurrency: - # Skip intermediate builds: always. - # Cancel intermediate builds: only if it is a pull request build. - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} - -jobs: - check_lld: - if: github.repository_owner == 'llvm' - name: Test lld - uses: ./.github/workflows/llvm-project-tests.yml - with: - build_target: check-lld - projects: lld diff --git .github/workflows/lldb-tests.yml .github/workflows/lldb-tests.yml deleted file mode 100644 index 6bb972195625..000000000000 --- .github/workflows/lldb-tests.yml +++ /dev/null @@ -1,39 +0,0 @@ -name: lldb Tests - -permissions: - contents: read - -on: - workflow_dispatch: - push: - branches: - - 'release/**' - paths: - - 'lldb/**' - - '.github/workflows/lldb-tests.yml' - - '.github/workflows/llvm-project-tests.yml' - - '!clang/**' - - '!llvm/**' - pull_request: - branches: - - 'release/**' - paths: - - 'lldb/**' - - '.github/workflows/lldb-tests.yml' - - '.github/workflows/llvm-project-tests.yml' - - '!clang/**' - - '!llvm/**' - -concurrency: - # Skip intermediate builds: always. - # Cancel intermediate builds: only if it is a pull request build. - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} - -jobs: - build_lldb: - if: github.repository_owner == 'llvm' - name: Build lldb - uses: ./.github/workflows/llvm-project-tests.yml - with: - projects: clang;lldb diff --git .github/workflows/llvm-tests.yml .github/workflows/llvm-tests.yml index 4e570a7cb145..9b3d49d4e99b 100644 --- .github/workflows/llvm-tests.yml +++ .github/workflows/llvm-tests.yml @@ -11,14 +11,12 @@ on: paths: - 'llvm/**' - '.github/workflows/llvm-tests.yml' - - '.github/workflows/llvm-project-tests.yml' pull_request: branches: - 'release/**' paths: - 'llvm/**' - '.github/workflows/llvm-tests.yml' - - '.github/workflows/llvm-project-tests.yml' concurrency: # Skip intermediate builds: always. 
@@ -27,14 +25,6 @@ concurrency: cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} jobs: - check-all: - if: github.repository_owner == 'llvm' - name: Build and Test - uses: ./.github/workflows/llvm-project-tests.yml - with: - build_target: check-all - projects: clang;lld;libclc;lldb - abi-dump-setup: if: github.repository_owner == 'llvm' runs-on: ubuntu-latest diff --git .github/workflows/premerge.yaml .github/workflows/premerge.yaml index 54d6e1bf092c..b268f1faab98 100644 --- .github/workflows/premerge.yaml +++ .github/workflows/premerge.yaml @@ -5,15 +5,27 @@ permissions: on: pull_request: + types: + - opened + - synchronize + - reopened + # When a PR is closed, we still start this workflow, but then skip + # all the jobs, which makes it effectively a no-op. The reason to + # do this is that it allows us to take advantage of concurrency groups + # to cancel in progress CI jobs whenever the PR is closed. + - closed paths: - .github/workflows/premerge.yaml push: branches: - 'main' + - 'release/**' jobs: premerge-checks-linux: - if: github.repository_owner == 'llvm' + if: >- + github.repository_owner == 'llvm' && + (github.event_name != 'pull_request' || github.event.action != 'closed') runs-on: llvm-premerge-linux-runners concurrency: group: ${{ github.workflow }}-linux-${{ github.event.pull_request.number || github.sha }} @@ -72,7 +84,9 @@ jobs: ./.ci/monolithic-linux.sh "$(echo ${linux_projects} | tr ' ' ';')" "$(echo ${linux_check_targets})" "$(echo ${linux_runtimes} | tr ' ' ';')" "$(echo ${linux_runtime_check_targets})" premerge-checks-windows: - if: github.repository_owner == 'llvm' + if: >- + github.repository_owner == 'llvm' && + (github.event_name != 'pull_request' || github.event.action != 'closed') runs-on: llvm-premerge-windows-runners concurrency: group: ${{ github.workflow }}-windows-${{ github.event.pull_request.number || github.sha }} @@ -132,3 +146,74 @@ jobs: call C:\\BuildTools\\Common7\\Tools\\VsDevCmd.bat -arch=amd64 -host_arch=amd64 bash .ci/monolithic-windows.sh "${{ steps.vars.outputs.windows-projects }}" "${{ steps.vars.outputs.windows-check-targets }}" + premerge-check-macos: + runs-on: macos-14 + concurrency: + group: ${{ github.workflow }}-macos-${{ github.event.pull_request.number || github.sha }} + cancel-in-progress: true + if: >- + github.repository_owner == 'llvm' && + (startswith(github.ref_name, 'release/') || + startswith(github.base_ref, 'release/')) && + (github.event_name != 'pull_request' || github.event.action != 'closed') + steps: + - name: Checkout LLVM + uses: actions/checkout@v4 + with: + fetch-depth: 2 + - name: Setup ccache + uses: hendrikmuhs/ccache-action@v1.2.14 + with: + max-size: "2000M" + - name: Install Ninja + uses: llvm/actions/install-ninja@main + - name: Build and Test + run: | + modified_files=$(git diff --name-only HEAD~1...HEAD) + modified_dirs=$(echo "$modified_files" | cut -d'/' -f1 | sort -u) + + echo $modified_files + echo $modified_dirs + + . ./.ci/compute-projects.sh + + all_projects="clang clang-tools-extra lld lldb llvm mlir" + modified_projects="$(keep-modified-projects ${all_projects})" + + # We have to disable the runtimes builds due to https://github.com/llvm/llvm-project/issues/90568 + # and the lldb tests depend on libcxx, so we need to skip them. 
+ mac_check_targets=$(check-targets ${modified_projects} | sort | uniq | tr '\n' ' ' | sed -e 's/check-lldb //g') + mac_projects=$(add-dependencies ${modified_projects} | sort | uniq | tr '\n' ' ') + + mac_runtimes_to_test=$(compute-runtimes-to-test ${modified_projects}) + mac_runtime_check_targets=$(check-targets ${mac_runtimes_to_test} | sort | uniq | tr '\n' ' ') + mac_runtimes=$(echo ${mac_runtimes_to_test} | tr ' ' '\n' | sort | uniq | tr '\n' ' ') + + if [[ "${mac_projects}" == "" ]]; then + echo "No projects to build" + exit 0 + fi + + echo "Projects to test: ${modified_projects}" + echo "Runtimes to test: ${mac_runtimes_to_test}" + echo "Building projects: ${mac_projects}" + echo "Running project checks targets: ${mac_check_targets}" + echo "Building runtimes: ${mac_runtimes}" + echo "Running runtimes checks targets: ${mac_runtime_check_targets}" + + # -DLLVM_DISABLE_ASSEMBLY_FILES=ON is for + # https://github.com/llvm/llvm-project/issues/81967 + # Disable sharding in lit so that the LIT_XFAIL environment var works. + cmake -G Ninja \ + -B build \ + -S llvm \ + -DLLVM_ENABLE_PROJECTS="$(echo ${mac_projects} | tr ' ' ';')" \ + -DLLVM_DISABLE_ASSEMBLY_FILES=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -DLLDB_INCLUDE_TESTS=OFF \ + -DLLVM_ENABLE_ASSERTIONS=ON \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache + + # The libcxx tests fail, so we are skipping the runtime targets. + ninja -C build $mac_check_targets diff --git .github/workflows/release-tasks.yml .github/workflows/release-tasks.yml index 780dd0ff6325..52076ea1821b 100644 --- .github/workflows/release-tasks.yml +++ .github/workflows/release-tasks.yml @@ -89,20 +89,10 @@ jobs: needs: - validate-tag - release-create - strategy: - fail-fast: false - matrix: - runs-on: - - ubuntu-22.04 - - windows-2022 - - macos-13 - - macos-14 - - uses: ./.github/workflows/release-binaries.yml + uses: ./.github/workflows/release-binaries-all.yml with: release-version: ${{ needs.validate-tag.outputs.release-version }} upload: true - runs-on: ${{ matrix.runs-on }} # Called workflows don't have access to secrets by default, so we need to explicitly pass secrets that we use. 
secrets: RELEASE_TASKS_USER_TOKEN: ${{ secrets.RELEASE_TASKS_USER_TOKEN }} diff --git bolt/test/X86/dynamic-relocs-on-entry.s bolt/test/X86/dynamic-relocs-on-entry.s index 2a29a43c4939..4ec8ba4ad446 100644 --- bolt/test/X86/dynamic-relocs-on-entry.s +++ bolt/test/X86/dynamic-relocs-on-entry.s @@ -4,12 +4,12 @@ # RUN: %clang %cflags -fPIC -pie %s -o %t.exe -nostdlib -Wl,-q # RUN: llvm-bolt %t.exe -o %t.bolt > %t.out.txt -# RUN: readelf -r %t.bolt >> %t.out.txt +# RUN: llvm-readelf -r %t.bolt >> %t.out.txt # RUN: llvm-objdump --disassemble-symbols=chain %t.bolt >> %t.out.txt # RUN: FileCheck %s --input-file=%t.out.txt ## Check if the new address in `chain` is correctly updated by BOLT -# CHECK: Relocation section '.rela.dyn' at offset 0x{{.*}} contains 1 entry: +# CHECK: Relocation section '.rela.dyn' at offset 0x{{.*}} contains 1 entries: # CHECK: {{.*}} R_X86_64_RELATIVE [[#%x,ADDR:]] # CHECK: [[#ADDR]]: c3 retq .text @@ -29,4 +29,4 @@ _start: .data .Lfoo: - .quad Label \ No newline at end of file + .quad Label diff --git clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.cpp clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.cpp index 604a7cac0e49..a45949314a4c 100644 --- clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.cpp +++ clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.cpp @@ -256,13 +256,32 @@ void UnsafeFunctionsCheck::registerMatchers(MatchFinder *Finder) { .bind(CustomFunctionNamesId))) .bind(DeclRefId), this); + // C++ member calls do not contain a DeclRefExpr to the function decl. + // Instead, they contain a MemberExpr that refers to the decl. + Finder->addMatcher(memberExpr(member(functionDecl(CustomFunctionsMatcher) + .bind(CustomFunctionNamesId))) + .bind(DeclRefId), + this); } } void UnsafeFunctionsCheck::check(const MatchFinder::MatchResult &Result) { - const auto *DeclRef = Result.Nodes.getNodeAs<DeclRefExpr>(DeclRefId); - const auto *FuncDecl = cast<FunctionDecl>(DeclRef->getDecl()); - assert(DeclRef && FuncDecl && "No valid matched node in check()"); + const Expr *SourceExpr; + const FunctionDecl *FuncDecl; + + if (const auto *DeclRef = Result.Nodes.getNodeAs<DeclRefExpr>(DeclRefId)) { + SourceExpr = DeclRef; + FuncDecl = cast<FunctionDecl>(DeclRef->getDecl()); + } else if (const auto *Member = + Result.Nodes.getNodeAs<MemberExpr>(DeclRefId)) { + SourceExpr = Member; + FuncDecl = cast<FunctionDecl>(Member->getMemberDecl()); + } else { + llvm_unreachable("No valid matched node in check()"); + return; + } + + assert(SourceExpr && FuncDecl && "No valid matched node in check()"); // Only one of these are matched at a time. const auto *AnnexK = Result.Nodes.getNodeAs<FunctionDecl>( @@ -286,14 +305,15 @@ void UnsafeFunctionsCheck::check(const MatchFinder::MatchResult &Result) { Entry.Reason.empty() ? 
"is marked as unsafe" : Entry.Reason.c_str(); if (Entry.Replacement.empty()) { - diag(DeclRef->getExprLoc(), "function %0 %1; it should not be used") + diag(SourceExpr->getExprLoc(), + "function %0 %1; it should not be used") << FuncDecl << Reason << Entry.Replacement - << DeclRef->getSourceRange(); + << SourceExpr->getSourceRange(); } else { - diag(DeclRef->getExprLoc(), + diag(SourceExpr->getExprLoc(), "function %0 %1; '%2' should be used instead") << FuncDecl << Reason << Entry.Replacement - << DeclRef->getSourceRange(); + << SourceExpr->getSourceRange(); } return; @@ -323,9 +343,9 @@ void UnsafeFunctionsCheck::check(const MatchFinder::MatchResult &Result) { if (!ReplacementFunctionName) return; - diag(DeclRef->getExprLoc(), "function %0 %1; '%2' should be used instead") + diag(SourceExpr->getExprLoc(), "function %0 %1; '%2' should be used instead") << FuncDecl << getRationaleFor(FunctionName) - << ReplacementFunctionName.value() << DeclRef->getSourceRange(); + << ReplacementFunctionName.value() << SourceExpr->getSourceRange(); } void UnsafeFunctionsCheck::registerPPCallbacks( diff --git clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.h clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.h index 63058c326ef2..9b2ec990be01 100644 --- clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.h +++ clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.h @@ -43,7 +43,7 @@ public: private: const std::vector<CheckedFunction> CustomFunctions; - // If true, the default set of functions are reported. + /// If true, the default set of functions are reported. const bool ReportDefaultFunctions; /// If true, additional functions from widely used API-s (such as POSIX) are /// added to the list of reported functions. diff --git clang-tools-extra/docs/ReleaseNotes.rst clang-tools-extra/docs/ReleaseNotes.rst index 727c7622426c..3bddeeda06e0 100644 --- clang-tools-extra/docs/ReleaseNotes.rst +++ clang-tools-extra/docs/ReleaseNotes.rst @@ -97,6 +97,10 @@ New check aliases Changes in existing checks ^^^^^^^^^^^^^^^^^^^^^^^^^^ +- Improved :doc:`bugprone-unsafe-functions + <clang-tidy/checks/bugprone/unsafe-functions>` check to allow specifying + additional C++ member functions to match. + Removed checks ^^^^^^^^^^^^^^ diff --git clang-tools-extra/docs/clang-tidy/checks/bugprone/unsafe-functions.rst clang-tools-extra/docs/clang-tidy/checks/bugprone/unsafe-functions.rst index fb070627e31b..317db9c5564e 100644 --- clang-tools-extra/docs/clang-tidy/checks/bugprone/unsafe-functions.rst +++ clang-tools-extra/docs/clang-tidy/checks/bugprone/unsafe-functions.rst @@ -114,6 +114,17 @@ qualified name (i.e. ``std::original``), otherwise the regex is matched against If the regular expression starts with `::` (or `^::`), it is matched against the fully qualified name (``::std::original``). +.. note:: + + Fully qualified names can contain template parameters on certain C++ classes, but not on C++ functions. + Type aliases are resolved before matching. + + As an example, the member function ``open`` in the class ``std::ifstream`` + has a fully qualified name of ``::std::basic_ifstream<char>::open``. + + The example could also be matched with the regex ``::std::basic_ifstream<[^>]*>::open``, which matches all potential + template parameters, but does not match nested template classes. 
+ Options ------- diff --git clang-tools-extra/include-cleaner/lib/WalkAST.cpp clang-tools-extra/include-cleaner/lib/WalkAST.cpp index aae3eda519ff..7a140c991925 100644 --- clang-tools-extra/include-cleaner/lib/WalkAST.cpp +++ clang-tools-extra/include-cleaner/lib/WalkAST.cpp @@ -22,6 +22,7 @@ #include "clang/AST/Type.h" #include "clang/AST/TypeLoc.h" #include "clang/Basic/IdentifierTable.h" +#include "clang/Basic/OperatorKinds.h" #include "clang/Basic/SourceLocation.h" #include "clang/Basic/Specifiers.h" #include "llvm/ADT/STLExtras.h" @@ -32,6 +33,11 @@ namespace clang::include_cleaner { namespace { +bool isOperatorNewDelete(OverloadedOperatorKind OpKind) { + return OpKind == OO_New || OpKind == OO_Delete || OpKind == OO_Array_New || + OpKind == OO_Array_Delete; +} + using DeclCallback = llvm::function_ref<void(SourceLocation, NamedDecl &, RefType)>; @@ -158,7 +164,15 @@ public: // the container decl instead, which is preferred as it'll handle // aliases/exports properly. if (!FD->isCXXClassMember() && !llvm::isa<EnumConstantDecl>(FD)) { - report(DRE->getLocation(), FD); + // Global operator new/delete [] is available implicitly in every + // translation unit, even without including any explicit headers. So treat + // those as ambiguous to not force inclusion in TUs that transitively + // depend on those. + RefType RT = + isOperatorNewDelete(FD->getDeclName().getCXXOverloadedOperator()) + ? RefType::Ambiguous + : RefType::Explicit; + report(DRE->getLocation(), FD, RT); return true; } // If the ref is without a qualifier, and is a member, ignore it. As it is diff --git clang-tools-extra/include-cleaner/unittests/AnalysisTest.cpp clang-tools-extra/include-cleaner/unittests/AnalysisTest.cpp index d2d137a0dfb4..74321c312cb7 100644 --- clang-tools-extra/include-cleaner/unittests/AnalysisTest.cpp +++ clang-tools-extra/include-cleaner/unittests/AnalysisTest.cpp @@ -397,6 +397,55 @@ TEST_F(AnalyzeTest, SpellingIncludesWithSymlinks) { } } +// Make sure that the references to implicit operator new/delete are reported as +// ambiguous. 
+TEST_F(AnalyzeTest, ImplicitOperatorNewDeleteNotMissing) { + ExtraFS = llvm::makeIntrusiveRefCnt<llvm::vfs::InMemoryFileSystem>(); + ExtraFS->addFile("header.h", + /*ModificationTime=*/{}, + llvm::MemoryBuffer::getMemBufferCopy(guard(R"cpp( + void* operator new(decltype(sizeof(int))); + )cpp"))); + ExtraFS->addFile("wrapper.h", + /*ModificationTime=*/{}, + llvm::MemoryBuffer::getMemBufferCopy(guard(R"cpp( + #include "header.h" + )cpp"))); + + Inputs.Code = R"cpp( + #include "wrapper.h" + void bar() { + operator new(3); + })cpp"; + TestAST AST(Inputs); + std::vector<Decl *> DeclsInTU; + for (auto *D : AST.context().getTranslationUnitDecl()->decls()) + DeclsInTU.push_back(D); + auto Results = analyze(DeclsInTU, {}, PP.Includes, &PI, AST.preprocessor()); + EXPECT_THAT(Results.Missing, testing::IsEmpty()); +} + +TEST_F(AnalyzeTest, ImplicitOperatorNewDeleteNotUnused) { + ExtraFS = llvm::makeIntrusiveRefCnt<llvm::vfs::InMemoryFileSystem>(); + ExtraFS->addFile("header.h", + /*ModificationTime=*/{}, + llvm::MemoryBuffer::getMemBufferCopy(guard(R"cpp( + void* operator new(decltype(sizeof(int))); + )cpp"))); + + Inputs.Code = R"cpp( + #include "header.h" + void bar() { + operator new(3); + })cpp"; + TestAST AST(Inputs); + std::vector<Decl *> DeclsInTU; + for (auto *D : AST.context().getTranslationUnitDecl()->decls()) + DeclsInTU.push_back(D); + auto Results = analyze(DeclsInTU, {}, PP.Includes, &PI, AST.preprocessor()); + EXPECT_THAT(Results.Unused, testing::IsEmpty()); +} + TEST(FixIncludes, Basic) { llvm::StringRef Code = R"cpp(#include "d.h" #include "a.h" diff --git clang-tools-extra/test/clang-tidy/checkers/bugprone/unsafe-functions-custom-regex.cpp clang-tools-extra/test/clang-tidy/checkers/bugprone/unsafe-functions-custom-regex.cpp index fc97d1bc93bc..ad0ba8739be2 100644 --- clang-tools-extra/test/clang-tidy/checkers/bugprone/unsafe-functions-custom-regex.cpp +++ clang-tools-extra/test/clang-tidy/checkers/bugprone/unsafe-functions-custom-regex.cpp @@ -1,11 +1,19 @@ // RUN: %check_clang_tidy -check-suffix=NON-STRICT-REGEX %s bugprone-unsafe-functions %t --\ -// RUN: -config="{CheckOptions: {bugprone-unsafe-functions.CustomFunctions: '::name_match,replacement,is a qualname match;^::prefix_match,,is matched on qualname prefix'}}" +// RUN: -config="{CheckOptions: {bugprone-unsafe-functions.CustomFunctions: '::name_match,replacement,is a qualname match;^::prefix_match,,is matched on qualname prefix;^::S::member_match_,,is matched on a C++ class member'}}" // RUN: %check_clang_tidy -check-suffix=STRICT-REGEX %s bugprone-unsafe-functions %t --\ -// RUN: -config="{CheckOptions: {bugprone-unsafe-functions.CustomFunctions: '^name_match$,replacement,is matched on function name only;^::prefix_match$,,is a full qualname match'}}" +// RUN: -config="{CheckOptions: {bugprone-unsafe-functions.CustomFunctions: '^name_match$,replacement,is matched on function name only;^::prefix_match$,,is a full qualname match;^::S::member_match_1$,,is matched on a C++ class member'}}" void name_match(); void prefix_match(); +struct S { + static void member_match_1() {} + void member_match_2() {} +}; + +void member_match_1() {} +void member_match_unmatched() {} + namespace regex_test { void name_match(); void prefix_match(); @@ -42,3 +50,25 @@ void f1() { // CHECK-MESSAGES-NON-STRICT-REGEX: :[[@LINE-1]]:3: warning: function 'prefix_match_regex' is matched on qualname prefix; it should not be used // no-warning STRICT-REGEX } + +void f2() { + S s; + + S::member_match_1(); + // CHECK-MESSAGES-NON-STRICT-REGEX: :[[@LINE-1]]:3: 
warning: function 'member_match_1' is matched on a C++ class member; it should not be used + // CHECK-MESSAGES-STRICT-REGEX: :[[@LINE-2]]:3: warning: function 'member_match_1' is matched on a C++ class member; it should not be used + + s.member_match_1(); + // CHECK-MESSAGES-NON-STRICT-REGEX: :[[@LINE-1]]:5: warning: function 'member_match_1' is matched on a C++ class member; it should not be used + // CHECK-MESSAGES-STRICT-REGEX: :[[@LINE-2]]:5: warning: function 'member_match_1' is matched on a C++ class member; it should not be used + + s.member_match_2(); + // CHECK-MESSAGES-NON-STRICT-REGEX: :[[@LINE-1]]:5: warning: function 'member_match_2' is matched on a C++ class member; it should not be used + // no-warning STRICT-REGEX + + member_match_1(); + // no-warning + + member_match_unmatched(); + // no-warning +} diff --git clang/docs/ReleaseNotes.rst clang/docs/ReleaseNotes.rst index b8b47103d951..33a37bdf3f32 100644 --- clang/docs/ReleaseNotes.rst +++ clang/docs/ReleaseNotes.rst @@ -83,6 +83,9 @@ Resolutions to C++ Defect Reports C Language Changes ------------------ +- Clang now allows an ``inline`` specifier on a typedef declaration of a + function type in Microsoft compatibility mode. #GH124869 + C2y Feature Support ^^^^^^^^^^^^^^^^^^^ @@ -110,6 +113,13 @@ Attribute Changes in Clang Improvements to Clang's diagnostics ----------------------------------- +- Improve the diagnostics for deleted default constructor errors for C++ class + initializer lists that don't explicitly list a class member and thus attempt + to implicitly default construct that member. +- The ``-Wunique-object-duplication`` warning has been added to warn about objects + which are supposed to only exist once per program, but may get duplicated when + built into a shared library. + Improvements to Clang's time-trace ---------------------------------- @@ -122,14 +132,14 @@ Bug Fixes in This Version Bug Fixes to Compiler Builtins ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +- The behaviour of ``__add_pointer`` and ``__remove_pointer`` for Objective-C++'s ``id`` and interfaces has been fixed. + Bug Fixes to Attribute Support ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Bug Fixes to C++ Support ^^^^^^^^^^^^^^^^^^^^^^^^ -- Clang is now better at keeping track of friend function template instance contexts. (#GH55509) - Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -151,9 +161,19 @@ AMDGPU Support NVPTX Support ^^^^^^^^^^^^^^ +Hexagon Support +^^^^^^^^^^^^^^^ + +- The default compilation target has been changed from V60 to V68. + X86 Support ^^^^^^^^^^^ +- Disable ``-m[no-]avx10.1`` and switch ``-m[no-]avx10.2`` to an alias of the + 512-bit options. +- Change ``-mno-avx10.1-512`` to an alias of ``-mno-avx10.1-256`` to disable both + 256-bit and 512-bit instructions. + Arm and AArch64 Support ^^^^^^^^^^^^^^^^^^^^^^^ diff --git clang/docs/TypeSanitizer.rst clang/docs/TypeSanitizer.rst index 8b815d8804fa..4d1dfc23a6c5 100644 --- clang/docs/TypeSanitizer.rst +++ clang/docs/TypeSanitizer.rst @@ -202,4 +202,4 @@ enough for TypeSanitizer's runtime. We are actively working on enhancing the tool --- stay tuned. Any help, issues, pull requests, ideas, is more than welcome. You can find the -`issue tracker here.<https://github.com/llvm/llvm-project/issues?q=is%3Aissue%20state%3Aopen%20TySan%20label%3Acompiler-rt%3Atysan>` +`issue tracker here. 
<https://github.com/llvm/llvm-project/issues?q=is%3Aissue%20state%3Aopen%20TySan%20label%3Acompiler-rt%3Atysan>`_ diff --git clang/docs/UsersManual.rst clang/docs/UsersManual.rst index a56c9425ebb7..943a9218ccbc 100644 --- clang/docs/UsersManual.rst +++ clang/docs/UsersManual.rst @@ -2489,6 +2489,82 @@ are listed below. $ clang -fuse-ld=lld -Oz -Wl,--icf=safe -fcodegen-data-use code.cc +.. _strict_aliasing: + +Strict Aliasing +--------------- + +The C and C++ standards require accesses to objects in memory to use l-values of +an appropriate type for the object. This is called *strict aliasing* or +*type-based alias analysis*. Strict aliasing enhances a variety of powerful +memory optimizations, including reordering, combining, and eliminating memory +accesses. These optimizations can lead to unexpected behavior in code that +violates the strict aliasing rules. For example: + +.. code-block:: c++ + + void advance(size_t *index, double *data) { + double value = data[*index]; + /* Clang may assume that this store does not change the contents of `data`. */ + *index += 1; + /* Clang may assume that this store does not change the contents of `index`. */ + data[*index] = value; + /* Either of these facts may create significant optimization opportunities + if Clang is able to inline this function. */ + } + +Strict aliasing can be explicitly enabled with ``-fstrict-aliasing`` and +disabled with ``-fno-strict-aliasing``. ``clang-cl`` defaults to +``-fno-strict-aliasing``; see the ``clang-cl`` section below. Otherwise, Clang defaults to ``-fstrict-aliasing``. + +C and C++ specify slightly different rules for strict aliasing. To improve +language interoperability, Clang allows two types to alias if either language +would permit it. This includes applying the C++ similar types rule to C, +allowing ``int **`` to alias ``int const * const *``. Clang also relaxes the +standard aliasing rules in the following ways: + +* All integer types of the same size are permitted to alias each other, + including signed and unsigned types. +* ``void*`` is permitted to alias any pointer type, ``void**`` is permitted to + alias any pointer to pointer type, and so on. + +Code which violates strict aliasing has undefined behavior. A program that +works in one version of Clang may not work in another because of changes to the +optimizer. Clang provides a :doc:`TypeSanitizer` to help detect +violations of the strict aliasing rules, but it is currently still experimental. +Code that is known to violate strict aliasing should generally be built with +``-fno-strict-aliasing`` if the violation cannot be fixed. + +Clang supports several ways to fix a violation of strict aliasing: + +* L-values of the character types ``char`` and ``unsigned char`` (as well as + other types, depending on the standard) are permitted to access objects of + any type. + +* Library functions such as ``memcpy`` and ``memset`` are specified as treating + memory as characters and therefore are not limited by strict aliasing. If a + value of one type must be reinterpreted as another (e.g. to read the bits of a + floating-point number), use ``memcpy`` to copy the representation to an object + of the destination type. This has no overhead over a direct l-value access + because Clang should reliably optimize calls to these functions to use simple + loads and stores when they are used with small constant sizes; a sketch of + this idiom follows after this list. + +* The attribute ``may_alias`` can be added to a ``typedef`` to give l-values of + that type the same aliasing power as the character types. 
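For concreteness, here is a minimal sketch of the ``memcpy`` idiom from the list above. It is plain standard C++ rather than anything Clang-specific, and the helper name is ours, chosen for illustration.

#include <cstdint>
#include <cstring>

// Read the bit pattern of a float without violating strict aliasing.
// Compilers typically lower the memcpy to a single register move when
// optimizing, so this costs nothing over a pointer cast.
std::uint32_t float_bits(float f) {
  static_assert(sizeof(std::uint32_t) == sizeof(float), "size mismatch");
  std::uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits)); // accesses memory as raw bytes
  return bits; // by contrast, *(std::uint32_t *)&f violates strict aliasing
}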
+ +Clang makes a best effort to avoid obvious miscompilations from strict aliasing +by only considering type information when it cannot prove that two accesses must +refer to the same memory. However, it is not recommended that programmers +intentionally rely on this instead of using one of the solutions above because +it is too easy for the compiler's analysis to be blocked in surprising ways. + +In Clang 20, Clang strengthened its implementation of strict aliasing for +accesses of pointer type. Previously, all accesses of pointer type were +permitted to alias each other, but Clang now distinguishes different pointers +by their pointee type, except as limited by the relaxations around qualifiers +and ``void*`` described above. The previous behavior of treating all pointers as +aliasing can be restored using ``-fno-pointer-tbaa``. + Profile Guided Optimization --------------------------- @@ -5272,12 +5348,6 @@ The Visual C++ Toolset has a slightly more elaborate mechanism for detection. Restrictions and Limitations compared to Clang ---------------------------------------------- -Strict Aliasing -^^^^^^^^^^^^^^^ - -Strict aliasing (TBAA) is always off by default in clang-cl. Whereas in clang, -strict aliasing is turned on by default for all optimization levels. - -To enable LLVM optimizations based on strict aliasing rules (e.g., optimizations -based on type of expressions in C/C++), user will need to explicitly pass -`-fstrict-aliasing` to clang-cl. +Strict aliasing (TBAA) is always off by default in clang-cl whereas in clang, +strict aliasing is turned on by default for all optimization levels. For more +details, see :ref:`Strict aliasing <strict_aliasing>`. diff --git clang/docs/analyzer/user-docs/Annotations.rst clang/docs/analyzer/user-docs/Annotations.rst index d87e8f4df99c..11f15939ecfa 100644 --- clang/docs/analyzer/user-docs/Annotations.rst +++ clang/docs/analyzer/user-docs/Annotations.rst @@ -23,8 +23,8 @@ recognized by GCC. Their use can be conditioned using preprocessor macros .. contents:: :local: -Annotations to Enhance Generic Checks -_____________________________________ +General Purpose Annotations +___________________________ Null Pointer Checking ##################### @@ -79,7 +79,7 @@ implemented with a macro, with the macro performing a check for the assertion condition and, when the check fails, calling an assertion handler. For example, consider the following code fragment: -.. code-block: c +.. code-block:: c void foo(int *p) { assert(p != NULL); @@ -87,7 +87,7 @@ example, consider the following code fragment: When this code is preprocessed on Mac OS X it expands to the following: -.. code-block: c +.. code-block:: c void foo(int *p) { (__builtin_expect(!(p != NULL), 0) ? __assert_rtn(__func__, "t.c", 4, "p != NULL") : (void)0); @@ -131,7 +131,7 @@ return. On Mac OS X, the function prototype for ``__assert_rtn`` (declared in ``assert.h``) is specifically annotated with the 'noreturn' attribute: -.. code-block: c +.. code-block:: c void __assert_rtn(const char *, const char *, int, const char *) __attribute__((__noreturn__)); @@ -151,7 +151,7 @@ the use of preprocessor macros. **Example** -.. code-block: c +.. code-block:: c #ifndef CLANG_ANALYZER_NORETURN #if __has_feature(attribute_analyzer_noreturn) @@ -163,6 +163,43 @@ the use of preprocessor macros. 
void my_assert_rtn(const char *, const char *, int, const char *) CLANG_ANALYZER_NORETURN; +Dynamic Memory Modeling Annotations +################################### + +If a project uses custom functions for dynamic memory management (e.g. wrappers around ``malloc``/``free`` or ``new``/``delete`` in C++) and the analyzer cannot "see" the *definitions* of these functions, it's possible to annotate their declarations to let the analyzer model their behavior. (Otherwise the analyzer cannot know that the opaque ``my_free()`` is basically equivalent to a standard ``free()`` call.) + +.. note:: + **This page only provides a brief list of these annotations.** For full documentation, see the main `Attributes in Clang <../../AttributeReference.html#ownership-holds-ownership-returns-ownership-takes-clang-static-analyzer>`_ page. + +Attribute 'ownership_returns' (Clang-specific) +---------------------------------------------- + +Use this attribute to mark functions that return dynamically allocated memory. Takes a single argument, the type of the allocation (e.g. ``malloc`` or ``new``). + +.. code-block:: c + + void __attribute((ownership_returns(malloc))) *my_malloc(size_t); + +Attribute 'ownership_takes' (Clang-specific) +-------------------------------------------- + +Use this attribute to mark functions that deallocate memory. Takes two arguments: the type of the allocation (e.g. ``malloc`` or ``new``) and the index of the parameter that is being deallocated (counting from 1). + +.. code-block:: c + + void __attribute((ownership_takes(malloc, 1))) my_free(void *); + +Attribute 'ownership_holds' (Clang-specific) +-------------------------------------------- + +Use this attribute to mark functions that take ownership of memory and will deallocate it at some unspecified point in the future. Takes two arguments: the type of the allocation (e.g. ``malloc`` or ``new``) and the index of the parameter that is being held (counting from 1). + +.. code-block:: c + + void __attribute((ownership_holds(malloc, 2))) store_in_table(int key, record_t *val); + +The annotations ``ownership_takes`` and ``ownership_holds`` both prevent memory leak reports (concerning the specified argument); the difference between them is that using taken memory is a use-after-free error, while using held memory is assumed to be legitimate. + Mac OS X API Annotations ________________________ @@ -207,7 +244,7 @@ functions allows the analyzer to perform extra checking. **Example** -.. code-block: objc +.. code-block:: objc #import <Foundation/Foundation.h>; @@ -597,7 +634,6 @@ returned object. LIBKERN_RETURNS_NOT_RETAINED OSObject *myFieldGetter(); } - // Note that the annotation only has to be applied to the function declaration. OSObject * MyClass::myFieldGetter() { return f; diff --git clang/include/clang-c/Index.h clang/include/clang-c/Index.h index cc7c65b15088..61e361faabda 100644 --- clang/include/clang-c/Index.h +++ clang/include/clang-c/Index.h @@ -2206,7 +2206,11 @@ enum CXCursorKind { */ CXCursor_OpenACCUpdateConstruct = 331, - CXCursor_LastStmt = CXCursor_OpenACCUpdateConstruct, + /** OpenACC atomic Construct. + */ + CXCursor_OpenACCAtomicConstruct = 332, + + CXCursor_LastStmt = CXCursor_OpenACCAtomicConstruct, /** * Cursor that represents the translation unit itself. 
diff --git clang/include/clang/AST/Decl.h clang/include/clang/AST/Decl.h index f305cbbce4c6..499d27a9be5a 100644 --- clang/include/clang/AST/Decl.h +++ clang/include/clang/AST/Decl.h @@ -2298,13 +2298,6 @@ public: FunctionDeclBits.IsLateTemplateParsed = ILT; } - bool isInstantiatedFromMemberTemplate() const { - return FunctionDeclBits.IsInstantiatedFromMemberTemplate; - } - void setInstantiatedFromMemberTemplate(bool Val = true) { - FunctionDeclBits.IsInstantiatedFromMemberTemplate = Val; - } - /// Whether this function is "trivial" in some specialized C++ senses. /// Can only be true for default constructors, copy constructors, /// copy assignment operators, and destructors. Not meaningful until diff --git clang/include/clang/AST/DeclBase.h clang/include/clang/AST/DeclBase.h index 3a13309a6100..3bb82c1572ef 100644 --- clang/include/clang/AST/DeclBase.h +++ clang/include/clang/AST/DeclBase.h @@ -1257,8 +1257,11 @@ public: int64_t getID() const; /// Looks through the Decl's underlying type to extract a FunctionType - /// when possible. Will return null if the type underlying the Decl does not - /// have a FunctionType. + /// when possible. This covers direct FunctionDecls, along with various + /// function types and typedefs: function pointers/references, member + /// function pointers, and, if \p BlocksToo is set, Objective-C block + /// pointers. Returns nullptr if the type underlying the + /// Decl does not have a FunctionType. const FunctionType *getFunctionType(bool BlocksToo = true) const; // Looks through the Decl's underlying type to determine if it's a @@ -1777,8 +1780,6 @@ protected: uint64_t HasImplicitReturnZero : 1; LLVM_PREFERRED_TYPE(bool) uint64_t IsLateTemplateParsed : 1; - LLVM_PREFERRED_TYPE(bool) - uint64_t IsInstantiatedFromMemberTemplate : 1; /// Kind of contexpr specifier as defined by ConstexprSpecKind. LLVM_PREFERRED_TYPE(ConstexprSpecKind) @@ -1829,7 +1830,7 @@ protected: }; /// Number of inherited and non-inherited bits in FunctionDeclBitfields. - enum { NumFunctionDeclBits = NumDeclContextBits + 32 }; + enum { NumFunctionDeclBits = NumDeclContextBits + 31 }; /// Stores the bits used by CXXConstructorDecl. If modified /// NumCXXConstructorDeclBits and the accessor @@ -1840,12 +1841,12 @@ protected: LLVM_PREFERRED_TYPE(FunctionDeclBitfields) uint64_t : NumFunctionDeclBits; - /// 19 bits to fit in the remaining available space. + /// 20 bits to fit in the remaining available space. /// Note that this makes CXXConstructorDeclBitfields take /// exactly 64 bits and thus the width of NumCtorInitializers /// will need to be shrunk if some bit is added to NumDeclContextBitfields, /// NumFunctionDeclBitfields or CXXConstructorDeclBitfields. - uint64_t NumCtorInitializers : 16; + uint64_t NumCtorInitializers : 17; LLVM_PREFERRED_TYPE(bool) uint64_t IsInheritingConstructor : 1; @@ -1859,7 +1860,7 @@ protected: }; /// Number of inherited and non-inherited bits in CXXConstructorDeclBitfields. - enum { NumCXXConstructorDeclBits = NumFunctionDeclBits + 19 }; + enum { NumCXXConstructorDeclBits = NumFunctionDeclBits + 20 }; /// Stores the bits used by ObjCMethodDecl. 
/// If modified NumObjCMethodDeclBits and the accessor diff --git clang/include/clang/AST/DeclTemplate.h clang/include/clang/AST/DeclTemplate.h index 0c706036ff70..9ecff2c898ac 100644 --- clang/include/clang/AST/DeclTemplate.h +++ clang/include/clang/AST/DeclTemplate.h @@ -1011,15 +1011,6 @@ public: return getTemplatedDecl()->isThisDeclarationADefinition(); } - bool isCompatibleWithDefinition() const { - return getTemplatedDecl()->isInstantiatedFromMemberTemplate() || - isThisDeclarationADefinition(); - } - void setInstantiatedFromMemberTemplate(FunctionTemplateDecl *D) { - getTemplatedDecl()->setInstantiatedFromMemberTemplate(); - RedeclarableTemplateDecl::setInstantiatedFromMemberTemplate(D); - } - /// Return the specialization with the provided arguments if it exists, /// otherwise return the insertion point. FunctionDecl *findSpecialization(ArrayRef<TemplateArgument> Args, diff --git clang/include/clang/AST/DynamicRecursiveASTVisitor.h clang/include/clang/AST/DynamicRecursiveASTVisitor.h index 4e0ba568263b..703cca22777a 100644 --- clang/include/clang/AST/DynamicRecursiveASTVisitor.h +++ clang/include/clang/AST/DynamicRecursiveASTVisitor.h @@ -251,11 +251,11 @@ public: // Decls. #define ABSTRACT_DECL(DECL) #define DECL(CLASS, BASE) \ + bool WalkUpFrom##CLASS##Decl(MaybeConst<CLASS##Decl> *D); \ virtual bool Traverse##CLASS##Decl(MaybeConst<CLASS##Decl> *D); #include "clang/AST/DeclNodes.inc" #define DECL(CLASS, BASE) \ - bool WalkUpFrom##CLASS##Decl(MaybeConst<CLASS##Decl> *D); \ virtual bool Visit##CLASS##Decl(MaybeConst<CLASS##Decl> *D) { return true; } #include "clang/AST/DeclNodes.inc" @@ -272,11 +272,11 @@ public: // Types. #define ABSTRACT_TYPE(CLASS, BASE) #define TYPE(CLASS, BASE) \ + bool WalkUpFrom##CLASS##Type(MaybeConst<CLASS##Type> *T); \ virtual bool Traverse##CLASS##Type(MaybeConst<CLASS##Type> *T); #include "clang/AST/TypeNodes.inc" #define TYPE(CLASS, BASE) \ - bool WalkUpFrom##CLASS##Type(MaybeConst<CLASS##Type> *T); \ virtual bool Visit##CLASS##Type(MaybeConst<CLASS##Type> *T) { return true; } #include "clang/AST/TypeNodes.inc" diff --git clang/include/clang/AST/ExprCXX.h clang/include/clang/AST/ExprCXX.h index 0b6c8cfb163c..98ba2bb41bb5 100644 --- clang/include/clang/AST/ExprCXX.h +++ clang/include/clang/AST/ExprCXX.h @@ -5040,7 +5040,7 @@ public: } const FieldDecl *getInitializedFieldInUnion() const { - return ArrayFillerOrUnionFieldInit.dyn_cast<FieldDecl *>(); + return dyn_cast_if_present<FieldDecl *>(ArrayFillerOrUnionFieldInit); } child_range children() { diff --git clang/include/clang/AST/RecursiveASTVisitor.h clang/include/clang/AST/RecursiveASTVisitor.h index 5f4c39b9cbdb..55505794e705 100644 --- clang/include/clang/AST/RecursiveASTVisitor.h +++ clang/include/clang/AST/RecursiveASTVisitor.h @@ -4099,6 +4099,8 @@ DEF_TRAVERSE_STMT(OpenACCSetConstruct, { TRY_TO(VisitOpenACCClauseList(S->clauses())); }) DEF_TRAVERSE_STMT(OpenACCUpdateConstruct, { TRY_TO(VisitOpenACCClauseList(S->clauses())); }) +DEF_TRAVERSE_STMT(OpenACCAtomicConstruct, + { TRY_TO(TraverseOpenACCAssociatedStmtConstruct(S)); }) // Traverse HLSL: Out argument expression DEF_TRAVERSE_STMT(HLSLOutArgExpr, {}) diff --git clang/include/clang/AST/StmtOpenACC.h clang/include/clang/AST/StmtOpenACC.h index ebbee152f918..bd6c95d342ce 100644 --- clang/include/clang/AST/StmtOpenACC.h +++ clang/include/clang/AST/StmtOpenACC.h @@ -751,5 +751,50 @@ public: Create(const ASTContext &C, SourceLocation Start, SourceLocation DirectiveLoc, SourceLocation End, ArrayRef<const OpenACCClause *> Clauses); }; + +// This 
class represents the 'atomic' construct, which has an associated +// statement, but no clauses. +class OpenACCAtomicConstruct final : public OpenACCAssociatedStmtConstruct { + + friend class ASTStmtReader; + OpenACCAtomicKind AtomicKind = OpenACCAtomicKind::None; + + OpenACCAtomicConstruct(EmptyShell) + : OpenACCAssociatedStmtConstruct( + OpenACCAtomicConstructClass, OpenACCDirectiveKind::Atomic, + SourceLocation{}, SourceLocation{}, SourceLocation{}, + /*AssociatedStmt=*/nullptr) {} + + OpenACCAtomicConstruct(SourceLocation Start, SourceLocation DirectiveLoc, + OpenACCAtomicKind AtKind, SourceLocation End, + Stmt *AssociatedStmt) + : OpenACCAssociatedStmtConstruct(OpenACCAtomicConstructClass, + OpenACCDirectiveKind::Atomic, Start, + DirectiveLoc, End, AssociatedStmt), + AtomicKind(AtKind) {} + + void setAssociatedStmt(Stmt *S) { + OpenACCAssociatedStmtConstruct::setAssociatedStmt(S); + } + +public: + static bool classof(const Stmt *T) { + return T->getStmtClass() == OpenACCAtomicConstructClass; + } + + static OpenACCAtomicConstruct *CreateEmpty(const ASTContext &C); + static OpenACCAtomicConstruct * + Create(const ASTContext &C, SourceLocation Start, SourceLocation DirectiveLoc, + OpenACCAtomicKind AtKind, SourceLocation End, Stmt *AssociatedStmt); + + OpenACCAtomicKind getAtomicKind() const { return AtomicKind; } + const Stmt *getAssociatedStmt() const { + return OpenACCAssociatedStmtConstruct::getAssociatedStmt(); + } + Stmt *getAssociatedStmt() { + return OpenACCAssociatedStmtConstruct::getAssociatedStmt(); + } +}; + } // namespace clang #endif // LLVM_CLANG_AST_STMTOPENACC_H diff --git clang/include/clang/AST/TextNodeDumper.h clang/include/clang/AST/TextNodeDumper.h index 4aaae48ba8b4..bfd205ffb0d9 100644 --- clang/include/clang/AST/TextNodeDumper.h +++ clang/include/clang/AST/TextNodeDumper.h @@ -420,6 +420,7 @@ public: void VisitOpenACCSetConstruct(const OpenACCSetConstruct *S); void VisitOpenACCShutdownConstruct(const OpenACCShutdownConstruct *S); void VisitOpenACCUpdateConstruct(const OpenACCUpdateConstruct *S); + void VisitOpenACCAtomicConstruct(const OpenACCAtomicConstruct *S); void VisitOpenACCAsteriskSizeExpr(const OpenACCAsteriskSizeExpr *S); void VisitEmbedExpr(const EmbedExpr *S); void VisitAtomicExpr(const AtomicExpr *AE); diff --git clang/include/clang/Basic/AArch64SVEACLETypes.def clang/include/clang/Basic/AArch64SVEACLETypes.def index a408bb0c5405..6a6f51c95ebd 100644 --- clang/include/clang/Basic/AArch64SVEACLETypes.def +++ clang/include/clang/Basic/AArch64SVEACLETypes.def @@ -109,97 +109,97 @@ //===- Vector point types -----------------------------------------------===// -SVE_VECTOR_TYPE_INT("__SVInt8_t", "__SVInt8_t", SveInt8, SveInt8Ty, 16, 8, 1, true) -SVE_VECTOR_TYPE_INT("__SVInt16_t", "__SVInt16_t", SveInt16, SveInt16Ty, 8, 16, 1, true) -SVE_VECTOR_TYPE_INT("__SVInt32_t", "__SVInt32_t", SveInt32, SveInt32Ty, 4, 32, 1, true) -SVE_VECTOR_TYPE_INT("__SVInt64_t", "__SVInt64_t", SveInt64, SveInt64Ty, 2, 64, 1, true) +SVE_VECTOR_TYPE_INT(__SVInt8_t, __SVInt8_t, SveInt8, SveInt8Ty, 16, 8, 1, true) +SVE_VECTOR_TYPE_INT(__SVInt16_t, __SVInt16_t, SveInt16, SveInt16Ty, 8, 16, 1, true) +SVE_VECTOR_TYPE_INT(__SVInt32_t, __SVInt32_t, SveInt32, SveInt32Ty, 4, 32, 1, true) +SVE_VECTOR_TYPE_INT(__SVInt64_t, __SVInt64_t, SveInt64, SveInt64Ty, 2, 64, 1, true) -SVE_VECTOR_TYPE_INT("__SVUint8_t", "__SVUint8_t", SveUint8, SveUint8Ty, 16, 8, 1, false) -SVE_VECTOR_TYPE_INT("__SVUint16_t", "__SVUint16_t", SveUint16, SveUint16Ty, 8, 16, 1, false) -SVE_VECTOR_TYPE_INT("__SVUint32_t", 
"__SVUint32_t", SveUint32, SveUint32Ty, 4, 32, 1, false) -SVE_VECTOR_TYPE_INT("__SVUint64_t", "__SVUint64_t", SveUint64, SveUint64Ty, 2, 64, 1, false) +SVE_VECTOR_TYPE_INT(__SVUint8_t, __SVUint8_t, SveUint8, SveUint8Ty, 16, 8, 1, false) +SVE_VECTOR_TYPE_INT(__SVUint16_t, __SVUint16_t, SveUint16, SveUint16Ty, 8, 16, 1, false) +SVE_VECTOR_TYPE_INT(__SVUint32_t, __SVUint32_t, SveUint32, SveUint32Ty, 4, 32, 1, false) +SVE_VECTOR_TYPE_INT(__SVUint64_t, __SVUint64_t, SveUint64, SveUint64Ty, 2, 64, 1, false) -SVE_VECTOR_TYPE_FLOAT("__SVFloat16_t", "__SVFloat16_t", SveFloat16, SveFloat16Ty, 8, 16, 1) -SVE_VECTOR_TYPE_FLOAT("__SVFloat32_t", "__SVFloat32_t", SveFloat32, SveFloat32Ty, 4, 32, 1) -SVE_VECTOR_TYPE_FLOAT("__SVFloat64_t", "__SVFloat64_t", SveFloat64, SveFloat64Ty, 2, 64, 1) +SVE_VECTOR_TYPE_FLOAT(__SVFloat16_t, __SVFloat16_t, SveFloat16, SveFloat16Ty, 8, 16, 1) +SVE_VECTOR_TYPE_FLOAT(__SVFloat32_t, __SVFloat32_t, SveFloat32, SveFloat32Ty, 4, 32, 1) +SVE_VECTOR_TYPE_FLOAT(__SVFloat64_t, __SVFloat64_t, SveFloat64, SveFloat64Ty, 2, 64, 1) -SVE_VECTOR_TYPE_BFLOAT("__SVBfloat16_t", "__SVBfloat16_t", SveBFloat16, SveBFloat16Ty, 8, 16, 1) +SVE_VECTOR_TYPE_BFLOAT(__SVBfloat16_t, __SVBfloat16_t, SveBFloat16, SveBFloat16Ty, 8, 16, 1) -SVE_VECTOR_TYPE_MFLOAT("__SVMfloat8_t", "__SVMfloat8_t", SveMFloat8, SveMFloat8Ty, 16, 8, 1) +SVE_VECTOR_TYPE_MFLOAT(__SVMfloat8_t, __SVMfloat8_t, SveMFloat8, SveMFloat8Ty, 16, 8, 1) // // x2 // -SVE_VECTOR_TYPE_INT("__clang_svint8x2_t", "svint8x2_t", SveInt8x2, SveInt8x2Ty, 16, 8, 2, true) -SVE_VECTOR_TYPE_INT("__clang_svint16x2_t", "svint16x2_t", SveInt16x2, SveInt16x2Ty, 8, 16, 2, true) -SVE_VECTOR_TYPE_INT("__clang_svint32x2_t", "svint32x2_t", SveInt32x2, SveInt32x2Ty, 4, 32, 2, true) -SVE_VECTOR_TYPE_INT("__clang_svint64x2_t", "svint64x2_t", SveInt64x2, SveInt64x2Ty, 2, 64, 2, true) +SVE_VECTOR_TYPE_INT(__clang_svint8x2_t, svint8x2_t, SveInt8x2, SveInt8x2Ty, 16, 8, 2, true) +SVE_VECTOR_TYPE_INT(__clang_svint16x2_t, svint16x2_t, SveInt16x2, SveInt16x2Ty, 8, 16, 2, true) +SVE_VECTOR_TYPE_INT(__clang_svint32x2_t, svint32x2_t, SveInt32x2, SveInt32x2Ty, 4, 32, 2, true) +SVE_VECTOR_TYPE_INT(__clang_svint64x2_t, svint64x2_t, SveInt64x2, SveInt64x2Ty, 2, 64, 2, true) -SVE_VECTOR_TYPE_INT("__clang_svuint8x2_t", "svuint8x2_t", SveUint8x2, SveUint8x2Ty, 16 , 8, 2, false) -SVE_VECTOR_TYPE_INT("__clang_svuint16x2_t", "svuint16x2_t", SveUint16x2, SveUint16x2Ty, 8, 16, 2, false) -SVE_VECTOR_TYPE_INT("__clang_svuint32x2_t", "svuint32x2_t", SveUint32x2, SveUint32x2Ty, 4, 32, 2, false) -SVE_VECTOR_TYPE_INT("__clang_svuint64x2_t", "svuint64x2_t", SveUint64x2, SveUint64x2Ty, 2, 64, 2, false) +SVE_VECTOR_TYPE_INT(__clang_svuint8x2_t, svuint8x2_t, SveUint8x2, SveUint8x2Ty, 16 , 8, 2, false) +SVE_VECTOR_TYPE_INT(__clang_svuint16x2_t, svuint16x2_t, SveUint16x2, SveUint16x2Ty, 8, 16, 2, false) +SVE_VECTOR_TYPE_INT(__clang_svuint32x2_t, svuint32x2_t, SveUint32x2, SveUint32x2Ty, 4, 32, 2, false) +SVE_VECTOR_TYPE_INT(__clang_svuint64x2_t, svuint64x2_t, SveUint64x2, SveUint64x2Ty, 2, 64, 2, false) -SVE_VECTOR_TYPE_FLOAT("__clang_svfloat16x2_t", "svfloat16x2_t", SveFloat16x2, SveFloat16x2Ty, 8, 16, 2) -SVE_VECTOR_TYPE_FLOAT("__clang_svfloat32x2_t", "svfloat32x2_t", SveFloat32x2, SveFloat32x2Ty, 4, 32, 2) -SVE_VECTOR_TYPE_FLOAT("__clang_svfloat64x2_t", "svfloat64x2_t", SveFloat64x2, SveFloat64x2Ty, 2, 64, 2) +SVE_VECTOR_TYPE_FLOAT(__clang_svfloat16x2_t, svfloat16x2_t, SveFloat16x2, SveFloat16x2Ty, 8, 16, 2) +SVE_VECTOR_TYPE_FLOAT(__clang_svfloat32x2_t, svfloat32x2_t, SveFloat32x2, 
SveFloat32x2Ty, 4, 32, 2) +SVE_VECTOR_TYPE_FLOAT(__clang_svfloat64x2_t, svfloat64x2_t, SveFloat64x2, SveFloat64x2Ty, 2, 64, 2) -SVE_VECTOR_TYPE_BFLOAT("__clang_svbfloat16x2_t", "svbfloat16x2_t", SveBFloat16x2, SveBFloat16x2Ty, 8, 16, 2) +SVE_VECTOR_TYPE_BFLOAT(__clang_svbfloat16x2_t, svbfloat16x2_t, SveBFloat16x2, SveBFloat16x2Ty, 8, 16, 2) -SVE_VECTOR_TYPE_MFLOAT("__clang_svmfloat8x2_t", "svmfloat8x2_t", SveMFloat8x2, SveMFloat8x2Ty, 16, 8, 2) +SVE_VECTOR_TYPE_MFLOAT(__clang_svmfloat8x2_t, svmfloat8x2_t, SveMFloat8x2, SveMFloat8x2Ty, 16, 8, 2) // // x3 // -SVE_VECTOR_TYPE_INT("__clang_svint8x3_t", "svint8x3_t", SveInt8x3, SveInt8x3Ty, 16, 8, 3, true) -SVE_VECTOR_TYPE_INT("__clang_svint16x3_t", "svint16x3_t", SveInt16x3, SveInt16x3Ty, 8, 16, 3, true) -SVE_VECTOR_TYPE_INT("__clang_svint32x3_t", "svint32x3_t", SveInt32x3, SveInt32x3Ty, 4, 32, 3, true) -SVE_VECTOR_TYPE_INT("__clang_svint64x3_t", "svint64x3_t", SveInt64x3, SveInt64x3Ty, 2, 64, 3, true) +SVE_VECTOR_TYPE_INT(__clang_svint8x3_t, svint8x3_t, SveInt8x3, SveInt8x3Ty, 16, 8, 3, true) +SVE_VECTOR_TYPE_INT(__clang_svint16x3_t, svint16x3_t, SveInt16x3, SveInt16x3Ty, 8, 16, 3, true) +SVE_VECTOR_TYPE_INT(__clang_svint32x3_t, svint32x3_t, SveInt32x3, SveInt32x3Ty, 4, 32, 3, true) +SVE_VECTOR_TYPE_INT(__clang_svint64x3_t, svint64x3_t, SveInt64x3, SveInt64x3Ty, 2, 64, 3, true) -SVE_VECTOR_TYPE_INT("__clang_svuint8x3_t", "svuint8x3_t", SveUint8x3, SveUint8x3Ty, 16, 8, 3, false) -SVE_VECTOR_TYPE_INT("__clang_svuint16x3_t", "svuint16x3_t", SveUint16x3, SveUint16x3Ty, 8, 16, 3, false) -SVE_VECTOR_TYPE_INT("__clang_svuint32x3_t", "svuint32x3_t", SveUint32x3, SveUint32x3Ty, 4, 32, 3, false) -SVE_VECTOR_TYPE_INT("__clang_svuint64x3_t", "svuint64x3_t", SveUint64x3, SveUint64x3Ty, 2, 64, 3, false) +SVE_VECTOR_TYPE_INT(__clang_svuint8x3_t, svuint8x3_t, SveUint8x3, SveUint8x3Ty, 16, 8, 3, false) +SVE_VECTOR_TYPE_INT(__clang_svuint16x3_t, svuint16x3_t, SveUint16x3, SveUint16x3Ty, 8, 16, 3, false) +SVE_VECTOR_TYPE_INT(__clang_svuint32x3_t, svuint32x3_t, SveUint32x3, SveUint32x3Ty, 4, 32, 3, false) +SVE_VECTOR_TYPE_INT(__clang_svuint64x3_t, svuint64x3_t, SveUint64x3, SveUint64x3Ty, 2, 64, 3, false) -SVE_VECTOR_TYPE_FLOAT("__clang_svfloat16x3_t", "svfloat16x3_t", SveFloat16x3, SveFloat16x3Ty, 8, 16, 3) -SVE_VECTOR_TYPE_FLOAT("__clang_svfloat32x3_t", "svfloat32x3_t", SveFloat32x3, SveFloat32x3Ty, 4, 32, 3) -SVE_VECTOR_TYPE_FLOAT("__clang_svfloat64x3_t", "svfloat64x3_t", SveFloat64x3, SveFloat64x3Ty, 2, 64, 3) +SVE_VECTOR_TYPE_FLOAT(__clang_svfloat16x3_t, svfloat16x3_t, SveFloat16x3, SveFloat16x3Ty, 8, 16, 3) +SVE_VECTOR_TYPE_FLOAT(__clang_svfloat32x3_t, svfloat32x3_t, SveFloat32x3, SveFloat32x3Ty, 4, 32, 3) +SVE_VECTOR_TYPE_FLOAT(__clang_svfloat64x3_t, svfloat64x3_t, SveFloat64x3, SveFloat64x3Ty, 2, 64, 3) -SVE_VECTOR_TYPE_BFLOAT("__clang_svbfloat16x3_t", "svbfloat16x3_t", SveBFloat16x3, SveBFloat16x3Ty, 8, 16, 3) +SVE_VECTOR_TYPE_BFLOAT(__clang_svbfloat16x3_t, svbfloat16x3_t, SveBFloat16x3, SveBFloat16x3Ty, 8, 16, 3) -SVE_VECTOR_TYPE_MFLOAT("__clang_svmfloat8x3_t", "svmfloat8x3_t", SveMFloat8x3, SveMFloat8x3Ty, 16, 8, 3) +SVE_VECTOR_TYPE_MFLOAT(__clang_svmfloat8x3_t, svmfloat8x3_t, SveMFloat8x3, SveMFloat8x3Ty, 16, 8, 3) // // x4 // -SVE_VECTOR_TYPE_INT("__clang_svint8x4_t", "svint8x4_t", SveInt8x4, SveInt8x4Ty, 16, 8, 4, true) -SVE_VECTOR_TYPE_INT("__clang_svint16x4_t", "svint16x4_t", SveInt16x4, SveInt16x4Ty, 8, 16, 4, true) -SVE_VECTOR_TYPE_INT("__clang_svint32x4_t", "svint32x4_t", SveInt32x4, SveInt32x4Ty, 4, 32, 4, true) 
-SVE_VECTOR_TYPE_INT("__clang_svint64x4_t", "svint64x4_t", SveInt64x4, SveInt64x4Ty, 2, 64, 4, true) +SVE_VECTOR_TYPE_INT(__clang_svint8x4_t, svint8x4_t, SveInt8x4, SveInt8x4Ty, 16, 8, 4, true) +SVE_VECTOR_TYPE_INT(__clang_svint16x4_t, svint16x4_t, SveInt16x4, SveInt16x4Ty, 8, 16, 4, true) +SVE_VECTOR_TYPE_INT(__clang_svint32x4_t, svint32x4_t, SveInt32x4, SveInt32x4Ty, 4, 32, 4, true) +SVE_VECTOR_TYPE_INT(__clang_svint64x4_t, svint64x4_t, SveInt64x4, SveInt64x4Ty, 2, 64, 4, true) -SVE_VECTOR_TYPE_INT("__clang_svuint8x4_t", "svuint8x4_t", SveUint8x4, SveUint8x4Ty, 16, 8, 4, false) -SVE_VECTOR_TYPE_INT("__clang_svuint16x4_t", "svuint16x4_t", SveUint16x4, SveUint16x4Ty, 8, 16, 4, false) -SVE_VECTOR_TYPE_INT("__clang_svuint32x4_t", "svuint32x4_t", SveUint32x4, SveUint32x4Ty, 4, 32, 4, false) -SVE_VECTOR_TYPE_INT("__clang_svuint64x4_t", "svuint64x4_t", SveUint64x4, SveUint64x4Ty, 2, 64, 4, false) +SVE_VECTOR_TYPE_INT(__clang_svuint8x4_t, svuint8x4_t, SveUint8x4, SveUint8x4Ty, 16, 8, 4, false) +SVE_VECTOR_TYPE_INT(__clang_svuint16x4_t, svuint16x4_t, SveUint16x4, SveUint16x4Ty, 8, 16, 4, false) +SVE_VECTOR_TYPE_INT(__clang_svuint32x4_t, svuint32x4_t, SveUint32x4, SveUint32x4Ty, 4, 32, 4, false) +SVE_VECTOR_TYPE_INT(__clang_svuint64x4_t, svuint64x4_t, SveUint64x4, SveUint64x4Ty, 2, 64, 4, false) -SVE_VECTOR_TYPE_FLOAT("__clang_svfloat16x4_t", "svfloat16x4_t", SveFloat16x4, SveFloat16x4Ty, 8, 16, 4) -SVE_VECTOR_TYPE_FLOAT("__clang_svfloat32x4_t", "svfloat32x4_t", SveFloat32x4, SveFloat32x4Ty, 4, 32, 4) -SVE_VECTOR_TYPE_FLOAT("__clang_svfloat64x4_t", "svfloat64x4_t", SveFloat64x4, SveFloat64x4Ty, 2, 64, 4) +SVE_VECTOR_TYPE_FLOAT(__clang_svfloat16x4_t, svfloat16x4_t, SveFloat16x4, SveFloat16x4Ty, 8, 16, 4) +SVE_VECTOR_TYPE_FLOAT(__clang_svfloat32x4_t, svfloat32x4_t, SveFloat32x4, SveFloat32x4Ty, 4, 32, 4) +SVE_VECTOR_TYPE_FLOAT(__clang_svfloat64x4_t, svfloat64x4_t, SveFloat64x4, SveFloat64x4Ty, 2, 64, 4) -SVE_VECTOR_TYPE_BFLOAT("__clang_svbfloat16x4_t", "svbfloat16x4_t", SveBFloat16x4, SveBFloat16x4Ty, 8, 16, 4) +SVE_VECTOR_TYPE_BFLOAT(__clang_svbfloat16x4_t, svbfloat16x4_t, SveBFloat16x4, SveBFloat16x4Ty, 8, 16, 4) -SVE_VECTOR_TYPE_MFLOAT("__clang_svmfloat8x4_t", "svmfloat8x4_t", SveMFloat8x4, SveMFloat8x4Ty, 16, 8, 4) +SVE_VECTOR_TYPE_MFLOAT(__clang_svmfloat8x4_t, svmfloat8x4_t, SveMFloat8x4, SveMFloat8x4Ty, 16, 8, 4) -SVE_PREDICATE_TYPE_ALL("__SVBool_t", "__SVBool_t", SveBool, SveBoolTy, 16, 1) -SVE_PREDICATE_TYPE_ALL("__clang_svboolx2_t", "svboolx2_t", SveBoolx2, SveBoolx2Ty, 16, 2) -SVE_PREDICATE_TYPE_ALL("__clang_svboolx4_t", "svboolx4_t", SveBoolx4, SveBoolx4Ty, 16, 4) +SVE_PREDICATE_TYPE_ALL(__SVBool_t, __SVBool_t, SveBool, SveBoolTy, 16, 1) +SVE_PREDICATE_TYPE_ALL(__clang_svboolx2_t, svboolx2_t, SveBoolx2, SveBoolx2Ty, 16, 2) +SVE_PREDICATE_TYPE_ALL(__clang_svboolx4_t, svboolx4_t, SveBoolx4, SveBoolx4Ty, 16, 4) -SVE_OPAQUE_TYPE("__SVCount_t", "__SVCount_t", SveCount, SveCountTy) +SVE_OPAQUE_TYPE(__SVCount_t, __SVCount_t, SveCount, SveCountTy) -SVE_SCALAR_TYPE("__mfp8", "__mfp8", MFloat8, MFloat8Ty, 8) +SVE_SCALAR_TYPE(__mfp8, __mfp8, MFloat8, MFloat8Ty, 8) #undef SVE_VECTOR_TYPE #undef SVE_VECTOR_TYPE_MFLOAT diff --git clang/include/clang/Basic/Attr.td clang/include/clang/Basic/Attr.td index f4ba2bc3c6de..2a3a29bd2ee1 100644 --- clang/include/clang/Basic/Attr.td +++ clang/include/clang/Basic/Attr.td @@ -198,7 +198,7 @@ def OpenCLKernelFunction // inclusive nature of subject testing). 
def HasFunctionProto : SubsetSubject<DeclBase, [{(S->getFunctionType(true) != nullptr && - isa<FunctionProtoType>(S->getFunctionType())) || + isa<FunctionProtoType>(S->getFunctionType())) || isa<ObjCMethodDecl>(S) || isa<BlockDecl>(S)}], "non-K&R-style functions">; diff --git clang/include/clang/Basic/BuiltinsX86.td clang/include/clang/Basic/BuiltinsX86.td index 572ac7235be0..7f3c52ae329d 100644 --- clang/include/clang/Basic/BuiltinsX86.td +++ clang/include/clang/Basic/BuiltinsX86.td @@ -130,6 +130,10 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<128>] in { } } +let Features = "sse", Header = "xmmintrin.h", Attributes = [NoThrow, Const] in { + def _mm_prefetch : X86LibBuiltin<"void(void const *, int)">; +} + // AVX let Attributes = [Const, NoThrow, RequiredVectorWidth<256>], Features = "avx" in { foreach Op = ["addsub", "hadd", "hsub", "max", "min"] in { @@ -138,6 +142,12 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<256>], Features = "avx" in } } +// PRFCHW +let Features = "prfchw", Header = "intrin.h", Attributes = [NoThrow, Const] in { + def _m_prefetch : X86LibBuiltin<"void(void *)">; + def _m_prefetchw : X86LibBuiltin<"void(void volatile const *)">; +} + // Mechanically ported builtins from the original `.def` file. // @@ -146,10 +156,6 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<256>], Features = "avx" in // current formulation is based on what was easiest to recognize from the // pre-TableGen version. -let Features = "mmx", Attributes = [NoThrow, Const] in { - def _mm_prefetch : X86NoPrefixBuiltin<"void(char const *, int)">; -} - let Features = "sse", Attributes = [NoThrow] in { def ldmxcsr : X86Builtin<"void(unsigned int)">; } diff --git clang/include/clang/Basic/CodeGenOptions.def clang/include/clang/Basic/CodeGenOptions.def index 259972bdf8f0..386652d2efa9 100644 --- clang/include/clang/Basic/CodeGenOptions.def +++ clang/include/clang/Basic/CodeGenOptions.def @@ -465,6 +465,10 @@ ENUM_CODEGENOPT(ZeroCallUsedRegs, llvm::ZeroCallUsedRegs::ZeroCallUsedRegsKind, /// non-deleting destructors. (No effect on Microsoft ABI.) CODEGENOPT(CtorDtorReturnThis, 1, 0) +/// Enables emitting Import Call sections on supported targets that can be used +/// by the Windows kernel to enable import call optimization. +CODEGENOPT(ImportCallOptimization, 1, 0) + /// FIXME: Make DebugOptions its own top-level .def file. #include "DebugOptions.def" diff --git clang/include/clang/Basic/DiagnosticGroups.td clang/include/clang/Basic/DiagnosticGroups.td index 527e588d46a0..05e39899e6f2 100644 --- clang/include/clang/Basic/DiagnosticGroups.td +++ clang/include/clang/Basic/DiagnosticGroups.td @@ -694,6 +694,36 @@ def SuspiciousMemaccess : DiagGroup<"suspicious-memaccess", NonTrivialMemaccess, MemsetTransposedArgs, SuspiciousBzero]>; def StaticInInline : DiagGroup<"static-in-inline">; def StaticLocalInInline : DiagGroup<"static-local-in-inline">; +def UniqueObjectDuplication : DiagGroup<"unique-object-duplication"> { + code Documentation = [{ +Warns when objects which are supposed to be globally unique might get duplicated +when built into a shared library. + +If an object with hidden visibility is built into a shared library, each instance +of the library will get its own copy. This can cause very subtle bugs if there was +only supposed to be one copy of the object in question: singletons aren't single, +changes to one object won't affect the others, the object's initializer will run +once per copy, etc. + +Specifically, this warning fires when it detects an object which: + 1. 
Appears in a header file (so it might get compiled into multiple libraries), and
+  2. Has external linkage (an object with internal linkage is meant to be duplicated), and
+  3. Has hidden visibility.
+
+As well as one of the following:
+  1. The object is mutable, or
+  2. The object's initializer definitely has side effects.
+
+The warning is best resolved by making the object ``const`` (if possible), or by explicitly
+giving the object non-hidden visibility, e.g. using ``__attribute__((visibility("default")))``.
+Note that all levels of a pointer variable must be constant; ``const int*`` will
+trigger the warning because the pointer itself is mutable.
+
+This warning is currently disabled on Windows since it uses import/export rules
+instead of visibility.
+}];
+}
+
 def GNUStaticFloatInit : DiagGroup<"gnu-static-float-init">;
 def StaticFloatInit : DiagGroup<"static-float-init", [GNUStaticFloatInit]>;
 // Allow differentiation between GNU statement expressions in a macro versus
@@ -1304,6 +1334,8 @@ def MicrosoftStaticAssert : DiagGroup<"microsoft-static-assert">;
 def MicrosoftInitFromPredefined : DiagGroup<"microsoft-init-from-predefined">;
 def MicrosoftStringLiteralFromPredefined : DiagGroup<
   "microsoft-string-literal-from-predefined">;
+def MicrosoftInlineOnNonFunction : DiagGroup<
+  "microsoft-inline-on-non-function">;
 // Aliases.
 def : DiagGroup<"msvc-include", [MicrosoftInclude]>;
@@ -1322,7 +1354,7 @@ def Microsoft : DiagGroup<"microsoft",
     MicrosoftConstInit, MicrosoftVoidPseudoDtor, MicrosoftAnonTag,
     MicrosoftCommentPaste, MicrosoftEndOfFile, MicrosoftStaticAssert,
     MicrosoftInitFromPredefined, MicrosoftStringLiteralFromPredefined,
-    MicrosoftInconsistentDllImport]>;
+    MicrosoftInconsistentDllImport, MicrosoftInlineOnNonFunction]>;
 def ClangClPch : DiagGroup<"clang-cl-pch">;
diff --git clang/include/clang/Basic/DiagnosticSemaKinds.td clang/include/clang/Basic/DiagnosticSemaKinds.td
index 2ac3879a4caa..7b3b932c482b 100644
--- clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -482,6 +482,8 @@ def ext_use_out_of_scope_declaration : ExtWarn<
   InGroup<DiagGroup<"out-of-scope-function">>;
 def err_inline_non_function : Error<
   "'inline' can only appear on functions%select{| and non-local variables}0">;
+def warn_ms_inline_non_function : ExtWarn<err_inline_non_function.Summary>,
+  InGroup<MicrosoftInlineOnNonFunction>;
 def err_noreturn_non_function : Error<
   "'_Noreturn' can only appear on functions">;
 def warn_qual_return_type : Warning<
@@ -6111,6 +6113,8 @@ def note_deleted_special_member_class_subobject : Note<
   "destructor}5"
   "%select{||s||}4"
   "|is an ObjC pointer}6">;
+def note_default_constructed_field
+    : Note<"default constructed field %0 declared here">;
 def note_deleted_default_ctor_uninit_field : Note<
   "%select{default constructor of|constructor inherited by}0 "
   "%1 is implicitly deleted because field %2 of "
@@ -6165,6 +6169,15 @@ def warn_static_local_in_extern_inline : Warning<
 def note_convert_inline_to_static : Note<
   "use 'static' to give inline function %0 internal linkage">;
+def warn_possible_object_duplication_mutable : Warning<
+  "%0 may be duplicated when built into a shared library: "
+  "it is mutable, has hidden visibility, and external linkage">,
+  InGroup<UniqueObjectDuplication>, DefaultIgnore;
+def warn_possible_object_duplication_init : Warning<
+  "initialization of %0 may run twice when built into a shared library: "
+  "it has hidden visibility and external linkage">,
+  InGroup<UniqueObjectDuplication>, DefaultIgnore;
+
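To make these conditions concrete, here is an illustrative header (not part of
this patch) that would trip the new -Wunique-object-duplication warning when
compiled with -fvisibility=hidden, together with the two usual fixes:

    // dup.h: with hidden visibility, every shared library that includes
    // this header gets its own copy of 'counter'.
    inline int counter = 0;  // mutable + hidden + external linkage: warns

    // Fix 1: give the object default visibility so all libraries share one copy.
    __attribute__((visibility("default"))) inline int shared_counter = 0;

    // Fix 2: make the object constant, so duplicated copies are harmless.
    inline constexpr int max_count = 100;

 def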
ext_redefinition_of_typedef : ExtWarn< "redefinition of typedef %0 is a C11 feature">, InGroup<DiagGroup<"typedef-redefinition"> >; @@ -12901,6 +12914,48 @@ def err_acc_update_as_body : Error<"OpenACC 'update' construct may not appear in place of the " "statement following a%select{n if statement| while statement| do " "statement| switch statement| label statement}0">; +def err_acc_invalid_atomic + : Error<"statement associated with OpenACC 'atomic%select{| " + "%1}0' directive is invalid">; +def note_acc_atomic_expr_must_be + : Note<"expected " + "%enum_select<OACCAtomicExpr>{%Assign{assignment}|%UnaryCompAssign{" + "assignment, compound assignment, increment, or decrement}}0 " + "expression">; +def note_acc_atomic_unsupported_unary_operator + : Note<"unary operator not supported, only increment and decrement " + "operations permitted">; +def note_acc_atomic_unsupported_binary_operator + : Note<"binary operator not supported, only +, *, -, /, &, ^, |, <<, or >> " + "are permitted">; +def note_acc_atomic_unsupported_compound_binary_operator + : Note<"compound binary operator not supported, only +=, *=, -=, /=, &=, " + "^=, |=, <<=, or >>= are permitted">; + +def note_acc_atomic_operand_lvalue_scalar + : Note<"%select{left |right |}0operand to " + "%enum_select<OACCAtomicOpKind>{%Assign{assignment}|%CompoundAssign{" + "compound assignment}|%Inc{increment}|" + "%Dec{decrement}}1 " + "expression must be " + "%enum_select<OACCLValScalar>{%LVal{an l-value}|%Scalar{of scalar " + "type (was %3)}}2">; +def note_acc_atomic_too_many_stmts + : Note<"'atomic capture' with a compound statement only supports two " + "statements">; +def note_acc_atomic_expected_binop : Note<"expected binary operation on right " + "hand side of assignment operator">; +def note_acc_atomic_mismatch_operand + : Note<"left hand side of assignment operation('%0') must match one side " + "of the sub-operation on the right hand side('%1' and '%2')">; +def note_acc_atomic_mismatch_compound_operand + : Note<"variable %select{|in unary expression|on right hand side of " + "assignment|on left hand side of assignment|on left hand side of " + "compound assignment|on left hand side of assignment}2('%3') must " + "match variable used %select{|in unary expression|on right hand " + "side of assignment|<not possible>|on left hand side of compound " + "assignment|on left hand side of assignment}0('%1') from the first " + "statement">; // AMDGCN builtins diagnostics def err_amdgcn_global_load_lds_size_invalid_value : Error<"invalid size value">; diff --git clang/include/clang/Basic/LangOptions.h clang/include/clang/Basic/LangOptions.h index 114a5d34a008..16c35bcf4933 100644 --- clang/include/clang/Basic/LangOptions.h +++ clang/include/clang/Basic/LangOptions.h @@ -246,6 +246,8 @@ public: /// construction vtable because it hasn't added 'type' as a substitution. /// - Skip mangling enclosing class templates of member-like friend /// function templates. + /// - Ignore empty struct arguments in C++ mode for ARM, instead of + /// passing them as if they had a size of 1 byte. 
Ver19, /// Conform to the underlying platform's C and C++ ABIs as closely diff --git clang/include/clang/Basic/OpenACCKinds.h clang/include/clang/Basic/OpenACCKinds.h index 7fb76271826a..c2d7732123ef 100644 --- clang/include/clang/Basic/OpenACCKinds.h +++ clang/include/clang/Basic/OpenACCKinds.h @@ -171,9 +171,34 @@ enum class OpenACCAtomicKind : uint8_t { Write, Update, Capture, - Invalid, + None, }; +template <typename StreamTy> +inline StreamTy &printOpenACCAtomicKind(StreamTy &Out, OpenACCAtomicKind AK) { + switch (AK) { + case OpenACCAtomicKind::Read: + return Out << "read"; + case OpenACCAtomicKind::Write: + return Out << "write"; + case OpenACCAtomicKind::Update: + return Out << "update"; + case OpenACCAtomicKind::Capture: + return Out << "capture"; + case OpenACCAtomicKind::None: + return Out << "<none>"; + } + llvm_unreachable("unknown atomic kind"); +} +inline const StreamingDiagnostic &operator<<(const StreamingDiagnostic &Out, + OpenACCAtomicKind AK) { + return printOpenACCAtomicKind(Out, AK); +} +inline llvm::raw_ostream &operator<<(llvm::raw_ostream &Out, + OpenACCAtomicKind AK) { + return printOpenACCAtomicKind(Out, AK); +} + /// Represents the kind of an OpenACC clause. enum class OpenACCClauseKind : uint8_t { /// 'finalize' clause, allowed on 'exit data' directive. diff --git clang/include/clang/Basic/Sanitizers.h clang/include/clang/Basic/Sanitizers.h index fc0576d452b1..3782deb5962b 100644 --- clang/include/clang/Basic/Sanitizers.h +++ clang/include/clang/Basic/Sanitizers.h @@ -162,6 +162,11 @@ public: void set(SanitizerMask K, double V); void clear(SanitizerMask K = SanitizerKind::All); + + // Returns nullopt if all the values are zero. + // Otherwise, return value contains a vector of all the scaled values. + std::optional<std::vector<unsigned>> + getAllScaled(unsigned ScalingFactor) const; }; struct SanitizerSet { diff --git clang/include/clang/Basic/StmtNodes.td clang/include/clang/Basic/StmtNodes.td index 2fea05e322c7..d47e0a8157fc 100644 --- clang/include/clang/Basic/StmtNodes.td +++ clang/include/clang/Basic/StmtNodes.td @@ -319,6 +319,7 @@ def OpenACCInitConstruct : StmtNode<OpenACCConstructStmt>; def OpenACCShutdownConstruct : StmtNode<OpenACCConstructStmt>; def OpenACCSetConstruct : StmtNode<OpenACCConstructStmt>; def OpenACCUpdateConstruct : StmtNode<OpenACCConstructStmt>; +def OpenACCAtomicConstruct : StmtNode<OpenACCAssociatedStmtConstruct>; // OpenACC Additional Expressions. def OpenACCAsteriskSizeExpr : StmtNode<Expr>; diff --git clang/include/clang/Basic/TargetInfo.h clang/include/clang/Basic/TargetInfo.h index 43c09cf1f973..d762144478b4 100644 --- clang/include/clang/Basic/TargetInfo.h +++ clang/include/clang/Basic/TargetInfo.h @@ -1023,7 +1023,8 @@ public: /// Returns target-specific min and max values VScale_Range. 
virtual std::optional<std::pair<unsigned, unsigned>> - getVScaleRange(const LangOptions &LangOpts) const { + getVScaleRange(const LangOptions &LangOpts, + bool IsArmStreamingFunction) const { return std::nullopt; } /// The __builtin_clz* and __builtin_ctz* built-in diff --git clang/include/clang/Basic/arm_sme.td clang/include/clang/Basic/arm_sme.td index 891ed9874bb3..b33570fcaec2 100644 --- clang/include/clang/Basic/arm_sme.td +++ clang/include/clang/Basic/arm_sme.td @@ -748,30 +748,30 @@ let SMETargetGuard = "sme2" in { // FDOT let SMETargetGuard = "sme-f8f32" in { - def SVDOT_LANE_FP8_ZA32_VG1x2 : Inst<"svdot_lane_za32[_mf8]_vg1x2_fpm", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fdot_lane_za32_vg1x2", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_3>]>; - def SVDOT_LANE_FP8_ZA32_VG1x4 : Inst<"svdot_lane_za32[_mf8]_vg1x4_fpm", "vm4di>", "m", MergeNone, "aarch64_sme_fp8_fdot_lane_za32_vg1x4", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_3>]>; + def SVDOT_LANE_FP8_ZA32_VG1x2 : Inst<"svdot_lane_za32[_mf8]_vg1x2", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fdot_lane_za32_vg1x2", [IsStreaming, IsInOutZA, IsOverloadNone], [ImmCheck<3, ImmCheck0_3>]>; + def SVDOT_LANE_FP8_ZA32_VG1x4 : Inst<"svdot_lane_za32[_mf8]_vg1x4", "vm4di>", "m", MergeNone, "aarch64_sme_fp8_fdot_lane_za32_vg1x4", [IsStreaming, IsInOutZA, IsOverloadNone], [ImmCheck<3, ImmCheck0_3>]>; - def SVVDOTB_LANE_FP8_ZA32_VG1x4 : Inst<"svvdotb_lane_za32[_mf8]_vg1x4_fpm", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fvdotb_lane_za32_vg1x4", [IsOverloadNone, IsStreaming, IsInOutZA, SetsFPMR], [ImmCheck<3, ImmCheck0_3>]>; - def SVVDOTT_LANE_FP8_ZA32_VG1x4 : Inst<"svvdott_lane_za32[_mf8]_vg1x4_fpm", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fvdott_lane_za32_vg1x4", [IsOverloadNone, IsStreaming, IsInOutZA, SetsFPMR], [ImmCheck<3, ImmCheck0_3>]>; + def SVVDOTB_LANE_FP8_ZA32_VG1x4 : Inst<"svvdotb_lane_za32[_mf8]_vg1x4", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fvdotb_lane_za32_vg1x4", [IsOverloadNone, IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_3>]>; + def SVVDOTT_LANE_FP8_ZA32_VG1x4 : Inst<"svvdott_lane_za32[_mf8]_vg1x4", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fvdott_lane_za32_vg1x4", [IsOverloadNone, IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_3>]>; - def SVDOT_SINGLE_FP8_ZA32_VG1x2 : Inst<"svdot[_single]_za32[_mf8]_vg1x2_fpm", "vm2d>", "m", MergeNone, "aarch64_sme_fp8_fdot_single_za32_vg1x2", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>; - def SVDOT_SINGLE_FP8_ZA32_VG1x4 : Inst<"svdot[_single]_za32[_mf8]_vg1x4_fpm", "vm4d>", "m", MergeNone, "aarch64_sme_fp8_fdot_single_za32_vg1x4", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>; + def SVDOT_SINGLE_FP8_ZA32_VG1x2 : Inst<"svdot[_single]_za32[_mf8]_vg1x2", "vm2d>", "m", MergeNone, "aarch64_sme_fp8_fdot_single_za32_vg1x2", [IsStreaming, IsInOutZA, IsOverloadNone], []>; + def SVDOT_SINGLE_FP8_ZA32_VG1x4 : Inst<"svdot[_single]_za32[_mf8]_vg1x4", "vm4d>", "m", MergeNone, "aarch64_sme_fp8_fdot_single_za32_vg1x4", [IsStreaming, IsInOutZA, IsOverloadNone], []>; - def SVDOT_MULTI_FP8_ZA32_VG1x2 : Inst<"svdot_za32[_mf8]_vg1x2_fpm", "vm22>", "m", MergeNone, "aarch64_sme_fp8_fdot_multi_za32_vg1x2", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>; - def SVDOT_MULTI_FP8_ZA32_VG1x4 : Inst<"svdot_za32[_mf8]_vg1x4_fpm", "vm44>", "m", MergeNone, "aarch64_sme_fp8_fdot_multi_za32_vg1x4", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>; + def SVDOT_MULTI_FP8_ZA32_VG1x2 : Inst<"svdot_za32[_mf8]_vg1x2", 
"vm22>", "m", MergeNone, "aarch64_sme_fp8_fdot_multi_za32_vg1x2", [IsStreaming, IsInOutZA, IsOverloadNone], []>; + def SVDOT_MULTI_FP8_ZA32_VG1x4 : Inst<"svdot_za32[_mf8]_vg1x4", "vm44>", "m", MergeNone, "aarch64_sme_fp8_fdot_multi_za32_vg1x4", [IsStreaming, IsInOutZA, IsOverloadNone], []>; } let SMETargetGuard = "sme-f8f16" in { - def SVDOT_LANE_FP8_ZA16_VG1x2 : Inst<"svdot_lane_za16[_mf8]_vg1x2_fpm", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fdot_lane_za16_vg1x2", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_7>]>; - def SVDOT_LANE_FP8_ZA16_VG1x4 : Inst<"svdot_lane_za16[_mf8]_vg1x4_fpm", "vm4di>", "m", MergeNone, "aarch64_sme_fp8_fdot_lane_za16_vg1x4", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_7>]>; + def SVDOT_LANE_FP8_ZA16_VG1x2 : Inst<"svdot_lane_za16[_mf8]_vg1x2", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fdot_lane_za16_vg1x2", [IsStreaming, IsInOutZA, IsOverloadNone], [ImmCheck<3, ImmCheck0_7>]>; + def SVDOT_LANE_FP8_ZA16_VG1x4 : Inst<"svdot_lane_za16[_mf8]_vg1x4", "vm4di>", "m", MergeNone, "aarch64_sme_fp8_fdot_lane_za16_vg1x4", [IsStreaming, IsInOutZA, IsOverloadNone], [ImmCheck<3, ImmCheck0_7>]>; - def SVVDOT_LANE_FP8_ZA16_VG1x2 : Inst<"svvdot_lane_za16[_mf8]_vg1x2_fpm", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fvdot_lane_za16_vg1x2", [IsOverloadNone, IsStreaming, IsInOutZA, SetsFPMR], [ImmCheck<3, ImmCheck0_7>]>; + def SVVDOT_LANE_FP8_ZA16_VG1x2 : Inst<"svvdot_lane_za16[_mf8]_vg1x2", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fvdot_lane_za16_vg1x2", [IsOverloadNone, IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>; - def SVDOT_SINGLE_FP8_ZA16_VG1x2 : Inst<"svdot[_single]_za16[_mf8]_vg1x2_fpm", "vm2d>", "m", MergeNone, "aarch64_sme_fp8_fdot_single_za16_vg1x2", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>; - def SVDOT_SINGLE_FP8_ZA16_VG1x4 : Inst<"svdot[_single]_za16[_mf8]_vg1x4_fpm", "vm4d>", "m", MergeNone, "aarch64_sme_fp8_fdot_single_za16_vg1x4", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>; + def SVDOT_SINGLE_FP8_ZA16_VG1x2 : Inst<"svdot[_single]_za16[_mf8]_vg1x2", "vm2d>", "m", MergeNone, "aarch64_sme_fp8_fdot_single_za16_vg1x2", [IsStreaming, IsInOutZA, IsOverloadNone], []>; + def SVDOT_SINGLE_FP8_ZA16_VG1x4 : Inst<"svdot[_single]_za16[_mf8]_vg1x4", "vm4d>", "m", MergeNone, "aarch64_sme_fp8_fdot_single_za16_vg1x4", [IsStreaming, IsInOutZA, IsOverloadNone], []>; - def SVDOT_MULTI_FP8_ZA16_VG1x2 : Inst<"svdot_za16[_mf8]_vg1x2_fpm", "vm22>", "m", MergeNone, "aarch64_sme_fp8_fdot_multi_za16_vg1x2", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>; - def SVDOT_MULTI_FP8_ZA16_VG1x4 : Inst<"svdot_za16[_mf8]_vg1x4_fpm", "vm44>", "m", MergeNone, "aarch64_sme_fp8_fdot_multi_za16_vg1x4", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>; + def SVDOT_MULTI_FP8_ZA16_VG1x2 : Inst<"svdot_za16[_mf8]_vg1x2", "vm22>", "m", MergeNone, "aarch64_sme_fp8_fdot_multi_za16_vg1x2", [IsStreaming, IsInOutZA, IsOverloadNone], []>; + def SVDOT_MULTI_FP8_ZA16_VG1x4 : Inst<"svdot_za16[_mf8]_vg1x4", "vm44>", "m", MergeNone, "aarch64_sme_fp8_fdot_multi_za16_vg1x4", [IsStreaming, IsInOutZA, IsOverloadNone], []>; } //////////////////////////////////////////////////////////////////////////////// @@ -859,51 +859,51 @@ let SMETargetGuard = "sme-lutv2" in { } let SMETargetGuard = "sme-f8f32" in { - def SVMOPA_FP8_ZA32 : Inst<"svmopa_za32[_mf8]_m_fpm", "viPPdd>", "m", MergeNone, "aarch64_sme_fp8_fmopa_za32", - [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<0, ImmCheck0_3>]>; + def SVMOPA_FP8_ZA32 
: Inst<"svmopa_za32[_mf8]_m", "viPPdd>", "m", MergeNone, "aarch64_sme_fp8_fmopa_za32", + [IsStreaming, IsInOutZA, IsOverloadNone], [ImmCheck<0, ImmCheck0_3>]>; // FMLALL (indexed) - def SVMLA_FP8_LANE_ZA32_VG4x1 : Inst<"svmla_lane_za32[_mf8]_vg4x1_fpm", "vmddi>", "m", MergeNone, "aarch64_sme_fp8_fmlall_lane_za32_vg4x1", - [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>; - def SVMLA_FP8_LANE_ZA32_VG4x2 : Inst<"svmla_lane_za32[_mf8]_vg4x2_fpm", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fmlall_lane_za32_vg4x2", - [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>; - def SVMLA_FP8_LANE_ZA16_VG4x4 : Inst<"svmla_lane_za32[_mf8]_vg4x4_fpm", "vm4di>", "m", MergeNone, "aarch64_sme_fp8_fmlall_lane_za32_vg4x4", - [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>; + def SVMLA_FP8_LANE_ZA32_VG4x1 : Inst<"svmla_lane_za32[_mf8]_vg4x1", "vmddi>", "m", MergeNone, "aarch64_sme_fp8_fmlall_lane_za32_vg4x1", + [IsStreaming, IsInOutZA, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>; + def SVMLA_FP8_LANE_ZA32_VG4x2 : Inst<"svmla_lane_za32[_mf8]_vg4x2", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fmlall_lane_za32_vg4x2", + [IsStreaming, IsInOutZA, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>; + def SVMLA_FP8_LANE_ZA16_VG4x4 : Inst<"svmla_lane_za32[_mf8]_vg4x4", "vm4di>", "m", MergeNone, "aarch64_sme_fp8_fmlall_lane_za32_vg4x4", + [IsStreaming, IsInOutZA, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>; // FMLALL (single) - def SVMLA_FP8_SINGLE_ZA32_VG4x1 : Inst<"svmla[_single]_za32[_mf8]_vg4x1_fpm", "vmdd>", "m", MergeNone, "aarch64_sme_fp8_fmlall_single_za32_vg4x1", - [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>; - def SVMLA_FP8_SINGLE_ZA32_VG4x2 : Inst<"svmla[_single]_za32[_mf8]_vg4x2_fpm", "vm2d>", "m", MergeNone, "aarch64_sme_fp8_fmlall_single_za32_vg4x2", - [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>; - def SVMLA_FP8_SINGLE_ZA32_VG4x4 : Inst<"svmla[_single]_za32[_mf8]_vg4x4_fpm", "vm4d>", "m", MergeNone, "aarch64_sme_fp8_fmlall_single_za32_vg4x4", - [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>; + def SVMLA_FP8_SINGLE_ZA32_VG4x1 : Inst<"svmla[_single]_za32[_mf8]_vg4x1", "vmdd>", "m", MergeNone, "aarch64_sme_fp8_fmlall_single_za32_vg4x1", + [IsStreaming, IsInOutZA, IsOverloadNone], []>; + def SVMLA_FP8_SINGLE_ZA32_VG4x2 : Inst<"svmla[_single]_za32[_mf8]_vg4x2", "vm2d>", "m", MergeNone, "aarch64_sme_fp8_fmlall_single_za32_vg4x2", + [IsStreaming, IsInOutZA, IsOverloadNone], []>; + def SVMLA_FP8_SINGLE_ZA32_VG4x4 : Inst<"svmla[_single]_za32[_mf8]_vg4x4", "vm4d>", "m", MergeNone, "aarch64_sme_fp8_fmlall_single_za32_vg4x4", + [IsStreaming, IsInOutZA, IsOverloadNone], []>; // FMLALL (multiple) - def SVMLA_FP8_MULTI_ZA32_VG4x2 : Inst<"svmla_za32[_mf8]_vg4x2_fpm", "vm22>", "m", MergeNone, "aarch64_sme_fp8_fmlall_multi_za32_vg4x2", - [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>; - def SVMLA_FP8_MULTI_ZA32_VG4x4 : Inst<"svmla_za32[_mf8]_vg4x4_fpm", "vm44>", "m", MergeNone, "aarch64_sme_fp8_fmlall_multi_za32_vg4x4", - [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>; + def SVMLA_FP8_MULTI_ZA32_VG4x2 : Inst<"svmla_za32[_mf8]_vg4x2", "vm22>", "m", MergeNone, "aarch64_sme_fp8_fmlall_multi_za32_vg4x2", + [IsStreaming, IsInOutZA, IsOverloadNone], []>; + def SVMLA_FP8_MULTI_ZA32_VG4x4 : Inst<"svmla_za32[_mf8]_vg4x4", "vm44>", "m", MergeNone, "aarch64_sme_fp8_fmlall_multi_za32_vg4x4", + [IsStreaming, IsInOutZA, IsOverloadNone], []>; } let SMETargetGuard = "sme-f8f16" in { - def 
SVMOPA_FP8_ZA16 : Inst<"svmopa_za16[_mf8]_m_fpm", "viPPdd>", "m", MergeNone, "aarch64_sme_fp8_fmopa_za16", - [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<0, ImmCheck0_1>]>; + def SVMOPA_FP8_ZA16 : Inst<"svmopa_za16[_mf8]_m", "viPPdd>", "m", MergeNone, "aarch64_sme_fp8_fmopa_za16", + [IsStreaming, IsInOutZA, IsOverloadNone], [ImmCheck<0, ImmCheck0_1>]>; // FMLAL (indexed) - def SVMLA_FP8_LANE_ZA16_VG2x1 : Inst<"svmla_lane_za16[_mf8]_vg2x1_fpm", "vmddi>", "m", MergeNone, "aarch64_sme_fp8_fmlal_lane_za16_vg2x1", - [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>; - def SVMLA_FP8_LANE_ZA16_VG2x2 : Inst<"svmla_lane_za16[_mf8]_vg2x2_fpm", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fmlal_lane_za16_vg2x2", - [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>; - def SVMLA_FP8_LANE_ZA16_VG2x4 : Inst<"svmla_lane_za16[_mf8]_vg2x4_fpm", "vm4di>", "m", MergeNone, "aarch64_sme_fp8_fmlal_lane_za16_vg2x4", - [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>; + def SVMLA_FP8_LANE_ZA16_VG2x1 : Inst<"svmla_lane_za16[_mf8]_vg2x1", "vmddi>", "m", MergeNone, "aarch64_sme_fp8_fmlal_lane_za16_vg2x1", + [IsStreaming, IsInOutZA, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>; + def SVMLA_FP8_LANE_ZA16_VG2x2 : Inst<"svmla_lane_za16[_mf8]_vg2x2", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fmlal_lane_za16_vg2x2", + [IsStreaming, IsInOutZA, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>; + def SVMLA_FP8_LANE_ZA16_VG2x4 : Inst<"svmla_lane_za16[_mf8]_vg2x4", "vm4di>", "m", MergeNone, "aarch64_sme_fp8_fmlal_lane_za16_vg2x4", + [IsStreaming, IsInOutZA, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>; // FMLAL (single) - def SVMLA_FP8_SINGLE_ZA16_VG2x1 : Inst<"svmla[_single]_za16[_mf8]_vg2x1_fpm", "vmdd>", "m", MergeNone, "aarch64_sme_fp8_fmlal_single_za16_vg2x1", - [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>; - def SVMLA_FP8_SINGLE_ZA16_VG2x2 : Inst<"svmla[_single]_za16[_mf8]_vg2x2_fpm", "vm2d>", "m", MergeNone, "aarch64_sme_fp8_fmlal_single_za16_vg2x2", - [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>; - def SVMLA_FP8_SINGLE_ZA16_VG2x4 : Inst<"svmla[_single]_za16[_mf8]_vg2x4_fpm", "vm4d>", "m", MergeNone, "aarch64_sme_fp8_fmlal_single_za16_vg2x4", - [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>; + def SVMLA_FP8_SINGLE_ZA16_VG2x1 : Inst<"svmla[_single]_za16[_mf8]_vg2x1", "vmdd>", "m", MergeNone, "aarch64_sme_fp8_fmlal_single_za16_vg2x1", + [IsStreaming, IsInOutZA, IsOverloadNone], []>; + def SVMLA_FP8_SINGLE_ZA16_VG2x2 : Inst<"svmla[_single]_za16[_mf8]_vg2x2", "vm2d>", "m", MergeNone, "aarch64_sme_fp8_fmlal_single_za16_vg2x2", + [IsStreaming, IsInOutZA, IsOverloadNone], []>; + def SVMLA_FP8_SINGLE_ZA16_VG2x4 : Inst<"svmla[_single]_za16[_mf8]_vg2x4", "vm4d>", "m", MergeNone, "aarch64_sme_fp8_fmlal_single_za16_vg2x4", + [IsStreaming, IsInOutZA, IsOverloadNone], []>; // FMLAL (multiple) - def SVMLA_FP8_MULTI_ZA16_VG2x2 : Inst<"svmla_za16[_mf8]_vg2x2_fpm", "vm22>", "m", MergeNone, "aarch64_sme_fp8_fmlal_multi_za16_vg2x2", - [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>; - def SVMLA_FP8_MULTI_ZA16_VG2x4 : Inst<"svmla_za16[_mf8]_vg2x4_fpm", "vm44>", "m", MergeNone, "aarch64_sme_fp8_fmlal_multi_za16_vg2x4", - [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>; + def SVMLA_FP8_MULTI_ZA16_VG2x2 : Inst<"svmla_za16[_mf8]_vg2x2", "vm22>", "m", MergeNone, "aarch64_sme_fp8_fmlal_multi_za16_vg2x2", + [IsStreaming, IsInOutZA, IsOverloadNone], []>; + def SVMLA_FP8_MULTI_ZA16_VG2x4 : 
Inst<"svmla_za16[_mf8]_vg2x4", "vm44>", "m", MergeNone, "aarch64_sme_fp8_fmlal_multi_za16_vg2x4", + [IsStreaming, IsInOutZA, IsOverloadNone], []>; } } // let SVETargetGuard = InvalidMode diff --git clang/include/clang/Basic/arm_sve.td clang/include/clang/Basic/arm_sve.td index e7001bac450e..681bfc2d01f6 100644 --- clang/include/clang/Basic/arm_sve.td +++ clang/include/clang/Basic/arm_sve.td @@ -2432,18 +2432,18 @@ let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2,fp8" in { def FSCALE_X4 : Inst<"svscale[_{d}_x4]", "444.x", "fhd", MergeNone, "aarch64_sme_fp8_scale_x4", [IsStreaming],[]>; // Convert from FP8 to half-precision/BFloat16 multi-vector - def SVF1CVT_X2 : Inst<"svcvt1_{d}[_mf8]_x2_fpm", "2~>", "bh", MergeNone, "aarch64_sve_fp8_cvt1_x2", [IsStreaming, SetsFPMR], []>; - def SVF2CVT_X2 : Inst<"svcvt2_{d}[_mf8]_x2_fpm", "2~>", "bh", MergeNone, "aarch64_sve_fp8_cvt2_x2", [IsStreaming, SetsFPMR], []>; + def SVF1CVT_X2 : Inst<"svcvt1_{d}[_mf8]_x2", "2~>", "bh", MergeNone, "aarch64_sve_fp8_cvt1_x2", [IsStreaming], []>; + def SVF2CVT_X2 : Inst<"svcvt2_{d}[_mf8]_x2", "2~>", "bh", MergeNone, "aarch64_sve_fp8_cvt2_x2", [IsStreaming], []>; // Convert from FP8 to deinterleaved half-precision/BFloat16 multi-vector - def SVF1CVTL_X2 : Inst<"svcvtl1_{d}[_mf8]_x2_fpm", "2~>", "bh", MergeNone, "aarch64_sve_fp8_cvtl1_x2", [IsStreaming, SetsFPMR], []>; - def SVF2CVTL_X2 : Inst<"svcvtl2_{d}[_mf8]_x2_fpm", "2~>", "bh", MergeNone, "aarch64_sve_fp8_cvtl2_x2", [IsStreaming, SetsFPMR], []>; + def SVF1CVTL_X2 : Inst<"svcvtl1_{d}[_mf8]_x2", "2~>", "bh", MergeNone, "aarch64_sve_fp8_cvtl1_x2", [IsStreaming], []>; + def SVF2CVTL_X2 : Inst<"svcvtl2_{d}[_mf8]_x2", "2~>", "bh", MergeNone, "aarch64_sve_fp8_cvtl2_x2", [IsStreaming], []>; // Convert from single/half/bfloat multivector to FP8 - def SVFCVT_X2 : Inst<"svcvt_mf8[_{d}_x2]_fpm", "~2>", "bh", MergeNone, "aarch64_sve_fp8_cvt_x2", [IsStreaming, SetsFPMR], []>; - def SVFCVT_X4 : Inst<"svcvt_mf8[_{d}_x4]_fpm", "~4>", "f", MergeNone, "aarch64_sve_fp8_cvt_x4", [IsOverloadNone, IsStreaming, SetsFPMR], []>; + def SVFCVT_X2 : Inst<"svcvt_mf8[_{d}_x2]", "~2>", "bh", MergeNone, "aarch64_sve_fp8_cvt_x2", [IsStreaming], []>; + def SVFCVT_X4 : Inst<"svcvt_mf8[_{d}_x4]", "~4>", "f", MergeNone, "aarch64_sve_fp8_cvt_x4", [IsOverloadNone, IsStreaming], []>; // interleaved - def SVFCVTN_X4 : Inst<"svcvtn_mf8[_{d}_x4]_fpm", "~4>", "f", MergeNone, "aarch64_sve_fp8_cvtn_x4", [IsOverloadNone, IsStreaming, SetsFPMR], []>; + def SVFCVTN_X4 : Inst<"svcvtn_mf8[_{d}_x4]", "~4>", "f", MergeNone, "aarch64_sve_fp8_cvtn_x4", [IsOverloadNone, IsStreaming], []>; } let SVETargetGuard = "sve2p1", SMETargetGuard = "sme2" in { @@ -2464,67 +2464,67 @@ let SVETargetGuard = "sve2,fp8", SMETargetGuard = "sme2,fp8" in { // SVE FP8 widening conversions // 8-bit floating-point convert to BFloat16/Float16 - def SVF1CVT : SInst<"svcvt1_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvt1", [VerifyRuntimeMode, SetsFPMR]>; - def SVF2CVT : SInst<"svcvt2_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvt2", [VerifyRuntimeMode, SetsFPMR]>; + def SVF1CVT : SInst<"svcvt1_{d}[_mf8]", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvt1", [VerifyRuntimeMode]>; + def SVF2CVT : SInst<"svcvt2_{d}[_mf8]", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvt2", [VerifyRuntimeMode]>; // 8-bit floating-point convert to BFloat16/Float16 (top) - def SVF1CVTLT : SInst<"svcvtlt1_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvtlt1", [VerifyRuntimeMode, SetsFPMR]>; - def SVF2CVTLT : 
SInst<"svcvtlt2_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvtlt2", [VerifyRuntimeMode, SetsFPMR]>; + def SVF1CVTLT : SInst<"svcvtlt1_{d}[_mf8]", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvtlt1", [VerifyRuntimeMode]>; + def SVF2CVTLT : SInst<"svcvtlt2_{d}[_mf8]", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvtlt2", [VerifyRuntimeMode]>; // BFloat16/Float16 convert, narrow and interleave to 8-bit floating-point - def SVFCVTN : SInst<"svcvtn_mf8[_{d}_x2]_fpm", "~2>", "bh", MergeNone, "aarch64_sve_fp8_cvtn", [VerifyRuntimeMode, SetsFPMR]>; + def SVFCVTN : SInst<"svcvtn_mf8[_{d}_x2]", "~2>", "bh", MergeNone, "aarch64_sve_fp8_cvtn", [VerifyRuntimeMode]>; // Single-precision convert, narrow and interleave to 8-bit floating-point (top and bottom) - def SVFCVTNB : SInst<"svcvtnb_mf8[_f32_x2]_fpm", "~2>", "f", MergeNone, "aarch64_sve_fp8_cvtnb", [VerifyRuntimeMode, SetsFPMR]>; - def SVFCVTNT : SInst<"svcvtnt_mf8[_f32_x2]_fpm", "~~2>", "f", MergeNone, "aarch64_sve_fp8_cvtnt", [VerifyRuntimeMode, SetsFPMR]>; + def SVFCVTNB : SInst<"svcvtnb_mf8[_f32_x2]", "~2>", "f", MergeNone, "aarch64_sve_fp8_cvtnb", [VerifyRuntimeMode]>; + def SVFCVTNT : SInst<"svcvtnt_mf8[_f32_x2]", "~~2>", "f", MergeNone, "aarch64_sve_fp8_cvtnt", [VerifyRuntimeMode]>; } let SVETargetGuard = "sve2,fp8dot2", SMETargetGuard ="sme,ssve-fp8dot2" in { // 8-bit floating-point dot product to half-precision (vectors) - def SVFDOT_2WAY : SInst<"svdot[_f16_mf8]_fpm", "dd~~>", "h", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>; - def SVFDOT_N_2WAY : SInst<"svdot[_n_f16_mf8]_fpm", "dd~!>", "h", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>; + def SVFDOT_2WAY : SInst<"svdot[_f16_mf8]", "dd~~>", "h", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode]>; + def SVFDOT_N_2WAY : SInst<"svdot[_n_f16_mf8]", "dd~!>", "h", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode]>; // 8-bit floating-point dot product to half-precision (indexed) - def SVFDOT_LANE_2WAY : SInst<"svdot_lane[_f16_mf8]_fpm", "dd~~i>", "h", MergeNone, "aarch64_sve_fp8_fdot_lane", [VerifyRuntimeMode, SetsFPMR], [ImmCheck<3, ImmCheck0_7>]>; + def SVFDOT_LANE_2WAY : SInst<"svdot_lane[_f16_mf8]", "dd~~i>", "h", MergeNone, "aarch64_sve_fp8_fdot_lane", [VerifyRuntimeMode], [ImmCheck<3, ImmCheck0_7>]>; } let SVETargetGuard = "sve2,fp8dot4", SMETargetGuard ="sme,ssve-fp8dot4" in { // 8-bit floating-point dot product to single-precision (vectors) - def SVFDOT_4WAY : SInst<"svdot[_f32_mf8]_fpm", "dd~~>", "f", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>; - def SVFDOT_N_4WAY : SInst<"svdot[_n_f32_mf8]_fpm", "dd~!>", "f", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>; + def SVFDOT_4WAY : SInst<"svdot[_f32_mf8]", "dd~~>", "f", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode]>; + def SVFDOT_N_4WAY : SInst<"svdot[_n_f32_mf8]", "dd~!>", "f", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode]>; // 8-bit floating-point dot product to single-precision (indexed) - def SVFDOT_LANE_4WAY : SInst<"svdot_lane[_f32_mf8]_fpm", "dd~~i>", "f", MergeNone, "aarch64_sve_fp8_fdot_lane", [VerifyRuntimeMode, SetsFPMR], [ImmCheck<3, ImmCheck0_3>]>; + def SVFDOT_LANE_4WAY : SInst<"svdot_lane[_f32_mf8]", "dd~~i>", "f", MergeNone, "aarch64_sve_fp8_fdot_lane", [VerifyRuntimeMode], [ImmCheck<3, ImmCheck0_3>]>; } let SVETargetGuard = "sve2,fp8fma", SMETargetGuard = "sme,ssve-fp8fma" in { // 8-bit floating-point multiply-add long to half-precision (bottom) - def SVFMLALB : SInst<"svmlalb[_f16_mf8]_fpm", "dd~~>", "h", 
MergeNone, "aarch64_sve_fp8_fmlalb", [VerifyRuntimeMode, SetsFPMR]>; - def SVFMLALB_N : SInst<"svmlalb[_n_f16_mf8]_fpm", "dd~!>", "h", MergeNone, "aarch64_sve_fp8_fmlalb", [VerifyRuntimeMode, SetsFPMR]>; + def SVFMLALB : SInst<"svmlalb[_f16_mf8]", "dd~~>", "h", MergeNone, "aarch64_sve_fp8_fmlalb", [VerifyRuntimeMode]>; + def SVFMLALB_N : SInst<"svmlalb[_n_f16_mf8]", "dd~!>", "h", MergeNone, "aarch64_sve_fp8_fmlalb", [VerifyRuntimeMode]>; - // 8-bit floating-point multiply-add long to ha_fpmlf-precision (bottom, indexed) - def SVFMLALB_LANE : SInst<"svmlalb_lane[_f16_mf8]_fpm", "dd~~i>", "h", MergeNone, "aarch64_sve_fp8_fmlalb_lane", [VerifyRuntimeMode, SetsFPMR], [ImmCheck<3, ImmCheck0_15>]>; + // 8-bit floating-point multiply-add long to half-precision (bottom, indexed) + def SVFMLALB_LANE : SInst<"svmlalb_lane[_f16_mf8]", "dd~~i>", "h", MergeNone, "aarch64_sve_fp8_fmlalb_lane", [VerifyRuntimeMode], [ImmCheck<3, ImmCheck0_15>]>; // 8-bit floating-point multiply-add long to half-precision (top) - def SVFMLALT : SInst<"svmlalt[_f16_mf8]_fpm", "dd~~>", "h", MergeNone, "aarch64_sve_fp8_fmlalt", [VerifyRuntimeMode, SetsFPMR]>; - def SVFMLALT_N : SInst<"svmlalt[_n_f16_mf8]_fpm", "dd~!>", "h", MergeNone, "aarch64_sve_fp8_fmlalt", [VerifyRuntimeMode, SetsFPMR]>; + def SVFMLALT : SInst<"svmlalt[_f16_mf8]", "dd~~>", "h", MergeNone, "aarch64_sve_fp8_fmlalt", [VerifyRuntimeMode]>; + def SVFMLALT_N : SInst<"svmlalt[_n_f16_mf8]", "dd~!>", "h", MergeNone, "aarch64_sve_fp8_fmlalt", [VerifyRuntimeMode]>; // 8-bit floating-point multiply-add long to half-precision (top, indexed) - def SVFMLALT_LANE : SInst<"svmlalt_lane[_f16_mf8]_fpm", "dd~~i>", "h", MergeNone, "aarch64_sve_fp8_fmlalt_lane", [VerifyRuntimeMode, SetsFPMR], [ImmCheck<3, ImmCheck0_15>]>; + def SVFMLALT_LANE : SInst<"svmlalt_lane[_f16_mf8]", "dd~~i>", "h", MergeNone, "aarch64_sve_fp8_fmlalt_lane", [VerifyRuntimeMode], [ImmCheck<3, ImmCheck0_15>]>; // 8-bit floating-point multiply-add long long to single-precision (all top/bottom variants) - def SVFMLALLBB : SInst<"svmlallbb[_f32_mf8]_fpm", "dd~~>", "f", MergeNone, "aarch64_sve_fp8_fmlallbb", [VerifyRuntimeMode, SetsFPMR]>; - def SVFMLALLBB_N : SInst<"svmlallbb[_n_f32_mf8]_fpm", "dd~!>", "f", MergeNone, "aarch64_sve_fp8_fmlallbb", [VerifyRuntimeMode, SetsFPMR]>; - def SVFMLALLBT : SInst<"svmlallbt[_f32_mf8]_fpm", "dd~~>", "f", MergeNone, "aarch64_sve_fp8_fmlallbt", [VerifyRuntimeMode, SetsFPMR]>; - def SVFMLALLBT_N : SInst<"svmlallbt[_n_f32_mf8]_fpm", "dd~!>", "f", MergeNone, "aarch64_sve_fp8_fmlallbt", [VerifyRuntimeMode, SetsFPMR]>; - def SVFMLALLTB : SInst<"svmlalltb[_f32_mf8]_fpm", "dd~~>", "f", MergeNone, "aarch64_sve_fp8_fmlalltb", [VerifyRuntimeMode, SetsFPMR]>; - def SVFMLALLTB_N : SInst<"svmlalltb[_n_f32_mf8]_fpm", "dd~!>", "f", MergeNone, "aarch64_sve_fp8_fmlalltb", [VerifyRuntimeMode, SetsFPMR]>; - def SVFMLALLTT : SInst<"svmlalltt[_f32_mf8]_fpm", "dd~~>", "f", MergeNone, "aarch64_sve_fp8_fmlalltt", [VerifyRuntimeMode, SetsFPMR]>; - def SVFMLALLTT_N : SInst<"svmlalltt[_n_f32_mf8]_fpm", "dd~!>", "f", MergeNone, "aarch64_sve_fp8_fmlalltt", [VerifyRuntimeMode, SetsFPMR]>; + def SVFMLALLBB : SInst<"svmlallbb[_f32_mf8]", "dd~~>", "f", MergeNone, "aarch64_sve_fp8_fmlallbb", [VerifyRuntimeMode]>; + def SVFMLALLBB_N : SInst<"svmlallbb[_n_f32_mf8]", "dd~!>", "f", MergeNone, "aarch64_sve_fp8_fmlallbb", [VerifyRuntimeMode]>; + def SVFMLALLBT : SInst<"svmlallbt[_f32_mf8]", "dd~~>", "f", MergeNone, "aarch64_sve_fp8_fmlallbt", [VerifyRuntimeMode]>; + def SVFMLALLBT_N : 
SInst<"svmlallbt[_n_f32_mf8]", "dd~!>", "f", MergeNone, "aarch64_sve_fp8_fmlallbt", [VerifyRuntimeMode]>; + def SVFMLALLTB : SInst<"svmlalltb[_f32_mf8]", "dd~~>", "f", MergeNone, "aarch64_sve_fp8_fmlalltb", [VerifyRuntimeMode]>; + def SVFMLALLTB_N : SInst<"svmlalltb[_n_f32_mf8]", "dd~!>", "f", MergeNone, "aarch64_sve_fp8_fmlalltb", [VerifyRuntimeMode]>; + def SVFMLALLTT : SInst<"svmlalltt[_f32_mf8]", "dd~~>", "f", MergeNone, "aarch64_sve_fp8_fmlalltt", [VerifyRuntimeMode]>; + def SVFMLALLTT_N : SInst<"svmlalltt[_n_f32_mf8]", "dd~!>", "f", MergeNone, "aarch64_sve_fp8_fmlalltt", [VerifyRuntimeMode]>; // 8-bit floating-point multiply-add long long to single-precision (indexed, all top/bottom variants) - def SVFMLALLBB_LANE : SInst<"svmlallbb_lane[_f32_mf8]_fpm", "dd~~i>", "f", MergeNone, "aarch64_sve_fp8_fmlallbb_lane", [VerifyRuntimeMode, SetsFPMR], [ImmCheck<3, ImmCheck0_7>]>; - def SVFMLALLBT_LANE : SInst<"svmlallbt_lane[_f32_mf8]_fpm", "dd~~i>", "f", MergeNone, "aarch64_sve_fp8_fmlallbt_lane", [VerifyRuntimeMode, SetsFPMR], [ImmCheck<3, ImmCheck0_7>]>; - def SVFMLALLTB_LANE : SInst<"svmlalltb_lane[_f32_mf8]_fpm", "dd~~i>", "f", MergeNone, "aarch64_sve_fp8_fmlalltb_lane", [VerifyRuntimeMode, SetsFPMR], [ImmCheck<3, ImmCheck0_7>]>; - def SVFMLALLTT_LANE : SInst<"svmlalltt_lane[_f32_mf8]_fpm", "dd~~i>", "f", MergeNone, "aarch64_sve_fp8_fmlalltt_lane", [VerifyRuntimeMode, SetsFPMR], [ImmCheck<3, ImmCheck0_7>]>; + def SVFMLALLBB_LANE : SInst<"svmlallbb_lane[_f32_mf8]", "dd~~i>", "f", MergeNone, "aarch64_sve_fp8_fmlallbb_lane", [VerifyRuntimeMode], [ImmCheck<3, ImmCheck0_7>]>; + def SVFMLALLBT_LANE : SInst<"svmlallbt_lane[_f32_mf8]", "dd~~i>", "f", MergeNone, "aarch64_sve_fp8_fmlallbt_lane", [VerifyRuntimeMode], [ImmCheck<3, ImmCheck0_7>]>; + def SVFMLALLTB_LANE : SInst<"svmlalltb_lane[_f32_mf8]", "dd~~i>", "f", MergeNone, "aarch64_sve_fp8_fmlalltb_lane", [VerifyRuntimeMode], [ImmCheck<3, ImmCheck0_7>]>; + def SVFMLALLTT_LANE : SInst<"svmlalltt_lane[_f32_mf8]", "dd~~i>", "f", MergeNone, "aarch64_sve_fp8_fmlalltt_lane", [VerifyRuntimeMode], [ImmCheck<3, ImmCheck0_7>]>; } diff --git clang/include/clang/CIRFrontendAction/.clang-tidy clang/include/clang/CIR/FrontendAction/.clang-tidy similarity index 75% rename from clang/include/clang/CIRFrontendAction/.clang-tidy rename to clang/include/clang/CIR/FrontendAction/.clang-tidy index ef88dbcec488..1a5dfe141806 100644 --- clang/include/clang/CIRFrontendAction/.clang-tidy +++ clang/include/clang/CIR/FrontendAction/.clang-tidy @@ -51,3 +51,18 @@ Checks: > readability-simplify-boolean-expr, readability-simplify-subscript-expr, readability-use-anyofallof +CheckOptions: + - key: readability-identifier-naming.ClassCase + value: CamelCase + - key: readability-identifier-naming.EnumCase + value: CamelCase + - key: readability-identifier-naming.FunctionCase + value: camelBack + - key: readability-identifier-naming.MemberCase + value: CamelCase + - key: readability-identifier-naming.ParameterCase + value: CamelCase + - key: readability-identifier-naming.UnionCase + value: CamelCase + - key: readability-identifier-naming.VariableCase + value: CamelCase diff --git clang/include/clang/CIR/FrontendAction/CIRGenAction.h clang/include/clang/CIR/FrontendAction/CIRGenAction.h index 2ab612613b73..5f9110bc83b8 100644 --- clang/include/clang/CIR/FrontendAction/CIRGenAction.h +++ clang/include/clang/CIR/FrontendAction/CIRGenAction.h @@ -26,6 +26,7 @@ class CIRGenAction : public clang::ASTFrontendAction { public: enum class OutputType { EmitCIR, + EmitLLVM, }; private: @@ 
-55,6 +56,13 @@ public:
   EmitCIRAction(mlir::MLIRContext *MLIRCtx = nullptr);
 };
+class EmitLLVMAction : public CIRGenAction {
+  virtual void anchor();
+
+public:
+  EmitLLVMAction(mlir::MLIRContext *MLIRCtx = nullptr);
+};
+
 } // namespace cir
 #endif
diff --git clang/include/clang/CIR/LowerToLLVM.h clang/include/clang/CIR/LowerToLLVM.h
new file mode 100644
index 000000000000..6e1b0270fcd2
--- /dev/null
+++ clang/include/clang/CIR/LowerToLLVM.h
@@ -0,0 +1,35 @@
+//===- LowerToLLVM.h - Lowering from CIR to LLVM --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares an interface for converting CIR modules to LLVM IR.
+//
+//===----------------------------------------------------------------------===//
+#ifndef CLANG_CIR_LOWERTOLLVM_H
+#define CLANG_CIR_LOWERTOLLVM_H
+
+#include <memory>
+
+namespace llvm {
+class LLVMContext;
+class Module;
+} // namespace llvm
+
+namespace mlir {
+class ModuleOp;
+} // namespace mlir
+
+namespace cir {
+
+namespace direct {
+std::unique_ptr<llvm::Module>
+lowerDirectlyFromCIRToLLVMIR(mlir::ModuleOp mlirModule,
+                             llvm::LLVMContext &llvmCtx);
+} // namespace direct
+} // namespace cir
+
+#endif // CLANG_CIR_LOWERTOLLVM_H
diff --git clang/include/clang/CIR/MissingFeatures.h clang/include/clang/CIR/MissingFeatures.h
new file mode 100644
index 000000000000..3c018aeea650
--- /dev/null
+++ clang/include/clang/CIR/MissingFeatures.h
@@ -0,0 +1,40 @@
+//===---- MissingFeatures.h - Checks for unimplemented features -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file introduces some helper classes to guard against features that the
+// CIR dialect supports but that we have not yet implemented and do not have
+// good ways to assert against.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CLANG_CIR_MISSINGFEATURES_H
+#define CLANG_CIR_MISSINGFEATURES_H
+
+namespace cir {
+
+// As a way to track features that haven't yet been implemented, this class
+// explicitly contains a list of static functions that return false, which
+// call sites can guard against. If and when a feature becomes implemented,
+// simply changing the return value to true will cause compilation to fail at
+// all the points we noted as needing attention. This is a much more explicit
+// way to handle "TODO"s.
+struct MissingFeatures {
+  // Address space related
+  static bool addressSpace() { return false; }
+
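The guard pattern described in the comment above looks roughly like this at a
call site (a hypothetical sketch; only the cir::MissingFeatures name comes
from the header being added here):

    #include <cassert>
    #include "clang/CIR/MissingFeatures.h"

    // Hypothetical CIRGen call site: while addressSpace() returns false the
    // assert is a no-op; flipping the guard to true makes every guarded site
    // fire, pointing at exactly the code that must now be updated.
    static void emitPointerValue() {
      assert(!cir::MissingFeatures::addressSpace() &&
             "NYI: non-default address spaces");
    }

+  // Unhandled global/linkage information.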
+ static bool opGlobalDSOLocal() { return false; } + static bool opGlobalThreadLocal() { return false; } + static bool opGlobalConstant() { return false; } + static bool opGlobalAlignment() { return false; } + static bool opGlobalLinkage() { return false; } +}; + +} // namespace cir + +#endif // CLANG_CIR_MISSINGFEATURES_H diff --git clang/include/clang/Driver/Options.td clang/include/clang/Driver/Options.td index 6eabd9f76a79..0ab923fcdd58 100644 --- clang/include/clang/Driver/Options.td +++ clang/include/clang/Driver/Options.td @@ -6341,15 +6341,13 @@ def mno_avx : Flag<["-"], "mno-avx">, Group<m_x86_Features_Group>; def mavx10_1_256 : Flag<["-"], "mavx10.1-256">, Group<m_x86_AVX10_Features_Group>; def mno_avx10_1_256 : Flag<["-"], "mno-avx10.1-256">, Group<m_x86_AVX10_Features_Group>; def mavx10_1_512 : Flag<["-"], "mavx10.1-512">, Group<m_x86_AVX10_Features_Group>; -def mno_avx10_1_512 : Flag<["-"], "mno-avx10.1-512">, Group<m_x86_AVX10_Features_Group>; -def mavx10_1 : Flag<["-"], "mavx10.1">, Alias<mavx10_1_256>; -def mno_avx10_1 : Flag<["-"], "mno-avx10.1">, Alias<mno_avx10_1_256>; +def mno_avx10_1_512 : Flag<["-"], "mno-avx10.1-512">, Alias<mno_avx10_1_256>; +def mavx10_1 : Flag<["-"], "mavx10.1">, Flags<[Unsupported]>; +def mno_avx10_1 : Flag<["-"], "mno-avx10.1">, Flags<[Unsupported]>; def mavx10_2_256 : Flag<["-"], "mavx10.2-256">, Group<m_x86_AVX10_Features_Group>; -def mno_avx10_2_256 : Flag<["-"], "mno-avx10.2-256">, Group<m_x86_AVX10_Features_Group>; def mavx10_2_512 : Flag<["-"], "mavx10.2-512">, Group<m_x86_AVX10_Features_Group>; -def mno_avx10_2_512 : Flag<["-"], "mno-avx10.2-512">, Group<m_x86_AVX10_Features_Group>; -def mavx10_2 : Flag<["-"], "mavx10.2">, Alias<mavx10_2_256>; -def mno_avx10_2 : Flag<["-"], "mno-avx10.2">, Alias<mno_avx10_2_256>; +def mavx10_2 : Flag<["-"], "mavx10.2">, Alias<mavx10_2_512>; +def mno_avx10_2 : Flag<["-"], "mno-avx10.2">, Group<m_x86_AVX10_Features_Group>; def mavx2 : Flag<["-"], "mavx2">, Group<m_x86_Features_Group>; def mno_avx2 : Flag<["-"], "mno-avx2">, Group<m_x86_Features_Group>; def mavx512f : Flag<["-"], "mavx512f">, Group<m_x86_Features_Group>; @@ -7304,6 +7302,9 @@ let Visibility = [CC1Option, FC1Option] in { def mlink_builtin_bitcode : Separate<["-"], "mlink-builtin-bitcode">, HelpText<"Link and internalize needed symbols from the given bitcode file " "before performing optimizations.">; +def mlink_bitcode_file + : Separate<["-"], "mlink-bitcode-file">, + HelpText<"Link the given bitcode file before performing optimizations.">; } // let Visibility = [CC1Option, FC1Option] let Visibility = [CC1Option] in { @@ -7407,14 +7408,13 @@ def msmall_data_limit : Separate<["-"], "msmall-data-limit">, def funwind_tables_EQ : Joined<["-"], "funwind-tables=">, HelpText<"Generate unwinding tables for all functions">, MarshallingInfoInt<CodeGenOpts<"UnwindTables">>; -defm constructor_aliases : BoolMOption<"constructor-aliases", - CodeGenOpts<"CXXCtorDtorAliases">, DefaultFalse, - PosFlag<SetTrue, [], [ClangOption], "Enable">, - NegFlag<SetFalse, [], [ClangOption], "Disable">, - BothFlags<[], [ClangOption, CC1Option], - " emitting complete constructors and destructors as aliases when possible">>; -def mlink_bitcode_file : Separate<["-"], "mlink-bitcode-file">, - HelpText<"Link the given bitcode file before performing optimizations.">; +defm constructor_aliases + : BoolMOption<"constructor-aliases", CodeGenOpts<"CXXCtorDtorAliases">, + DefaultFalse, PosFlag<SetTrue, [], [ClangOption], "Enable">, + NegFlag<SetFalse, [], [ClangOption], "Disable">, + 
BothFlags<[], [ClangOption, CC1Option], + " emitting complete constructors and destructors " + "as aliases when possible">>; defm link_builtin_bitcode_postopt: BoolMOption<"link-builtin-bitcode-postopt", CodeGenOpts<"LinkBitcodePostopt">, DefaultFalse, PosFlag<SetTrue, [], [ClangOption], "Link builtin bitcodes after the " @@ -7589,6 +7589,11 @@ def fexperimental_assignment_tracking_EQ : Joined<["-"], "fexperimental-assignme def enable_tlsdesc : Flag<["-"], "enable-tlsdesc">, MarshallingInfoFlag<CodeGenOpts<"EnableTLSDESC">>; +def import_call_optimization : Flag<["-"], "import-call-optimization">, + HelpText<"Emit Import Call sections on supported targets that can be used " + "by the Windows kernel to enable import call optimization">, + MarshallingInfoFlag<CodeGenOpts<"ImportCallOptimization">>; + } // let Visibility = [CC1Option] //===----------------------------------------------------------------------===// diff --git clang/include/clang/Lex/Preprocessor.h clang/include/clang/Lex/Preprocessor.h index 416f403c2984..2bf4d1a16699 100644 --- clang/include/clang/Lex/Preprocessor.h +++ clang/include/clang/Lex/Preprocessor.h @@ -933,7 +933,7 @@ private: } ArrayRef<ModuleMacro*> getOverriddenMacros() const { - if (auto *Info = State.dyn_cast<ModuleMacroInfo*>()) + if (auto *Info = dyn_cast_if_present<ModuleMacroInfo *>(State)) return Info->OverriddenMacros; return {}; } diff --git clang/include/clang/Parse/Parser.h clang/include/clang/Parse/Parser.h index e99d2cf2eaa4..335258d59702 100644 --- clang/include/clang/Parse/Parser.h +++ clang/include/clang/Parse/Parser.h @@ -3710,6 +3710,7 @@ private: SourceLocation RParenLoc; SourceLocation EndLoc; SourceLocation MiscLoc; + OpenACCAtomicKind AtomicKind; SmallVector<Expr *> Exprs; SmallVector<OpenACCClause *> Clauses; // TODO OpenACC: As we implement support for the Atomic, Routine, and Cache diff --git clang/include/clang/Sema/Sema.h clang/include/clang/Sema/Sema.h index 472a0e25adc9..59e29262e350 100644 --- clang/include/clang/Sema/Sema.h +++ clang/include/clang/Sema/Sema.h @@ -3664,6 +3664,12 @@ public: NonTrivialCUnionContext UseContext, unsigned NonTrivialKind); + /// Certain globally-unique variables might be accidentally duplicated if + /// built into multiple shared libraries with hidden visibility. This can + /// cause problems if the variable is mutable, its initialization is + /// effectful, or its address is taken. + bool GloballyUniqueObjectMightBeAccidentallyDuplicated(const VarDecl *Dcl); + /// AddInitializerToDecl - Adds the initializer Init to the /// declaration dcl. If DirectInit is true, this is C++ direct /// initialization rather than copy initialization. diff --git clang/include/clang/Sema/SemaOpenACC.h clang/include/clang/Sema/SemaOpenACC.h index 2e5a0ea0aaac..3004b98760a9 100644 --- clang/include/clang/Sema/SemaOpenACC.h +++ clang/include/clang/Sema/SemaOpenACC.h @@ -695,24 +695,53 @@ public: /// should check legality of the statement as it appertains to this Construct. StmtResult ActOnAssociatedStmt(SourceLocation DirectiveLoc, OpenACCDirectiveKind K, + OpenACCAtomicKind AtKind, ArrayRef<const OpenACCClause *> Clauses, StmtResult AssocStmt); + StmtResult ActOnAssociatedStmt(SourceLocation DirectiveLoc, + OpenACCDirectiveKind K, + ArrayRef<const OpenACCClause *> Clauses, + StmtResult AssocStmt) { + return ActOnAssociatedStmt(DirectiveLoc, K, OpenACCAtomicKind::None, + Clauses, AssocStmt); + } + /// Called to check the form of the `atomic` construct which has some fairly + /// sizable restrictions. 
+  StmtResult CheckAtomicAssociatedStmt(SourceLocation AtomicDirLoc,
+                                       OpenACCAtomicKind AtKind,
+                                       StmtResult AssocStmt);
+
   /// Called after the directive has been completely parsed, including the
   /// declaration group or associated statement.
+  /// DirLoc: Location of the actual directive keyword.
   /// LParenLoc: Location of the left paren, if it exists (not on all
   /// constructs).
   /// MiscLoc: First misc location, if necessary (not all constructs).
   /// Exprs: List of expressions on the construct itself, if necessary (not all
   /// constructs).
+  /// AK: The atomic kind of the directive, if necessary (atomic only).
   /// RParenLoc: Location of the right paren, if it exists (not on all
   /// constructs).
+  /// EndLoc: The last source location of the directive.
+  /// Clauses: The list of clauses for the directive, if present.
+  /// AssocStmt: The associated statement for this construct, if necessary.
   StmtResult ActOnEndStmtDirective(
       OpenACCDirectiveKind K, SourceLocation StartLoc, SourceLocation DirLoc,
       SourceLocation LParenLoc, SourceLocation MiscLoc, ArrayRef<Expr *> Exprs,
-      SourceLocation RParenLoc, SourceLocation EndLoc,
+      OpenACCAtomicKind AK, SourceLocation RParenLoc, SourceLocation EndLoc,
       ArrayRef<OpenACCClause *> Clauses, StmtResult AssocStmt);
+  StmtResult ActOnEndStmtDirective(
+      OpenACCDirectiveKind K, SourceLocation StartLoc, SourceLocation DirLoc,
+      SourceLocation LParenLoc, SourceLocation MiscLoc, ArrayRef<Expr *> Exprs,
+      SourceLocation RParenLoc, SourceLocation EndLoc,
+      ArrayRef<OpenACCClause *> Clauses, StmtResult AssocStmt) {
+    return ActOnEndStmtDirective(K, StartLoc, DirLoc, LParenLoc, MiscLoc, Exprs,
+                                 OpenACCAtomicKind::None, RParenLoc, EndLoc,
+                                 Clauses, AssocStmt);
+  }
+
   /// Called after the directive has been completely parsed, including the
   /// declaration group or associated statement.
   DeclGroupRef ActOnEndDeclDirective();
diff --git clang/include/clang/Serialization/ASTBitCodes.h clang/include/clang/Serialization/ASTBitCodes.h
index 7656add0b688..3c184db5b2ad 100644
--- clang/include/clang/Serialization/ASTBitCodes.h
+++ clang/include/clang/Serialization/ASTBitCodes.h
@@ -2045,6 +2045,7 @@ enum StmtCode {
   STMT_OPENACC_SHUTDOWN_CONSTRUCT,
   STMT_OPENACC_SET_CONSTRUCT,
   STMT_OPENACC_UPDATE_CONSTRUCT,
+  STMT_OPENACC_ATOMIC_CONSTRUCT,
   // HLSL Constructs
   EXPR_HLSL_OUT_ARG,
diff --git clang/include/clang/StaticAnalyzer/Core/BugReporter/BugReporter.h clang/include/clang/StaticAnalyzer/Core/BugReporter/BugReporter.h
index 8974342166fa..7563d8bbd1d2 100644
--- clang/include/clang/StaticAnalyzer/Core/BugReporter/BugReporter.h
+++ clang/include/clang/StaticAnalyzer/Core/BugReporter/BugReporter.h
@@ -748,8 +748,8 @@ public:
 /// It can be valuable to produce tags with some bits of information and later
 /// reuse them for a better diagnostic.
 ///
-/// Please make sure that derived class' constuctor is private and that the user
-/// can only create objects using DataTag::Factory. This also means that
+/// Please make sure that derived class' constructor is private and that the
+/// user can only create objects using DataTag::Factory. This also means that
 /// DataTag::Factory should be friend for every derived class.
class DataTag : public ProgramPointTag { public: diff --git clang/include/clang/StaticAnalyzer/Core/PathSensitive/SMTConstraintManager.h clang/include/clang/StaticAnalyzer/Core/PathSensitive/SMTConstraintManager.h index 7cfb24e5e649..e084a1399530 100644 --- clang/include/clang/StaticAnalyzer/Core/PathSensitive/SMTConstraintManager.h +++ clang/include/clang/StaticAnalyzer/Core/PathSensitive/SMTConstraintManager.h @@ -353,12 +353,7 @@ protected: addStateConstraints(NewState); std::optional<bool> res = Solver->check(); - if (!res) - Cached[hash] = ConditionTruthVal(); - else - Cached[hash] = ConditionTruthVal(*res); - - return Cached[hash]; + return Cached[hash] = res ? ConditionTruthVal(*res) : ConditionTruthVal(); } // Cache the result of an SMT query (true, false, unknown). The key is the diff --git clang/lib/AST/ASTContext.cpp clang/lib/AST/ASTContext.cpp index 4e387da6dccd..f3aedbc39d12 100644 --- clang/lib/AST/ASTContext.cpp +++ clang/lib/AST/ASTContext.cpp @@ -7224,6 +7224,16 @@ static bool isSameQualifier(const NestedNameSpecifier *X, return !PX && !PY; } +static bool hasSameCudaAttrs(const FunctionDecl *A, const FunctionDecl *B) { + if (!A->getASTContext().getLangOpts().CUDA) + return true; // Target attributes are overloadable in CUDA compilation only. + if (A->hasAttr<CUDADeviceAttr>() != B->hasAttr<CUDADeviceAttr>()) + return false; + if (A->hasAttr<CUDADeviceAttr>() && B->hasAttr<CUDADeviceAttr>()) + return A->hasAttr<CUDAHostAttr>() == B->hasAttr<CUDAHostAttr>(); + return true; // unattributed and __host__ functions are the same. +} + /// Determine whether the attributes we can overload on are identical for A and /// B. Will ignore any overloadable attrs represented in the type of A and B. static bool hasSameOverloadableAttrs(const FunctionDecl *A, @@ -7254,7 +7264,7 @@ static bool hasSameOverloadableAttrs(const FunctionDecl *A, if (Cand1ID != Cand2ID) return false; } - return true; + return hasSameCudaAttrs(A, B); } bool ASTContext::isSameEntity(const NamedDecl *X, const NamedDecl *Y) const { @@ -10363,7 +10373,8 @@ bool ASTContext::areLaxCompatibleSveTypes(QualType FirstType, /// getRVVTypeSize - Return RVV vector register size. static uint64_t getRVVTypeSize(ASTContext &Context, const BuiltinType *Ty) { assert(Ty->isRVVVLSBuiltinType() && "Invalid RVV Type"); - auto VScale = Context.getTargetInfo().getVScaleRange(Context.getLangOpts()); + auto VScale = + Context.getTargetInfo().getVScaleRange(Context.getLangOpts(), false); if (!VScale) return 0; diff --git clang/lib/AST/ByteCode/ByteCodeEmitter.cpp clang/lib/AST/ByteCode/ByteCodeEmitter.cpp index 19e2416c4c94..5bd1b73133d6 100644 --- clang/lib/AST/ByteCode/ByteCodeEmitter.cpp +++ clang/lib/AST/ByteCode/ByteCodeEmitter.cpp @@ -135,11 +135,9 @@ Function *ByteCodeEmitter::compileFunc(const FunctionDecl *FuncDecl) { // Create a handle over the emitted code. 
Function *Func = P.getFunction(FuncDecl); if (!Func) { - unsigned BuiltinID = FuncDecl->getBuiltinID(); - Func = - P.createFunction(FuncDecl, ParamOffset, std::move(ParamTypes), - std::move(ParamDescriptors), std::move(ParamOffsets), - HasThisPointer, HasRVO, BuiltinID); + Func = P.createFunction(FuncDecl, ParamOffset, std::move(ParamTypes), + std::move(ParamDescriptors), + std::move(ParamOffsets), HasThisPointer, HasRVO); } assert(Func); @@ -212,8 +210,7 @@ Function *ByteCodeEmitter::compileObjCBlock(const BlockExpr *BE) { Function *Func = P.createFunction(BE, ParamOffset, std::move(ParamTypes), std::move(ParamDescriptors), std::move(ParamOffsets), - /*HasThisPointer=*/false, /*HasRVO=*/false, - /*IsUnevaluatedBuiltin=*/false); + /*HasThisPointer=*/false, /*HasRVO=*/false); assert(Func); Func->setDefined(true); diff --git clang/lib/AST/ByteCode/Compiler.cpp clang/lib/AST/ByteCode/Compiler.cpp index 4659d0e00784..c1408379b4c1 100644 --- clang/lib/AST/ByteCode/Compiler.cpp +++ clang/lib/AST/ByteCode/Compiler.cpp @@ -562,8 +562,10 @@ bool Compiler<Emitter>::VisitCastExpr(const CastExpr *CE) { // We're creating a complex value here, so we need to // allocate storage for it. if (!Initializing) { - unsigned LocalIndex = allocateTemporary(CE); - if (!this->emitGetPtrLocal(LocalIndex, CE)) + std::optional<unsigned> LocalIndex = allocateTemporary(CE); + if (!LocalIndex) + return false; + if (!this->emitGetPtrLocal(*LocalIndex, CE)) return false; } @@ -679,8 +681,10 @@ bool Compiler<Emitter>::VisitCastExpr(const CastExpr *CE) { assert(CE->getType()->isVectorType()); if (!Initializing) { - unsigned LocalIndex = allocateTemporary(CE); - if (!this->emitGetPtrLocal(LocalIndex, CE)) + std::optional<unsigned> LocalIndex = allocateTemporary(CE); + if (!LocalIndex) + return false; + if (!this->emitGetPtrLocal(*LocalIndex, CE)) return false; } unsigned ToSize = CE->getType()->getAs<VectorType>()->getNumElements(); @@ -759,8 +763,10 @@ bool Compiler<Emitter>::VisitImaginaryLiteral(const ImaginaryLiteral *E) { return true; if (!Initializing) { - unsigned LocalIndex = allocateTemporary(E); - if (!this->emitGetPtrLocal(LocalIndex, E)) + std::optional<unsigned> LocalIndex = allocateTemporary(E); + if (!LocalIndex) + return false; + if (!this->emitGetPtrLocal(*LocalIndex, E)) return false; } @@ -1118,8 +1124,10 @@ template <class Emitter> bool Compiler<Emitter>::VisitComplexBinOp(const BinaryOperator *E) { // Prepare storage for result. if (!Initializing) { - unsigned LocalIndex = allocateTemporary(E); - if (!this->emitGetPtrLocal(LocalIndex, E)) + std::optional<unsigned> LocalIndex = allocateTemporary(E); + if (!LocalIndex) + return false; + if (!this->emitGetPtrLocal(*LocalIndex, E)) return false; } @@ -1175,7 +1183,10 @@ bool Compiler<Emitter>::VisitComplexBinOp(const BinaryOperator *E) { if (!LHSIsComplex) { // This is using the RHS type for the fake-complex LHS. - LHSOffset = allocateTemporary(RHS); + std::optional<unsigned> LocalIndex = allocateTemporary(RHS); + if (!LocalIndex) + return false; + LHSOffset = *LocalIndex; if (!this->emitGetPtrLocal(LHSOffset, E)) return false; @@ -1347,8 +1358,10 @@ bool Compiler<Emitter>::VisitVectorBinOp(const BinaryOperator *E) { // Prepare storage for result. 
if (!Initializing && !E->isCompoundAssignmentOp()) { - unsigned LocalIndex = allocateTemporary(E); - if (!this->emitGetPtrLocal(LocalIndex, E)) + std::optional<unsigned> LocalIndex = allocateTemporary(E); + if (!LocalIndex) + return false; + if (!this->emitGetPtrLocal(*LocalIndex, E)) return false; } @@ -4170,14 +4183,16 @@ Compiler<Emitter>::allocateLocal(DeclTy &&Src, QualType Ty, } template <class Emitter> -unsigned Compiler<Emitter>::allocateTemporary(const Expr *E) { +std::optional<unsigned> Compiler<Emitter>::allocateTemporary(const Expr *E) { QualType Ty = E->getType(); assert(!Ty->isRecordType()); Descriptor *D = P.createDescriptor( E, Ty.getTypePtr(), Descriptor::InlineDescMD, Ty.isConstQualified(), /*IsTemporary=*/true, /*IsMutable=*/false, /*Init=*/nullptr); - assert(D); + + if (!D) + return std::nullopt; Scope::Local Local = this->createLocal(D); VariableScope<Emitter> *S = VarScope; @@ -4661,7 +4676,7 @@ bool Compiler<Emitter>::VisitCallExpr(const CallExpr *E) { OCE && OCE->isAssignmentOp()) { // Just like with regular assignments, we need to special-case assignment // operators here and evaluate the RHS (the second arg) before the LHS (the - // first arg. We fix this by using a Flip op later. + // first arg). We fix this by using a Flip op later. assert(Args.size() == 2); IsAssignmentOperatorCall = true; std::reverse(Args.begin(), Args.end()); @@ -4976,8 +4991,8 @@ bool Compiler<Emitter>::visitCompoundStmt(const CompoundStmt *S) { template <class Emitter> bool Compiler<Emitter>::visitDeclStmt(const DeclStmt *DS) { for (const auto *D : DS->decls()) { - if (isa<StaticAssertDecl, TagDecl, TypedefNameDecl, UsingEnumDecl, - FunctionDecl>(D)) + if (isa<StaticAssertDecl, TagDecl, TypedefNameDecl, BaseUsingDecl, + FunctionDecl, NamespaceAliasDecl>(D)) continue; const auto *VD = dyn_cast<VarDecl>(D); @@ -5649,6 +5664,21 @@ bool Compiler<Emitter>::compileDestructor(const CXXDestructorDecl *Dtor) { return this->emitPopPtr(Dtor) && this->emitRetVoid(Dtor); } +template <class Emitter> +bool Compiler<Emitter>::compileUnionAssignmentOperator( + const CXXMethodDecl *MD) { + if (!this->emitThis(MD)) + return false; + + auto PVD = MD->getParamDecl(0); + ParamOffset PO = this->Params[PVD]; // Must exist. + + if (!this->emitGetParam(PT_Ptr, PO.Offset, MD)) + return false; + + return this->emitMemcpy(MD) && this->emitRet(PT_Ptr, MD); +} + template <class Emitter> bool Compiler<Emitter>::visitFunc(const FunctionDecl *F) { // Classify the return type. @@ -5660,9 +5690,16 @@ bool Compiler<Emitter>::visitFunc(const FunctionDecl *F) { return this->compileDestructor(Dtor); // Emit custom code if this is a lambda static invoker. - if (const auto *MD = dyn_cast<CXXMethodDecl>(F); - MD && MD->isLambdaStaticInvoker()) - return this->emitLambdaStaticInvokerBody(MD); + if (const auto *MD = dyn_cast<CXXMethodDecl>(F)) { + const RecordDecl *RD = MD->getParent(); + + if (RD->isUnion() && + (MD->isCopyAssignmentOperator() || MD->isMoveAssignmentOperator())) + return this->compileUnionAssignmentOperator(MD); + + if (MD->isLambdaStaticInvoker()) + return this->emitLambdaStaticInvokerBody(MD); + } // Regular functions. 
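A sketch of the code the new compileUnionAssignmentOperator path handles (illustrative, not from the patch): a defaulted union copy assignment copies the object representation, which the bytecode models as this-pointer, parameter pointer, then a Memcpy op:

union U { int i; float f; };
constexpr int test() {
  U a{.i = 42};
  U b{.f = 1.0f};
  b = a;             // defaulted assignment: bitwise copy of the union
  return b.i;        // .i is the active member of the copied-into object
}
static_assert(test() == 42);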
if (const auto *Body = F->getBody()) diff --git clang/lib/AST/ByteCode/Compiler.h clang/lib/AST/ByteCode/Compiler.h index f9a597a16ef4..0a93c46a40ef 100644 --- clang/lib/AST/ByteCode/Compiler.h +++ clang/lib/AST/ByteCode/Compiler.h @@ -309,7 +309,7 @@ protected: std::optional<unsigned> allocateLocal(DeclTy &&Decl, QualType Ty = QualType(), const ValueDecl *ExtendingDecl = nullptr); - unsigned allocateTemporary(const Expr *E); + std::optional<unsigned> allocateTemporary(const Expr *E); private: friend class VariableScope<Emitter>; @@ -383,6 +383,7 @@ private: bool emitBuiltinBitCast(const CastExpr *E); bool compileConstructor(const CXXConstructorDecl *Ctor); bool compileDestructor(const CXXDestructorDecl *Dtor); + bool compileUnionAssignmentOperator(const CXXMethodDecl *MD); bool checkLiteralType(const Expr *E); diff --git clang/lib/AST/ByteCode/Context.cpp clang/lib/AST/ByteCode/Context.cpp index a322700fc0d2..aa434d5c8592 100644 --- clang/lib/AST/ByteCode/Context.cpp +++ clang/lib/AST/ByteCode/Context.cpp @@ -212,14 +212,11 @@ const llvm::fltSemantics &Context::getFloatSemantics(QualType T) const { bool Context::Run(State &Parent, const Function *Func) { { - InterpState State(Parent, *P, Stk, *this); - State.Current = new InterpFrame(State, Func, /*Caller=*/nullptr, CodePtr(), - Func->getArgSize()); + InterpState State(Parent, *P, Stk, *this, Func); if (Interpret(State)) { assert(Stk.empty()); return true; } - // State gets destroyed here, so the Stk.clear() below doesn't accidentally // remove values the State's destructor might access. } diff --git clang/lib/AST/ByteCode/EvalEmitter.cpp clang/lib/AST/ByteCode/EvalEmitter.cpp index 9763fe89b737..95149efbea99 100644 --- clang/lib/AST/ByteCode/EvalEmitter.cpp +++ clang/lib/AST/ByteCode/EvalEmitter.cpp @@ -17,11 +17,7 @@ using namespace clang::interp; EvalEmitter::EvalEmitter(Context &Ctx, Program &P, State &Parent, InterpStack &Stk) - : Ctx(Ctx), P(P), S(Parent, P, Stk, Ctx, this), EvalResult(&Ctx) { - // Create a dummy frame for the interpreter which does not have locals. - S.Current = - new InterpFrame(S, /*Func=*/nullptr, /*Caller=*/nullptr, CodePtr(), 0); -} + : Ctx(Ctx), P(P), S(Parent, P, Stk, Ctx, this), EvalResult(&Ctx) {} EvalEmitter::~EvalEmitter() { for (auto &[K, V] : Locals) { @@ -174,6 +170,9 @@ template <> bool EvalEmitter::emitRet<PT_Ptr>(const SourceInfo &Info) { return false; } } else { + if (!Ptr.isLive() && !Ptr.isTemporary()) + return false; + EvalResult.setValue(Ptr.toAPValue(Ctx.getASTContext())); } diff --git clang/lib/AST/ByteCode/EvalEmitter.h clang/lib/AST/ByteCode/EvalEmitter.h index e7c9e80d75d9..2cac2ba2ef22 100644 --- clang/lib/AST/ByteCode/EvalEmitter.h +++ clang/lib/AST/ByteCode/EvalEmitter.h @@ -17,7 +17,6 @@ #include "InterpState.h" #include "PrimType.h" #include "Source.h" -#include "llvm/Support/Error.h" namespace clang { namespace interp { @@ -42,8 +41,6 @@ public: /// Clean up all resources. 
void cleanup(); - InterpState &getState() { return S; } - protected: EvalEmitter(Context &Ctx, Program &P, State &Parent, InterpStack &Stk); diff --git clang/lib/AST/ByteCode/Function.cpp clang/lib/AST/ByteCode/Function.cpp index 896a4fb3f946..6b892dfd616c 100644 --- clang/lib/AST/ByteCode/Function.cpp +++ clang/lib/AST/ByteCode/Function.cpp @@ -19,12 +19,28 @@ Function::Function(Program &P, FunctionDeclTy Source, unsigned ArgSize, llvm::SmallVectorImpl<PrimType> &&ParamTypes, llvm::DenseMap<unsigned, ParamDescriptor> &&Params, llvm::SmallVectorImpl<unsigned> &&ParamOffsets, - bool HasThisPointer, bool HasRVO, unsigned BuiltinID) - : P(P), Source(Source), ArgSize(ArgSize), ParamTypes(std::move(ParamTypes)), - Params(std::move(Params)), ParamOffsets(std::move(ParamOffsets)), - HasThisPointer(HasThisPointer), HasRVO(HasRVO), BuiltinID(BuiltinID) { - if (const auto *F = Source.dyn_cast<const FunctionDecl *>()) + bool HasThisPointer, bool HasRVO) + : P(P), Kind(FunctionKind::Normal), Source(Source), ArgSize(ArgSize), + ParamTypes(std::move(ParamTypes)), Params(std::move(Params)), + ParamOffsets(std::move(ParamOffsets)), HasThisPointer(HasThisPointer), + HasRVO(HasRVO) { + if (const auto *F = dyn_cast<const FunctionDecl *>(Source)) { Variadic = F->isVariadic(); + BuiltinID = F->getBuiltinID(); + if (const auto *CD = dyn_cast<CXXConstructorDecl>(F)) { + Virtual = CD->isVirtual(); + Kind = FunctionKind::Ctor; + } else if (const auto *CD = dyn_cast<CXXDestructorDecl>(F)) { + Virtual = CD->isVirtual(); + Kind = FunctionKind::Dtor; + } else if (const auto *MD = dyn_cast<CXXMethodDecl>(F)) { + Virtual = MD->isVirtual(); + if (MD->isLambdaStaticInvoker()) + Kind = FunctionKind::LambdaStaticInvoker; + else if (clang::isLambdaCallOperator(F)) + Kind = FunctionKind::LambdaCallOperator; + } + } } Function::ParamDescriptor Function::getParamDescriptor(unsigned Offset) const { @@ -45,13 +61,6 @@ SourceInfo Function::getSource(CodePtr PC) const { return It->second; } -bool Function::isVirtual() const { - if (const auto *M = dyn_cast_if_present<CXXMethodDecl>( - Source.dyn_cast<const FunctionDecl *>())) - return M->isVirtual(); - return false; -} - /// Unevaluated builtins don't get their arguments put on the stack /// automatically. They instead operate on the AST of their Call /// Expression. diff --git clang/lib/AST/ByteCode/Function.h clang/lib/AST/ByteCode/Function.h index 409a80f59f1e..2d3421e5e612 100644 --- clang/lib/AST/ByteCode/Function.h +++ clang/lib/AST/ByteCode/Function.h @@ -80,6 +80,13 @@ using FunctionDeclTy = /// class Function final { public: + enum class FunctionKind { + Normal, + Ctor, + Dtor, + LambdaStaticInvoker, + LambdaCallOperator, + }; using ParamDescriptor = std::pair<PrimType, Descriptor *>; /// Returns the size of the function's local stack. @@ -141,43 +148,31 @@ public: bool isConstexpr() const { return IsValid || isLambdaStaticInvoker(); } /// Checks if the function is virtual. - bool isVirtual() const; + bool isVirtual() const { return Virtual; }; /// Checks if the function is a constructor. - bool isConstructor() const { - return isa_and_nonnull<CXXConstructorDecl>( - dyn_cast<const FunctionDecl *>(Source)); - } + bool isConstructor() const { return Kind == FunctionKind::Ctor; } /// Checks if the function is a destructor. - bool isDestructor() const { - return isa_and_nonnull<CXXDestructorDecl>( - dyn_cast<const FunctionDecl *>(Source)); - } - - /// Returns the parent record decl, if any. 
- const CXXRecordDecl *getParentDecl() const { - if (const auto *MD = dyn_cast_if_present<CXXMethodDecl>( - dyn_cast<const FunctionDecl *>(Source))) - return MD->getParent(); - return nullptr; - } + bool isDestructor() const { return Kind == FunctionKind::Dtor; } /// Returns whether this function is a lambda static invoker, /// which we generate custom byte code for. bool isLambdaStaticInvoker() const { - if (const auto *MD = dyn_cast_if_present<CXXMethodDecl>( - dyn_cast<const FunctionDecl *>(Source))) - return MD->isLambdaStaticInvoker(); - return false; + return Kind == FunctionKind::LambdaStaticInvoker; } /// Returns whether this function is the call operator /// of a lambda record decl. bool isLambdaCallOperator() const { + return Kind == FunctionKind::LambdaCallOperator; + } + + /// Returns the parent record decl, if any. + const CXXRecordDecl *getParentDecl() const { if (const auto *MD = dyn_cast_if_present<CXXMethodDecl>( dyn_cast<const FunctionDecl *>(Source))) - return clang::isLambdaCallOperator(MD); - return false; + return MD->getParent(); + return nullptr; } /// Checks if the function is fully done compiling. @@ -213,7 +208,7 @@ public: bool isThisPointerExplicit() const { if (const auto *MD = dyn_cast_if_present<CXXMethodDecl>( - Source.dyn_cast<const FunctionDecl *>())) + dyn_cast<const FunctionDecl *>(Source))) return MD->isExplicitObjectMemberFunction(); return false; } @@ -232,7 +227,7 @@ private: llvm::SmallVectorImpl<PrimType> &&ParamTypes, llvm::DenseMap<unsigned, ParamDescriptor> &&Params, llvm::SmallVectorImpl<unsigned> &&ParamOffsets, bool HasThisPointer, - bool HasRVO, unsigned BuiltinID); + bool HasRVO); /// Sets the code of a function. void setCode(unsigned NewFrameSize, std::vector<std::byte> &&NewCode, @@ -255,6 +250,8 @@ private: /// Program reference. Program &P; + /// Function Kind. + FunctionKind Kind; /// Declaration this function was compiled from. FunctionDeclTy Source; /// Local area size: storage + metadata. 
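The Function changes above replace repeated dyn_cast chains with a classification computed once; a minimal sketch of the pattern (illustrative names, not the real class):

enum class Kind { Normal, Ctor, Dtor, LambdaStaticInvoker, LambdaCallOperator };
class FunctionSketch {
  Kind K = Kind::Normal; // classified once, in the constructor
public:
  bool isConstructor() const { return K == Kind::Ctor; }
  bool isDestructor() const { return K == Kind::Dtor; }
  // One byte of state buys O(1) queries on the interpreter's hot call
  // paths instead of isa/dyn_cast over the source declaration each time.
};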
@@ -289,6 +286,7 @@ private: bool HasBody = false; bool Defined = false; bool Variadic = false; + bool Virtual = false; unsigned BuiltinID = 0; public: diff --git clang/lib/AST/ByteCode/Interp.cpp clang/lib/AST/ByteCode/Interp.cpp index f91820e16fac..1123ced99eb0 100644 --- clang/lib/AST/ByteCode/Interp.cpp +++ clang/lib/AST/ByteCode/Interp.cpp @@ -65,17 +65,17 @@ static void diagnoseNonConstVariable(InterpState &S, CodePtr OpPC, const ValueDecl *VD); static bool diagnoseUnknownDecl(InterpState &S, CodePtr OpPC, const ValueDecl *D) { - const SourceInfo &E = S.Current->getSource(OpPC); if (isa<ParmVarDecl>(D)) { if (D->getType()->isReferenceType()) return false; + const SourceInfo &Loc = S.Current->getSource(OpPC); if (S.getLangOpts().CPlusPlus11) { - S.FFDiag(E, diag::note_constexpr_function_param_value_unknown) << D; + S.FFDiag(Loc, diag::note_constexpr_function_param_value_unknown) << D; S.Note(D->getLocation(), diag::note_declared_at) << D->getSourceRange(); } else { - S.FFDiag(E); + S.FFDiag(Loc); } return false; } diff --git clang/lib/AST/ByteCode/Interp.h clang/lib/AST/ByteCode/Interp.h index 063970afec9e..9f029adc7083 100644 --- clang/lib/AST/ByteCode/Interp.h +++ clang/lib/AST/ByteCode/Interp.h @@ -325,11 +325,11 @@ bool Ret(InterpState &S, CodePtr &PC) { if (InterpFrame *Caller = S.Current->Caller) { PC = S.Current->getRetPC(); - delete S.Current; + InterpFrame::free(S.Current); S.Current = Caller; S.Stk.push<T>(Ret); } else { - delete S.Current; + InterpFrame::free(S.Current); S.Current = nullptr; // The topmost frame should come from an EvalEmitter, // which has its own implementation of the Ret<> instruction. @@ -345,10 +345,10 @@ inline bool RetVoid(InterpState &S, CodePtr &PC) { if (InterpFrame *Caller = S.Current->Caller) { PC = S.Current->getRetPC(); - delete S.Current; + InterpFrame::free(S.Current); S.Current = Caller; } else { - delete S.Current; + InterpFrame::free(S.Current); S.Current = nullptr; } return true; @@ -379,15 +379,14 @@ bool AddSubMulHelper(InterpState &S, CodePtr OpPC, unsigned Bits, const T &LHS, APSInt Value = OpAP<APSInt>()(LHS.toAPSInt(Bits), RHS.toAPSInt(Bits)); // Report undefined behaviour, stopping if required. 
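The hunks that follow all apply the same micro-optimization; a sketch of the shape (illustrative):

if (S.checkingForUndefinedBehavior()) {       // cold diagnostic path
  const Expr *E = S.Current->getExpr(OpPC);   // source lookup paid only here
  QualType Type = E->getType();
  // ... emit warn_integer_constant_overflow ...
}
// The hot path no longer does any source-location work.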
- const Expr *E = S.Current->getExpr(OpPC); - QualType Type = E->getType(); if (S.checkingForUndefinedBehavior()) { + const Expr *E = S.Current->getExpr(OpPC); + QualType Type = E->getType(); SmallString<32> Trunc; Value.trunc(Result.bitWidth()) .toString(Trunc, 10, Result.isSigned(), /*formatAsCLiteral=*/false, /*UpperCase=*/true, /*InsertSeparators=*/true); - auto Loc = E->getExprLoc(); - S.report(Loc, diag::warn_integer_constant_overflow) + S.report(E->getExprLoc(), diag::warn_integer_constant_overflow) << Trunc << Type << E->getSourceRange(); } @@ -737,16 +736,14 @@ bool Neg(InterpState &S, CodePtr OpPC) { S.Stk.push<T>(Result); APSInt NegatedValue = -Value.toAPSInt(Value.bitWidth() + 1); - const Expr *E = S.Current->getExpr(OpPC); - QualType Type = E->getType(); - if (S.checkingForUndefinedBehavior()) { + const Expr *E = S.Current->getExpr(OpPC); + QualType Type = E->getType(); SmallString<32> Trunc; NegatedValue.trunc(Result.bitWidth()) .toString(Trunc, 10, Result.isSigned(), /*formatAsCLiteral=*/false, /*UpperCase=*/true, /*InsertSeparators=*/true); - auto Loc = E->getExprLoc(); - S.report(Loc, diag::warn_integer_constant_overflow) + S.report(E->getExprLoc(), diag::warn_integer_constant_overflow) << Trunc << Type << E->getSourceRange(); return true; } @@ -800,15 +797,14 @@ bool IncDecHelper(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { APResult = --Value.toAPSInt(Bits); // Report undefined behaviour, stopping if required. - const Expr *E = S.Current->getExpr(OpPC); - QualType Type = E->getType(); if (S.checkingForUndefinedBehavior()) { + const Expr *E = S.Current->getExpr(OpPC); + QualType Type = E->getType(); SmallString<32> Trunc; APResult.trunc(Result.bitWidth()) .toString(Trunc, 10, Result.isSigned(), /*formatAsCLiteral=*/false, /*UpperCase=*/true, /*InsertSeparators=*/true); - auto Loc = E->getExprLoc(); - S.report(Loc, diag::warn_integer_constant_overflow) + S.report(E->getExprLoc(), diag::warn_integer_constant_overflow) << Trunc << Type << E->getSourceRange(); return true; } diff --git clang/lib/AST/ByteCode/InterpFrame.cpp clang/lib/AST/ByteCode/InterpFrame.cpp index 20f67d9b1fd4..89fc7a4515d6 100644 --- clang/lib/AST/ByteCode/InterpFrame.cpp +++ clang/lib/AST/ByteCode/InterpFrame.cpp @@ -23,11 +23,15 @@ using namespace clang; using namespace clang::interp; +InterpFrame::InterpFrame(InterpState &S) + : Caller(nullptr), S(S), Depth(0), Func(nullptr), RetPC(CodePtr()), + ArgSize(0), Args(nullptr), FrameOffset(0), IsBottom(true) {} + InterpFrame::InterpFrame(InterpState &S, const Function *Func, InterpFrame *Caller, CodePtr RetPC, unsigned ArgSize) : Caller(Caller), S(S), Depth(Caller ? Caller->Depth + 1 : 0), Func(Func), RetPC(RetPC), ArgSize(ArgSize), Args(static_cast<char *>(S.Stk.top())), - FrameOffset(S.Stk.size()) { + FrameOffset(S.Stk.size()), IsBottom(!Caller) { if (!Func) return; @@ -73,11 +77,15 @@ InterpFrame::~InterpFrame() { // When destroying the InterpFrame, call the Dtor for all blocks // that haven't been destroyed via a destroy() op yet. // This happens when the execution is interrupted midway-through.
- if (Func) { - for (auto &Scope : Func->scopes()) { - for (auto &Local : Scope.locals()) { - S.deallocate(localBlock(Local.Offset)); - } + destroyScopes(); +} + +void InterpFrame::destroyScopes() { + if (!Func) + return; + for (auto &Scope : Func->scopes()) { + for (auto &Local : Scope.locals()) { + S.deallocate(localBlock(Local.Offset)); } } } @@ -244,7 +252,7 @@ SourceInfo InterpFrame::getSource(CodePtr PC) const { const Expr *InterpFrame::getExpr(CodePtr PC) const { if (Func && !funcHasUsableBody(Func) && Caller) - return Caller->getExpr(PC); + return Caller->getExpr(RetPC); return S.getExpr(Func, PC); } diff --git clang/lib/AST/ByteCode/InterpFrame.h clang/lib/AST/ByteCode/InterpFrame.h index 7cfc3ac68b4f..360e6bff1232 100644 --- clang/lib/AST/ByteCode/InterpFrame.h +++ clang/lib/AST/ByteCode/InterpFrame.h @@ -28,6 +28,9 @@ public: /// The frame of the previous function. InterpFrame *Caller; + /// Bottom Frame. + InterpFrame(InterpState &S); + /// Creates a new frame for a method call. InterpFrame(InterpState &S, const Function *Func, InterpFrame *Caller, CodePtr RetPC, unsigned ArgSize); @@ -42,9 +45,15 @@ public: /// Destroys the frame, killing all live pointers to stack slots. ~InterpFrame(); + static void free(InterpFrame *F) { + if (!F->isBottomFrame()) + delete F; + } + /// Invokes the destructors for a scope. void destroy(unsigned Idx); void initScope(unsigned Idx); + void destroyScopes(); /// Describes the frame with arguments for diagnostic purposes. void describe(llvm::raw_ostream &OS) const override; @@ -119,6 +128,8 @@ public: bool isStdFunction() const; + bool isBottomFrame() const { return IsBottom; } + void dump() const { dump(llvm::errs(), 0); } void dump(llvm::raw_ostream &OS, unsigned Indent = 0) const; @@ -167,6 +178,7 @@ private: const size_t FrameOffset; /// Mapping from arg offsets to their argument blocks. 
llvm::DenseMap<unsigned, std::unique_ptr<char[]>> Params; + bool IsBottom = false; }; } // namespace interp diff --git clang/lib/AST/ByteCode/InterpState.cpp clang/lib/AST/ByteCode/InterpState.cpp index 287c3bd3bca3..70a2e9b62fc3 100644 --- clang/lib/AST/ByteCode/InterpState.cpp +++ clang/lib/AST/ByteCode/InterpState.cpp @@ -17,7 +17,14 @@ using namespace clang::interp; InterpState::InterpState(State &Parent, Program &P, InterpStack &Stk, Context &Ctx, SourceMapper *M) - : Parent(Parent), M(M), P(P), Stk(Stk), Ctx(Ctx), Current(nullptr) {} + : Parent(Parent), M(M), P(P), Stk(Stk), Ctx(Ctx), BottomFrame(*this), + Current(&BottomFrame) {} + +InterpState::InterpState(State &Parent, Program &P, InterpStack &Stk, + Context &Ctx, const Function *Func) + : Parent(Parent), M(nullptr), P(P), Stk(Stk), Ctx(Ctx), + BottomFrame(*this, Func, nullptr, CodePtr(), Func->getArgSize()), + Current(&BottomFrame) {} bool InterpState::inConstantContext() const { if (ConstantContextOverride) @@ -27,11 +34,12 @@ bool InterpState::inConstantContext() const { } InterpState::~InterpState() { - while (Current) { + while (Current && !Current->isBottomFrame()) { InterpFrame *Next = Current->Caller; delete Current; Current = Next; } + BottomFrame.destroyScopes(); while (DeadBlocks) { DeadBlock *Next = DeadBlocks->Next; diff --git clang/lib/AST/ByteCode/InterpState.h clang/lib/AST/ByteCode/InterpState.h index 2a1311c86a2f..d6adfff1a713 100644 --- clang/lib/AST/ByteCode/InterpState.h +++ clang/lib/AST/ByteCode/InterpState.h @@ -37,6 +37,8 @@ class InterpState final : public State, public SourceMapper { public: InterpState(State &Parent, Program &P, InterpStack &Stk, Context &Ctx, SourceMapper *M = nullptr); + InterpState(State &Parent, Program &P, InterpStack &Stk, Context &Ctx, + const Function *Func); ~InterpState(); @@ -134,6 +136,8 @@ public: InterpStack &Stk; /// Interpreter Context. Context &Ctx; + /// Bottom function frame. + InterpFrame BottomFrame; /// The current frame. InterpFrame *Current = nullptr; /// Source location of the evaluating expression diff --git clang/lib/AST/ByteCode/Program.cpp clang/lib/AST/ByteCode/Program.cpp index 1ffe7cd721f1..e0b86d46428a 100644 --- clang/lib/AST/ByteCode/Program.cpp +++ clang/lib/AST/ByteCode/Program.cpp @@ -35,6 +35,7 @@ const void *Program::getNativePointer(unsigned Idx) { unsigned Program::createGlobalString(const StringLiteral *S, const Expr *Base) { const size_t CharWidth = S->getCharByteWidth(); const size_t BitWidth = CharWidth * Ctx.getCharBit(); + unsigned StringLength = S->getLength(); PrimType CharType; switch (CharWidth) { @@ -55,15 +56,15 @@ unsigned Program::createGlobalString(const StringLiteral *S, const Expr *Base) { Base = S; // Create a descriptor for the string. - Descriptor *Desc = allocateDescriptor(Base, CharType, Descriptor::GlobalMD, - S->getLength() + 1, - /*isConst=*/true, - /*isTemporary=*/false, - /*isMutable=*/false); + Descriptor *Desc = + allocateDescriptor(Base, CharType, Descriptor::GlobalMD, StringLength + 1, + /*isConst=*/true, + /*isTemporary=*/false, + /*isMutable=*/false); // Allocate storage for the string. // The byte length does not include the null terminator. - unsigned I = Globals.size(); + unsigned GlobalIndex = Globals.size(); unsigned Sz = Desc->getAllocSize(); auto *G = new (Allocator, Sz) Global(Ctx.getEvalID(), Desc, /*isStatic=*/true, /*isExtern=*/false); @@ -74,33 +75,32 @@ unsigned Program::createGlobalString(const StringLiteral *S, const Expr *Base) { // Construct the string in storage. 
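A sketch of the frame-ownership model the BottomFrame change above establishes (illustrative; simplified signatures):

InterpState S(Parent, P, Stk, Ctx);  // S.Current == &S.BottomFrame: no heap allocation
// On a call:   S.Current = new InterpFrame(S, Func, /*Caller=*/S.Current, PC, ArgSize);
// On a return: InterpFrame::free(S.Current);  // deletes call frames, skips BottomFrame
//              S.Current = Caller;
// ~InterpState() walks only the frames above the bottom one, then runs
// BottomFrame.destroyScopes() to release the bottom frame's locals.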
const Pointer Ptr(G->block()); - for (unsigned I = 0, N = S->getLength(); I <= N; ++I) { - Pointer Field = Ptr.atIndex(I).narrow(); - const uint32_t CodePoint = I == N ? 0 : S->getCodeUnit(I); + for (unsigned I = 0; I <= StringLength; ++I) { + Pointer Field = Ptr.atIndex(I); + const uint32_t CodePoint = I == StringLength ? 0 : S->getCodeUnit(I); switch (CharType) { case PT_Sint8: { using T = PrimConv<PT_Sint8>::T; Field.deref<T>() = T::from(CodePoint, BitWidth); - Field.initialize(); break; } case PT_Uint16: { using T = PrimConv<PT_Uint16>::T; Field.deref<T>() = T::from(CodePoint, BitWidth); - Field.initialize(); break; } case PT_Uint32: { using T = PrimConv<PT_Uint32>::T; Field.deref<T>() = T::from(CodePoint, BitWidth); - Field.initialize(); break; } default: llvm_unreachable("unsupported character type"); } } - return I; + Ptr.initialize(); + + return GlobalIndex; } Pointer Program::getPtrGlobal(unsigned Idx) const { diff --git clang/lib/AST/Decl.cpp clang/lib/AST/Decl.cpp index 610207cf8b9a..7e8a172ae4c3 100644 --- clang/lib/AST/Decl.cpp +++ clang/lib/AST/Decl.cpp @@ -3065,7 +3065,6 @@ FunctionDecl::FunctionDecl(Kind DK, ASTContext &C, DeclContext *DC, FunctionDeclBits.IsIneligibleOrNotSelected = false; FunctionDeclBits.HasImplicitReturnZero = false; FunctionDeclBits.IsLateTemplateParsed = false; - FunctionDeclBits.IsInstantiatedFromMemberTemplate = false; FunctionDeclBits.ConstexprKind = static_cast<uint64_t>(ConstexprKind); FunctionDeclBits.BodyContainsImmediateEscalatingExpression = false; FunctionDeclBits.InstantiationIsPending = false; diff --git clang/lib/AST/DeclBase.cpp clang/lib/AST/DeclBase.cpp index c0a331d18cab..fc16448cf9e9 100644 --- clang/lib/AST/DeclBase.cpp +++ clang/lib/AST/DeclBase.cpp @@ -1203,6 +1203,8 @@ const FunctionType *Decl::getFunctionType(bool BlocksToo) const { if (Ty->isFunctionPointerType()) Ty = Ty->castAs<PointerType>()->getPointeeType(); + else if (Ty->isMemberFunctionPointerType()) + Ty = Ty->castAs<MemberPointerType>()->getPointeeType(); else if (Ty->isFunctionReferenceType()) Ty = Ty->castAs<ReferenceType>()->getPointeeType(); else if (BlocksToo && Ty->isBlockPointerType()) diff --git clang/lib/AST/ItaniumMangle.cpp clang/lib/AST/ItaniumMangle.cpp index e889b74a5cda..e5eb22eae7dd 100644 --- clang/lib/AST/ItaniumMangle.cpp +++ clang/lib/AST/ItaniumMangle.cpp @@ -3419,24 +3419,24 @@ void CXXNameMangler::mangleType(const BuiltinType *T) { /* Prior to Clang 18.0 we used this incorrect mangled name */ \ mangleVendorType("__SVBFloat16_t"); \ } else { \ - type_name = MangledName; \ - Out << (type_name == Name ? "u" : "") << type_name.size() << type_name; \ + type_name = #MangledName; \ + Out << (type_name == #Name ? "u" : "") << type_name.size() << type_name; \ } \ break; #define SVE_PREDICATE_TYPE(Name, MangledName, Id, SingletonId) \ case BuiltinType::Id: \ - type_name = MangledName; \ - Out << (type_name == Name ? "u" : "") << type_name.size() << type_name; \ + type_name = #MangledName; \ + Out << (type_name == #Name ? "u" : "") << type_name.size() << type_name; \ break; #define SVE_OPAQUE_TYPE(Name, MangledName, Id, SingletonId) \ case BuiltinType::Id: \ - type_name = MangledName; \ - Out << (type_name == Name ? "u" : "") << type_name.size() << type_name; \ + type_name = #MangledName; \ + Out << (type_name == #Name ? "u" : "") << type_name.size() << type_name; \ break; #define SVE_SCALAR_TYPE(Name, MangledName, Id, SingletonId, Bits) \ case BuiltinType::Id: \ - type_name = MangledName; \ - Out << (type_name == Name ? 
"u" : "") << type_name.size() << type_name; \ + type_name = #MangledName; \ + Out << (type_name == #Name ? "u" : "") << type_name.size() << type_name; \ break; #include "clang/Basic/AArch64SVEACLETypes.def" #define PPC_VECTOR_TYPE(Name, Id, Size) \ @@ -4198,7 +4198,7 @@ void CXXNameMangler::mangleRISCVFixedRVVVectorType(const VectorType *T) { // Apend the LMUL suffix. auto VScale = getASTContext().getTargetInfo().getVScaleRange( - getASTContext().getLangOpts()); + getASTContext().getLangOpts(), false); unsigned VLen = VScale->first * llvm::RISCV::RVVBitsPerBlock; if (T->getVectorKind() == VectorKind::RVVFixedLengthData) { diff --git clang/lib/AST/MicrosoftMangle.cpp clang/lib/AST/MicrosoftMangle.cpp index edeeaeaa9ae1..fe34251688a9 100644 --- clang/lib/AST/MicrosoftMangle.cpp +++ clang/lib/AST/MicrosoftMangle.cpp @@ -2792,6 +2792,10 @@ void MicrosoftCXXNameMangler::mangleType(const BuiltinType *T, Qualifiers, mangleArtificialTagType(TagTypeKind::Struct, "__bf16", {"__clang"}); break; + case BuiltinType::MFloat8: + mangleArtificialTagType(TagTypeKind::Struct, "__mfp8", {"__clang"}); + break; + #define WASM_REF_TYPE(InternalName, MangledName, Id, SingletonId, AS) \ case BuiltinType::Id: \ mangleArtificialTagType(TagTypeKind::Struct, MangledName); \ @@ -2806,48 +2810,13 @@ void MicrosoftCXXNameMangler::mangleType(const BuiltinType *T, Qualifiers, break; #include "clang/Basic/HLSLIntangibleTypes.def" -#define SVE_TYPE(Name, Id, SingletonId) \ - case BuiltinType::Id: -#include "clang/Basic/AArch64SVEACLETypes.def" -#define PPC_VECTOR_TYPE(Name, Id, Size) \ - case BuiltinType::Id: -#include "clang/Basic/PPCTypes.def" -#define RVV_TYPE(Name, Id, SingletonId) case BuiltinType::Id: -#include "clang/Basic/RISCVVTypes.def" -#define AMDGPU_TYPE(Name, Id, SingletonId, Width, Align) case BuiltinType::Id: -#include "clang/Basic/AMDGPUTypes.def" - case BuiltinType::ShortAccum: - case BuiltinType::Accum: - case BuiltinType::LongAccum: - case BuiltinType::UShortAccum: - case BuiltinType::UAccum: - case BuiltinType::ULongAccum: - case BuiltinType::ShortFract: - case BuiltinType::Fract: - case BuiltinType::LongFract: - case BuiltinType::UShortFract: - case BuiltinType::UFract: - case BuiltinType::ULongFract: - case BuiltinType::SatShortAccum: - case BuiltinType::SatAccum: - case BuiltinType::SatLongAccum: - case BuiltinType::SatUShortAccum: - case BuiltinType::SatUAccum: - case BuiltinType::SatULongAccum: - case BuiltinType::SatShortFract: - case BuiltinType::SatFract: - case BuiltinType::SatLongFract: - case BuiltinType::SatUShortFract: - case BuiltinType::SatUFract: - case BuiltinType::SatULongFract: - case BuiltinType::Ibm128: - case BuiltinType::Float128: { + // Issue an error for any type not explicitly handled. 
+ default: Error(Range.getBegin(), "built-in type: ", T->getName(Context.getASTContext().getPrintingPolicy())) << Range; break; } - } } // <type> ::= <function-type> diff --git clang/lib/AST/ParentMapContext.cpp clang/lib/AST/ParentMapContext.cpp index 2e77e1d7c4c6..e9387ec79c37 100644 --- clang/lib/AST/ParentMapContext.cpp +++ clang/lib/AST/ParentMapContext.cpp @@ -395,7 +395,7 @@ private: if (!isa<ParentVector *>(NodeOrVector)) { auto *Vector = new ParentVector( 1, getSingleDynTypedNodeFromParentMap(NodeOrVector)); - delete NodeOrVector.template dyn_cast<DynTypedNode *>(); + delete dyn_cast<DynTypedNode *>(NodeOrVector); NodeOrVector = Vector; } diff --git clang/lib/AST/StmtOpenACC.cpp clang/lib/AST/StmtOpenACC.cpp index 2b0ac716bab5..11eab0c27579 100644 --- clang/lib/AST/StmtOpenACC.cpp +++ clang/lib/AST/StmtOpenACC.cpp @@ -305,3 +305,19 @@ OpenACCUpdateConstruct::Create(const ASTContext &C, SourceLocation Start, new (Mem) OpenACCUpdateConstruct(Start, DirectiveLoc, End, Clauses); return Inst; } + +OpenACCAtomicConstruct * +OpenACCAtomicConstruct::CreateEmpty(const ASTContext &C) { + void *Mem = C.Allocate(sizeof(OpenACCAtomicConstruct)); + auto *Inst = new (Mem) OpenACCAtomicConstruct(EmptyShell{}); + return Inst; +} + +OpenACCAtomicConstruct *OpenACCAtomicConstruct::Create( + const ASTContext &C, SourceLocation Start, SourceLocation DirectiveLoc, + OpenACCAtomicKind AtKind, SourceLocation End, Stmt *AssociatedStmt) { + void *Mem = C.Allocate(sizeof(OpenACCAtomicConstruct)); + auto *Inst = new (Mem) + OpenACCAtomicConstruct(Start, DirectiveLoc, AtKind, End, AssociatedStmt); + return Inst; +} diff --git clang/lib/AST/StmtPrinter.cpp clang/lib/AST/StmtPrinter.cpp index d523abfe3128..bae4134e2835 100644 --- clang/lib/AST/StmtPrinter.cpp +++ clang/lib/AST/StmtPrinter.cpp @@ -1242,6 +1242,16 @@ void StmtPrinter::VisitOpenACCWaitConstruct(OpenACCWaitConstruct *S) { OS << '\n'; } +void StmtPrinter::VisitOpenACCAtomicConstruct(OpenACCAtomicConstruct *S) { + Indent() << "#pragma acc atomic"; + + if (S->getAtomicKind() != OpenACCAtomicKind::None) + OS << " " << S->getAtomicKind(); + + OS << '\n'; + PrintStmt(S->getAssociatedStmt()); +} + //===----------------------------------------------------------------------===// // Expr printing methods. 
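For reference, a sketch of what the new VisitOpenACCAtomicConstruct printer above emits (illustrative output): the directive, the kind when it is not OpenACCAtomicKind::None, then the associated statement:

#pragma acc atomic capture
{
  v = x;
  x = x + 1;
}
// A plain `#pragma acc atomic` (kind None) prints without a trailing keyword.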
//===----------------------------------------------------------------------===// diff --git clang/lib/AST/StmtProfile.cpp clang/lib/AST/StmtProfile.cpp index 84985fcb20ff..36d231e21fa9 100644 --- clang/lib/AST/StmtProfile.cpp +++ clang/lib/AST/StmtProfile.cpp @@ -2809,6 +2809,11 @@ void StmtProfiler::VisitOpenACCUpdateConstruct( P.VisitOpenACCClauseList(S->clauses()); } +void StmtProfiler::VisitOpenACCAtomicConstruct( + const OpenACCAtomicConstruct *S) { + VisitStmt(S); +} + void StmtProfiler::VisitHLSLOutArgExpr(const HLSLOutArgExpr *S) { VisitStmt(S); } diff --git clang/lib/AST/TemplateName.cpp clang/lib/AST/TemplateName.cpp index 3a1eb1ca12f4..9e0a7dc2b8cd 100644 --- clang/lib/AST/TemplateName.cpp +++ clang/lib/AST/TemplateName.cpp @@ -144,7 +144,7 @@ TemplateName::TemplateName(DeducedTemplateStorage *Deduced) bool TemplateName::isNull() const { return Storage.isNull(); } TemplateName::NameKind TemplateName::getKind() const { - if (auto *ND = Storage.dyn_cast<Decl *>()) { + if (auto *ND = dyn_cast<Decl *>(Storage)) { if (isa<UsingShadowDecl>(ND)) return UsingTemplate; assert(isa<TemplateDecl>(ND)); diff --git clang/lib/AST/TextNodeDumper.cpp clang/lib/AST/TextNodeDumper.cpp index a57cba959748..10d7e4c0c738 100644 --- clang/lib/AST/TextNodeDumper.cpp +++ clang/lib/AST/TextNodeDumper.cpp @@ -958,9 +958,9 @@ void TextNodeDumper::dumpAccessSpecifier(AccessSpecifier AS) { void TextNodeDumper::dumpCleanupObject( const ExprWithCleanups::CleanupObject &C) { - if (auto *BD = C.dyn_cast<BlockDecl *>()) + if (auto *BD = dyn_cast<BlockDecl *>(C)) dumpDeclRef(BD, "cleanup"); - else if (auto *CLE = C.dyn_cast<CompoundLiteralExpr *>()) + else if (auto *CLE = dyn_cast<CompoundLiteralExpr *>(C)) AddChild([=] { OS << "cleanup "; { @@ -3041,6 +3041,12 @@ void TextNodeDumper::VisitOpenACCUpdateConstruct( VisitOpenACCConstructStmt(S); } +void TextNodeDumper::VisitOpenACCAtomicConstruct( + const OpenACCAtomicConstruct *S) { + VisitOpenACCConstructStmt(S); + OS << ' ' << S->getAtomicKind(); +} + void TextNodeDumper::VisitEmbedExpr(const EmbedExpr *S) { AddChild("begin", [=] { OS << S->getStartingElementPos(); }); AddChild("number of elements", [=] { OS << S->getDataElementCount(); }); diff --git clang/lib/AST/Type.cpp clang/lib/AST/Type.cpp index fde0746a1757..8c11ec2e1fe2 100644 --- clang/lib/AST/Type.cpp +++ clang/lib/AST/Type.cpp @@ -3480,9 +3480,9 @@ StringRef BuiltinType::getName(const PrintingPolicy &Policy) const { case Id: \ return #ExtType; #include "clang/Basic/OpenCLExtensionTypes.def" -#define SVE_TYPE(Name, Id, SingletonId) \ - case Id: \ - return Name; +#define SVE_TYPE(Name, Id, SingletonId) \ + case Id: \ + return #Name; #include "clang/Basic/AArch64SVEACLETypes.def" #define PPC_VECTOR_TYPE(Name, Id, Size) \ case Id: \ diff --git clang/lib/Analysis/ExprMutationAnalyzer.cpp clang/lib/Analysis/ExprMutationAnalyzer.cpp index d7b44149d0fc..8944343484e5 100644 --- clang/lib/Analysis/ExprMutationAnalyzer.cpp +++ clang/lib/Analysis/ExprMutationAnalyzer.cpp @@ -806,17 +806,15 @@ FunctionParmMutationAnalyzer::FunctionParmMutationAnalyzer( const Stmt * FunctionParmMutationAnalyzer::findMutation(const ParmVarDecl *Parm) { - const auto Memoized = Results.find(Parm); - if (Memoized != Results.end()) - return Memoized->second; + auto [Place, Inserted] = Results.try_emplace(Parm); + if (!Inserted) + return Place->second; + // To handle call A -> call B -> call A. Assume parameters of A are not mutated // before analyzing parameters of A.
Then when analyzing the second "call A", // FunctionParmMutationAnalyzer can use this memoized value to avoid infinite // recursion. - Results[Parm] = nullptr; - if (const Stmt *S = BodyAnalyzer.findMutation(Parm)) - return Results[Parm] = S; - return Results[Parm]; + return Place->second = BodyAnalyzer.findMutation(Parm); } } // namespace clang diff --git clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp index 4b86daa56d7b..1c4fe5c6d501 100644 --- clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp +++ clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp @@ -160,8 +160,9 @@ Atom DataflowAnalysisContext::joinFlowConditions(Atom FirstToken, Atom SecondToken) { Atom Token = arena().makeFlowConditionToken(); - FlowConditionDeps[Token].insert(FirstToken); - FlowConditionDeps[Token].insert(SecondToken); + auto &TokenDeps = FlowConditionDeps[Token]; + TokenDeps.insert(FirstToken); + TokenDeps.insert(SecondToken); addFlowConditionConstraint(Token, arena().makeOr(arena().makeAtomRef(FirstToken), arena().makeAtomRef(SecondToken))); diff --git clang/lib/Analysis/FlowSensitive/SmartPointerAccessorCaching.cpp clang/lib/Analysis/FlowSensitive/SmartPointerAccessorCaching.cpp index c58bd309545d..b73f9e275144 100644 --- clang/lib/Analysis/FlowSensitive/SmartPointerAccessorCaching.cpp +++ clang/lib/Analysis/FlowSensitive/SmartPointerAccessorCaching.cpp @@ -14,6 +14,7 @@ using ast_matchers::callee; using ast_matchers::cxxMemberCallExpr; using ast_matchers::cxxMethodDecl; using ast_matchers::cxxOperatorCallExpr; +using ast_matchers::hasCanonicalType; using ast_matchers::hasName; using ast_matchers::hasOverloadedOperatorName; using ast_matchers::ofClass; @@ -122,27 +123,29 @@ namespace clang::dataflow { ast_matchers::StatementMatcher isSmartPointerLikeOperatorStar() { return cxxOperatorCallExpr( hasOverloadedOperatorName("*"), - callee(cxxMethodDecl(parameterCountIs(0), returns(referenceType()), + callee(cxxMethodDecl(parameterCountIs(0), + returns(hasCanonicalType(referenceType())), ofClass(smartPointerClassWithGetOrValue())))); } ast_matchers::StatementMatcher isSmartPointerLikeOperatorArrow() { return cxxOperatorCallExpr( hasOverloadedOperatorName("->"), - callee(cxxMethodDecl(parameterCountIs(0), returns(pointerType()), + callee(cxxMethodDecl(parameterCountIs(0), + returns(hasCanonicalType(pointerType())), ofClass(smartPointerClassWithGetOrValue())))); } ast_matchers::StatementMatcher isSmartPointerLikeValueMethodCall() { - return cxxMemberCallExpr(callee( - cxxMethodDecl(parameterCountIs(0), returns(referenceType()), - hasName("value"), ofClass(smartPointerClassWithValue())))); + return cxxMemberCallExpr(callee(cxxMethodDecl( + parameterCountIs(0), returns(hasCanonicalType(referenceType())), + hasName("value"), ofClass(smartPointerClassWithValue())))); } ast_matchers::StatementMatcher isSmartPointerLikeGetMethodCall() { - return cxxMemberCallExpr(callee( - cxxMethodDecl(parameterCountIs(0), returns(pointerType()), hasName("get"), - ofClass(smartPointerClassWithGet())))); + return cxxMemberCallExpr(callee(cxxMethodDecl( + parameterCountIs(0), returns(hasCanonicalType(pointerType())), + hasName("get"), ofClass(smartPointerClassWithGet())))); } const FunctionDecl * diff --git clang/lib/Basic/Sanitizers.cpp clang/lib/Basic/Sanitizers.cpp index 5b9b88d03270..e0eff2a37b8a 100644 --- clang/lib/Basic/Sanitizers.cpp +++ clang/lib/Basic/Sanitizers.cpp @@ -18,6 +18,7 @@ #include "llvm/Support/MathExtras.h" #include 
"llvm/Support/raw_ostream.h" #include <algorithm> +#include <cmath> #include <optional> using namespace clang; @@ -43,6 +44,27 @@ std::optional<double> SanitizerMaskCutoffs::operator[](unsigned Kind) const { void SanitizerMaskCutoffs::clear(SanitizerMask K) { set(K, 0); } +std::optional<std::vector<unsigned>> +SanitizerMaskCutoffs::getAllScaled(unsigned ScalingFactor) const { + std::vector<unsigned> ScaledCutoffs; + + bool AnyCutoff = false; + for (unsigned int i = 0; i < SanitizerKind::SO_Count; ++i) { + auto C = (*this)[i]; + if (C.has_value()) { + ScaledCutoffs.push_back(lround(std::clamp(*C, 0.0, 1.0) * ScalingFactor)); + AnyCutoff = true; + } else { + ScaledCutoffs.push_back(0); + } + } + + if (AnyCutoff) + return ScaledCutoffs; + + return std::nullopt; +} + // Once LLVM switches to C++17, the constexpr variables can be inline and we // won't need this. #define SANITIZER(NAME, ID) constexpr SanitizerMask SanitizerKind::ID; diff --git clang/lib/Basic/Targets/AArch64.cpp clang/lib/Basic/Targets/AArch64.cpp index 0b899137bbb5..57c9849ef2a7 100644 --- clang/lib/Basic/Targets/AArch64.cpp +++ clang/lib/Basic/Targets/AArch64.cpp @@ -703,12 +703,13 @@ ArrayRef<Builtin::Info> AArch64TargetInfo::getTargetBuiltins() const { } std::optional<std::pair<unsigned, unsigned>> -AArch64TargetInfo::getVScaleRange(const LangOptions &LangOpts) const { +AArch64TargetInfo::getVScaleRange(const LangOptions &LangOpts, + bool IsArmStreamingFunction) const { if (LangOpts.VScaleMin || LangOpts.VScaleMax) return std::pair<unsigned, unsigned>( LangOpts.VScaleMin ? LangOpts.VScaleMin : 1, LangOpts.VScaleMax); - if (hasFeature("sve")) + if (hasFeature("sve") || (IsArmStreamingFunction && hasFeature("sme"))) return std::pair<unsigned, unsigned>(1, 16); return std::nullopt; diff --git clang/lib/Basic/Targets/AArch64.h clang/lib/Basic/Targets/AArch64.h index 600940f5e4e2..79e012f48e65 100644 --- clang/lib/Basic/Targets/AArch64.h +++ clang/lib/Basic/Targets/AArch64.h @@ -184,7 +184,8 @@ public: ArrayRef<Builtin::Info> getTargetBuiltins() const override; std::optional<std::pair<unsigned, unsigned>> - getVScaleRange(const LangOptions &LangOpts) const override; + getVScaleRange(const LangOptions &LangOpts, + bool IsArmStreamingFunction) const override; bool doesFeatureAffectCodeGen(StringRef Name) const override; bool validateCpuSupports(StringRef FeatureStr) const override; bool hasFeature(StringRef Feature) const override; @@ -227,6 +228,11 @@ public: bool validatePointerAuthKey(const llvm::APSInt &value) const override; const char *getBFloat16Mangling() const override { return "u6__bf16"; }; + + std::pair<unsigned, unsigned> hardwareInterferenceSizes() const override { + return std::make_pair(256, 64); + } + bool hasInt128Type() const override; bool hasBitIntType() const override { return true; } diff --git clang/lib/Basic/Targets/ARM.h clang/lib/Basic/Targets/ARM.h index fdb40c3d4191..5f4acce7af5a 100644 --- clang/lib/Basic/Targets/ARM.h +++ clang/lib/Basic/Targets/ARM.h @@ -227,7 +227,7 @@ public: const char *getBFloat16Mangling() const override { return "u6__bf16"; }; std::pair<unsigned, unsigned> hardwareInterferenceSizes() const override { - return std::make_pair(getTriple().isArch64Bit() ? 
256 : 64, 64); + return std::make_pair(64, 64); } }; diff --git clang/lib/Basic/Targets/RISCV.cpp clang/lib/Basic/Targets/RISCV.cpp index 8167d7603b0e..61b8ae9d098a 100644 --- clang/lib/Basic/Targets/RISCV.cpp +++ clang/lib/Basic/Targets/RISCV.cpp @@ -222,7 +222,7 @@ void RISCVTargetInfo::getTargetDefines(const LangOptions &Opts, // Currently we support the v1.0 RISC-V V intrinsics. Builder.defineMacro("__riscv_v_intrinsic", Twine(getVersionValue(1, 0))); - auto VScale = getVScaleRange(Opts); + auto VScale = getVScaleRange(Opts, false); if (VScale && VScale->first && VScale->first == VScale->second) Builder.defineMacro("__riscv_v_fixed_vlen", Twine(VScale->first * llvm::RISCV::RVVBitsPerBlock)); @@ -289,7 +289,8 @@ bool RISCVTargetInfo::initFeatureMap( } std::optional<std::pair<unsigned, unsigned>> -RISCVTargetInfo::getVScaleRange(const LangOptions &LangOpts) const { +RISCVTargetInfo::getVScaleRange(const LangOptions &LangOpts, + bool IsArmStreamingFunction) const { // RISCV::RVVBitsPerBlock is 64. unsigned VScaleMin = ISAInfo->getMinVLen() / llvm::RISCV::RVVBitsPerBlock; diff --git clang/lib/Basic/Targets/RISCV.h clang/lib/Basic/Targets/RISCV.h index bb3f3a5cda7c..d31c46f2bb16 100644 --- clang/lib/Basic/Targets/RISCV.h +++ clang/lib/Basic/Targets/RISCV.h @@ -99,7 +99,8 @@ public: const std::vector<std::string> &FeaturesVec) const override; std::optional<std::pair<unsigned, unsigned>> - getVScaleRange(const LangOptions &LangOpts) const override; + getVScaleRange(const LangOptions &LangOpts, + bool IsArmStreamingFunction) const override; bool hasFeature(StringRef Feature) const override; diff --git clang/lib/CIR/CMakeLists.txt clang/lib/CIR/CMakeLists.txt index f3ef8525e15c..4a99ecb33dfb 100644 --- clang/lib/CIR/CMakeLists.txt +++ clang/lib/CIR/CMakeLists.txt @@ -5,3 +5,4 @@ add_subdirectory(Dialect) add_subdirectory(CodeGen) add_subdirectory(FrontendAction) add_subdirectory(Interfaces) +add_subdirectory(Lowering) diff --git clang/lib/CIR/FrontendAction/.clang-tidy clang/lib/CIR/FrontendAction/.clang-tidy new file mode 100644 index 000000000000..cfb5bdb4bd1f --- /dev/null +++ clang/lib/CIR/FrontendAction/.clang-tidy @@ -0,0 +1,16 @@ +InheritParentConfig: true +CheckOptions: + - key: readability-identifier-naming.ClassCase + value: CamelCase + - key: readability-identifier-naming.EnumCase + value: CamelCase + - key: readability-identifier-naming.FunctionCase + value: camelBack + - key: readability-identifier-naming.MemberCase + value: CamelCase + - key: readability-identifier-naming.ParameterCase + value: CamelCase + - key: readability-identifier-naming.UnionCase + value: CamelCase + - key: readability-identifier-naming.VariableCase + value: CamelCase diff --git clang/lib/CIR/FrontendAction/CIRGenAction.cpp clang/lib/CIR/FrontendAction/CIRGenAction.cpp index 21b6bc56ed05..eab6958ac8f6 100644 --- clang/lib/CIR/FrontendAction/CIRGenAction.cpp +++ clang/lib/CIR/FrontendAction/CIRGenAction.cpp @@ -7,23 +7,47 @@ //===----------------------------------------------------------------------===// #include "clang/CIR/FrontendAction/CIRGenAction.h" -#include "clang/CIR/CIRGenerator.h" -#include "clang/Frontend/CompilerInstance.h" - #include "mlir/IR/MLIRContext.h" #include "mlir/IR/OwningOpRef.h" +#include "clang/CIR/CIRGenerator.h" +#include "clang/CIR/LowerToLLVM.h" +#include "clang/CodeGen/BackendUtil.h" +#include "clang/Frontend/CompilerInstance.h" +#include "llvm/IR/Module.h" using namespace cir; using namespace clang; namespace cir { +static BackendAction 
+getBackendActionFromOutputType(CIRGenAction::OutputType Action) { + switch (Action) { + case CIRGenAction::OutputType::EmitCIR: + assert(false && + "Unsupported output type for getBackendActionFromOutputType!"); + break; // Unreachable, but falls through to the llvm_unreachable below + case CIRGenAction::OutputType::EmitLLVM: + return BackendAction::Backend_EmitLL; + } + // We should only get here if a non-enum value is passed in or we went through + // the assert(false) case above. + llvm_unreachable("Unsupported output type!"); +} + +static std::unique_ptr<llvm::Module> +lowerFromCIRToLLVMIR(mlir::ModuleOp MLIRModule, llvm::LLVMContext &LLVMCtx) { + return direct::lowerDirectlyFromCIRToLLVMIR(MLIRModule, LLVMCtx); +} + class CIRGenConsumer : public clang::ASTConsumer { virtual void anchor(); CIRGenAction::OutputType Action; + CompilerInstance &CI; + std::unique_ptr<raw_pwrite_stream> OutputStream; ASTContext *Context{nullptr}; @@ -31,18 +55,12 @@ class CIRGenConsumer : public clang::ASTConsumer { std::unique_ptr<CIRGenerator> Gen; public: - CIRGenConsumer(CIRGenAction::OutputType Action, - DiagnosticsEngine &DiagnosticsEngine, - IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS, - const HeaderSearchOptions &HeaderSearchOptions, - const CodeGenOptions &CodeGenOptions, - const TargetOptions &TargetOptions, - const LangOptions &LangOptions, - const FrontendOptions &FEOptions, + CIRGenConsumer(CIRGenAction::OutputType Action, CompilerInstance &CI, std::unique_ptr<raw_pwrite_stream> OS) - : Action(Action), OutputStream(std::move(OS)), FS(VFS), - Gen(std::make_unique<CIRGenerator>(DiagnosticsEngine, std::move(VFS), - CodeGenOptions)) {} + : Action(Action), CI(CI), OutputStream(std::move(OS)), + FS(&CI.getVirtualFileSystem()), + Gen(std::make_unique<CIRGenerator>(CI.getDiagnostics(), std::move(FS), + CI.getCodeGenOpts())) {} void Initialize(ASTContext &Ctx) override { assert(!Context && "initialized multiple times"); @@ -66,6 +84,17 @@ public: MlirModule->print(*OutputStream, Flags); } break; + case CIRGenAction::OutputType::EmitLLVM: { + llvm::LLVMContext LLVMCtx; + std::unique_ptr<llvm::Module> LLVMModule = + lowerFromCIRToLLVMIR(MlirModule, LLVMCtx); + + BackendAction BEAction = getBackendActionFromOutputType(Action); + emitBackendOutput( + CI, CI.getCodeGenOpts(), C.getTargetInfo().getDataLayoutString(), + LLVMModule.get(), BEAction, FS, std::move(OutputStream)); + break; + } } } }; @@ -84,6 +113,8 @@ getOutputStream(CompilerInstance &CI, StringRef InFile, switch (Action) { case CIRGenAction::OutputType::EmitCIR: return CI.createDefaultOutputFile(false, InFile, "cir"); + case CIRGenAction::OutputType::EmitLLVM: + return CI.createDefaultOutputFile(false, InFile, "ll"); } llvm_unreachable("Invalid CIRGenAction::OutputType"); } @@ -95,10 +126,8 @@ CIRGenAction::CreateASTConsumer(CompilerInstance &CI, StringRef InFile) { if (!Out) Out = getOutputStream(CI, InFile, Action); - auto Result = std::make_unique<cir::CIRGenConsumer>( - Action, CI.getDiagnostics(), &CI.getVirtualFileSystem(), - CI.getHeaderSearchOpts(), CI.getCodeGenOpts(), CI.getTargetOpts(), - CI.getLangOpts(), CI.getFrontendOpts(), std::move(Out)); + auto Result = + std::make_unique<cir::CIRGenConsumer>(Action, CI, std::move(Out)); return Result; } @@ -106,3 +135,7 @@ CIRGenAction::CreateASTConsumer(CompilerInstance &CI, StringRef InFile) { void EmitCIRAction::anchor() {} EmitCIRAction::EmitCIRAction(mlir::MLIRContext *MLIRCtx) : CIRGenAction(OutputType::EmitCIR, MLIRCtx) {} + +void EmitLLVMAction::anchor() {}
+EmitLLVMAction::EmitLLVMAction(mlir::MLIRContext *MLIRCtx) + : CIRGenAction(OutputType::EmitLLVM, MLIRCtx) {} diff --git clang/lib/CIR/FrontendAction/CMakeLists.txt clang/lib/CIR/FrontendAction/CMakeLists.txt index b0616ab5d64b..ac2b857239d0 100644 --- clang/lib/CIR/FrontendAction/CMakeLists.txt +++ clang/lib/CIR/FrontendAction/CMakeLists.txt @@ -12,6 +12,8 @@ add_clang_library(clangCIRFrontendAction clangAST clangFrontend clangCIR + clangCIRLoweringDirectToLLVM + clangCodeGen MLIRCIR MLIRIR ) diff --git clang/lib/CIR/Lowering/CMakeLists.txt clang/lib/CIR/Lowering/CMakeLists.txt new file mode 100644 index 000000000000..95c304ded918 --- /dev/null +++ clang/lib/CIR/Lowering/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(DirectToLLVM) diff --git clang/lib/CIR/Lowering/DirectToLLVM/CMakeLists.txt clang/lib/CIR/Lowering/DirectToLLVM/CMakeLists.txt new file mode 100644 index 000000000000..3f74c79249a2 --- /dev/null +++ clang/lib/CIR/Lowering/DirectToLLVM/CMakeLists.txt @@ -0,0 +1,17 @@ +set(LLVM_LINK_COMPONENTS + Core + Support + ) + +get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) + +add_clang_library(clangCIRLoweringDirectToLLVM + LowerToLLVM.cpp + + LINK_LIBS + MLIRIR + ${dialect_libs} + MLIRCIR + MLIRBuiltinToLLVMIRTranslation + MLIRLLVMToLLVMIRTranslation + ) diff --git clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp new file mode 100644 index 000000000000..af8ca7d0b89e --- /dev/null +++ clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp @@ -0,0 +1,201 @@ +//====- LowerToLLVM.cpp - Lowering from CIR to LLVMIR ---------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements lowering of CIR operations to LLVMIR. +// +//===----------------------------------------------------------------------===// + +#include "LowerToLLVM.h" + +#include "mlir/Conversion/LLVMCommon/TypeConverter.h" +#include "mlir/Dialect/DLTI/DLTI.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/IR/BuiltinDialect.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Pass/PassManager.h" +#include "mlir/Target/LLVMIR/Dialect/Builtin/BuiltinToLLVMIRTranslation.h" +#include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h" +#include "mlir/Target/LLVMIR/Export.h" +#include "mlir/Transforms/DialectConversion.h" +#include "clang/CIR/Dialect/IR/CIRDialect.h" +#include "clang/CIR/MissingFeatures.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/TimeProfiler.h" + +using namespace cir; +using namespace llvm; + +namespace cir { +namespace direct { + +// This pass requires the CIR to be in a "flat" state. All blocks in each +// function must belong to the parent region. Once scopes and control flow +// are implemented in CIR, a pass will be run before this one to flatten +// the CIR and get it into the state that this pass requires. 
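Conceptually, the GlobalOp pattern defined below performs a rewrite like the following (a hand-written sketch; the exact CIR assembly syntax is an assumption, not taken from the patch):

//   cir.global external @x = #cir.int<3> : !s32i
// becomes the LLVM-dialect form
//   llvm.mlir.global external @x(3 : i32) : i32
// with constness, alignment, TLS and linkage pinned to defaults until the
// corresponding attributes exist on cir.global (see the FIXME below).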
+struct ConvertCIRToLLVMPass
+    : public mlir::PassWrapper<ConvertCIRToLLVMPass,
+                               mlir::OperationPass<mlir::ModuleOp>> {
+  void getDependentDialects(mlir::DialectRegistry &registry) const override {
+    registry.insert<mlir::BuiltinDialect, mlir::DLTIDialect,
+                    mlir::LLVM::LLVMDialect, mlir::func::FuncDialect>();
+  }
+  void runOnOperation() final;
+
+  StringRef getDescription() const override {
+    return "Convert the prepared CIR dialect module to LLVM dialect";
+  }
+
+  StringRef getArgument() const override { return "cir-flat-to-llvm"; }
+};
+
+mlir::LogicalResult CIRToLLVMGlobalOpLowering::matchAndRewrite(
+    cir::GlobalOp op, OpAdaptor adaptor,
+    mlir::ConversionPatternRewriter &rewriter) const {
+
+  // Fetch required values to create LLVM op.
+  const mlir::Type cirSymType = op.getSymType();
+
+  // This is the LLVM dialect type.
+  const mlir::Type llvmType = getTypeConverter()->convertType(cirSymType);
+  // FIXME: These default values are placeholders until the equivalent
+  // attributes are available on cir.global ops.
+  assert(!cir::MissingFeatures::opGlobalConstant());
+  const bool isConst = false;
+  assert(!cir::MissingFeatures::addressSpace());
+  const unsigned addrSpace = 0;
+  assert(!cir::MissingFeatures::opGlobalDSOLocal());
+  const bool isDsoLocal = true;
+  assert(!cir::MissingFeatures::opGlobalThreadLocal());
+  const bool isThreadLocal = false;
+  assert(!cir::MissingFeatures::opGlobalAlignment());
+  const uint64_t alignment = 0;
+  assert(!cir::MissingFeatures::opGlobalLinkage());
+  const mlir::LLVM::Linkage linkage = mlir::LLVM::Linkage::External;
+  const StringRef symbol = op.getSymName();
+  std::optional<mlir::Attribute> init = op.getInitialValue();
+
+  SmallVector<mlir::NamedAttribute> attributes;
+
+  if (init.has_value()) {
+    if (const auto fltAttr = mlir::dyn_cast<cir::FPAttr>(init.value())) {
+      // Initializer is a constant floating-point number: convert to MLIR
+      // builtin constant.
+      init = rewriter.getFloatAttr(llvmType, fltAttr.getValue());
+    } else if (const auto intAttr =
+                   mlir::dyn_cast<cir::IntAttr>(init.value())) {
+      // Initializer is a constant integer: convert it to a compatible LLVM
+      // initializer.
+      init = rewriter.getIntegerAttr(llvmType, intAttr.getValue());
+    } else {
+      op.emitError() << "unsupported initializer '" << init.value() << "'";
+      return mlir::failure();
+    }
+  }
+
+  // Rewrite op.
+  rewriter.replaceOpWithNewOp<mlir::LLVM::GlobalOp>(
+      op, llvmType, isConst, linkage, symbol, init.value_or(mlir::Attribute()),
+      alignment, addrSpace, isDsoLocal, isThreadLocal,
+      /*comdat=*/mlir::SymbolRefAttr(), attributes);
+
+  return mlir::success();
+}
+
+static void prepareTypeConverter(mlir::LLVMTypeConverter &converter,
+                                 mlir::DataLayout &dataLayout) {
+  converter.addConversion([&](cir::IntType type) -> mlir::Type {
+    // LLVM doesn't work with signed types, so we drop the CIR signs here.
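+    // For example, the signed and unsigned 32-bit CIR integer types both
+    // lower to the signless builtin i32.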
+    return mlir::IntegerType::get(type.getContext(), type.getWidth());
+  });
+  converter.addConversion([&](cir::SingleType type) -> mlir::Type {
+    return mlir::Float32Type::get(type.getContext());
+  });
+  converter.addConversion([&](cir::DoubleType type) -> mlir::Type {
+    return mlir::Float64Type::get(type.getContext());
+  });
+  converter.addConversion([&](cir::FP80Type type) -> mlir::Type {
+    return mlir::Float80Type::get(type.getContext());
+  });
+  converter.addConversion([&](cir::FP128Type type) -> mlir::Type {
+    return mlir::Float128Type::get(type.getContext());
+  });
+  converter.addConversion([&](cir::LongDoubleType type) -> mlir::Type {
+    return converter.convertType(type.getUnderlying());
+  });
+  converter.addConversion([&](cir::FP16Type type) -> mlir::Type {
+    return mlir::Float16Type::get(type.getContext());
+  });
+  converter.addConversion([&](cir::BF16Type type) -> mlir::Type {
+    return mlir::BFloat16Type::get(type.getContext());
+  });
+}
+
+void ConvertCIRToLLVMPass::runOnOperation() {
+  llvm::TimeTraceScope scope("Convert CIR to LLVM Pass");
+
+  mlir::ModuleOp module = getOperation();
+  mlir::DataLayout dl(module);
+  mlir::LLVMTypeConverter converter(&getContext());
+  prepareTypeConverter(converter, dl);
+
+  mlir::RewritePatternSet patterns(&getContext());
+
+  patterns.add<CIRToLLVMGlobalOpLowering>(converter, patterns.getContext(), dl);
+
+  mlir::ConversionTarget target(getContext());
+  target.addLegalOp<mlir::ModuleOp>();
+  target.addLegalDialect<mlir::LLVM::LLVMDialect>();
+  target.addIllegalDialect<mlir::BuiltinDialect, cir::CIRDialect,
+                           mlir::func::FuncDialect>();
+
+  if (failed(applyPartialConversion(module, target, std::move(patterns))))
+    signalPassFailure();
+}
+
+static std::unique_ptr<mlir::Pass> createConvertCIRToLLVMPass() {
+  return std::make_unique<ConvertCIRToLLVMPass>();
+}
+
+static void populateCIRToLLVMPasses(mlir::OpPassManager &pm) {
+  pm.addPass(createConvertCIRToLLVMPass());
+}
+
+std::unique_ptr<llvm::Module>
+lowerDirectlyFromCIRToLLVMIR(mlir::ModuleOp mlirModule, LLVMContext &llvmCtx) {
+  llvm::TimeTraceScope scope("lower from CIR to LLVM directly");
+
+  mlir::MLIRContext *mlirCtx = mlirModule.getContext();
+
+  mlir::PassManager pm(mlirCtx);
+  populateCIRToLLVMPasses(pm);
+
+  if (mlir::failed(pm.run(mlirModule))) {
+    // FIXME: Handle any errors where they occur and return a nullptr here.
+    report_fatal_error(
+        "The pass manager failed to lower CIR to the LLVMIR dialect!");
+  }
+
+  mlir::registerBuiltinDialectTranslation(*mlirCtx);
+  mlir::registerLLVMDialectTranslation(*mlirCtx);
+
+  llvm::TimeTraceScope translateScope("translateModuleToLLVMIR");
+
+  StringRef moduleName = mlirModule.getName().value_or("CIRToLLVMModule");
+  std::unique_ptr<llvm::Module> llvmModule =
+      mlir::translateModuleToLLVMIR(mlirModule, llvmCtx, moduleName);
+
+  if (!llvmModule) {
+    // FIXME: Handle any errors where they occur and return a nullptr here.
+    report_fatal_error("Lowering from the LLVMIR dialect to LLVM IR failed!");
+  }
+
+  return llvmModule;
+}
+} // namespace direct
+} // namespace cir
diff --git clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h
new file mode 100644
index 000000000000..6167ff39b5ad
--- /dev/null
+++ clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h
@@ -0,0 +1,42 @@
+//====- LowerToLLVM.h - Lowering from CIR to LLVM -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares an interface for converting CIR modules to LLVM IR. +// +//===----------------------------------------------------------------------===// +#ifndef CLANG_CIR_LOWERTOLLVM_H +#define CLANG_CIR_LOWERTOLLVM_H + +#include "mlir/Transforms/DialectConversion.h" +#include "clang/CIR/Dialect/IR/CIRDialect.h" + +namespace cir { + +namespace direct { + +class CIRToLLVMGlobalOpLowering + : public mlir::OpConversionPattern<cir::GlobalOp> { + const mlir::DataLayout &dataLayout; + +public: + CIRToLLVMGlobalOpLowering(const mlir::TypeConverter &typeConverter, + mlir::MLIRContext *context, + const mlir::DataLayout &dataLayout) + : OpConversionPattern(typeConverter, context), dataLayout(dataLayout) { + setHasBoundedRewriteRecursion(); + } + + mlir::LogicalResult + matchAndRewrite(cir::GlobalOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const override; +}; + +} // namespace direct +} // namespace cir + +#endif // CLANG_CIR_LOWERTOLLVM_H diff --git clang/lib/CodeGen/BackendUtil.cpp clang/lib/CodeGen/BackendUtil.cpp index 3e65eeb3755d..97e9bbccd61e 100644 --- clang/lib/CodeGen/BackendUtil.cpp +++ clang/lib/CodeGen/BackendUtil.cpp @@ -795,14 +795,23 @@ static void addSanitizers(const Triple &TargetTriple, PB.registerOptimizerLastEPCallback(SanitizersCallback); } - if (LowerAllowCheckPass::IsRequested()) { + // SanitizeSkipHotCutoffs: doubles with range [0, 1] + // Opts.cutoffs: unsigned ints with range [0, 1000000] + auto ScaledCutoffs = CodeGenOpts.SanitizeSkipHotCutoffs.getAllScaled(1000000); + + // TODO: remove IsRequested() + if (LowerAllowCheckPass::IsRequested() || ScaledCutoffs.has_value()) { // We want to call it after inline, which is about OptimizerEarlyEPCallback. - PB.registerOptimizerEarlyEPCallback([&](ModulePassManager &MPM, - OptimizationLevel Level, - ThinOrFullLTOPhase Phase) { - LowerAllowCheckPass::Options Opts; - MPM.addPass(createModuleToFunctionPassAdaptor(LowerAllowCheckPass(Opts))); - }); + PB.registerOptimizerEarlyEPCallback( + [ScaledCutoffs](ModulePassManager &MPM, OptimizationLevel Level, + ThinOrFullLTOPhase Phase) { + LowerAllowCheckPass::Options Opts; + // TODO: after removing IsRequested(), make this unconditional + if (ScaledCutoffs.has_value()) + Opts.cutoffs = ScaledCutoffs.value(); + MPM.addPass( + createModuleToFunctionPassAdaptor(LowerAllowCheckPass(Opts))); + }); } } diff --git clang/lib/CodeGen/CGBuiltin.cpp clang/lib/CodeGen/CGBuiltin.cpp index 7ec9d59bfed5..4d3d9e9897c1 100644 --- clang/lib/CodeGen/CGBuiltin.cpp +++ clang/lib/CodeGen/CGBuiltin.cpp @@ -29,6 +29,7 @@ #include "clang/AST/Expr.h" #include "clang/AST/OSLog.h" #include "clang/AST/OperationKinds.h" +#include "clang/AST/StmtVisitor.h" #include "clang/AST/Type.h" #include "clang/Basic/TargetBuiltins.h" #include "clang/Basic/TargetInfo.h" @@ -1049,33 +1050,100 @@ CodeGenFunction::evaluateOrEmitBuiltinObjectSize(const Expr *E, unsigned Type, return ConstantInt::get(ResType, ObjectSize, /*isSigned=*/true); } -const FieldDecl *CodeGenFunction::FindFlexibleArrayMemberFieldAndOffset( - ASTContext &Ctx, const RecordDecl *RD, const FieldDecl *FAMDecl, - uint64_t &Offset) { +namespace { + +/// StructFieldAccess is a simple visitor class to grab the first MemberExpr +/// from an Expr. It records any ArraySubscriptExpr we meet along the way. 
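+/// For example, given '&ptr->fam[idx]', Visit returns the MemberExpr for
+/// 'fam' and records the ArraySubscriptExpr for '[idx]' in ASE.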
+class StructFieldAccess + : public ConstStmtVisitor<StructFieldAccess, const MemberExpr *> { + bool AddrOfSeen = false; + +public: + const ArraySubscriptExpr *ASE = nullptr; + + const MemberExpr *VisitMemberExpr(const MemberExpr *E) { + if (AddrOfSeen && E->getType()->isArrayType()) + // Avoid forms like '&ptr->array'. + return nullptr; + return E; + } + + const MemberExpr *VisitArraySubscriptExpr(const ArraySubscriptExpr *E) { + if (ASE) + // We don't support multiple subscripts. + return nullptr; + + AddrOfSeen = false; // '&ptr->array[idx]' is okay. + ASE = E; + return Visit(E->getBase()); + } + const MemberExpr *VisitCastExpr(const CastExpr *E) { + return Visit(E->getSubExpr()); + } + const MemberExpr *VisitParenExpr(const ParenExpr *E) { + return Visit(E->getSubExpr()); + } + const MemberExpr *VisitUnaryAddrOf(const clang::UnaryOperator *E) { + AddrOfSeen = true; + return Visit(E->getSubExpr()); + } + const MemberExpr *VisitUnaryDeref(const clang::UnaryOperator *E) { + AddrOfSeen = false; + return Visit(E->getSubExpr()); + } +}; + +} // end anonymous namespace + +/// Find a struct's flexible array member. It may be embedded inside multiple +/// sub-structs, but must still be the last field. +static const FieldDecl *FindFlexibleArrayMemberField(CodeGenFunction &CGF, + ASTContext &Ctx, + const RecordDecl *RD) { const LangOptions::StrictFlexArraysLevelKind StrictFlexArraysLevel = - getLangOpts().getStrictFlexArraysLevel(); - uint32_t FieldNo = 0; + CGF.getLangOpts().getStrictFlexArraysLevel(); if (RD->isImplicit()) return nullptr; for (const FieldDecl *FD : RD->fields()) { - if ((!FAMDecl || FD == FAMDecl) && - Decl::isFlexibleArrayMemberLike( + if (Decl::isFlexibleArrayMemberLike( Ctx, FD, FD->getType(), StrictFlexArraysLevel, - /*IgnoreTemplateOrMacroSubstitution=*/true)) { - const ASTRecordLayout &Layout = Ctx.getASTRecordLayout(RD); - Offset += Layout.getFieldOffset(FieldNo); + /*IgnoreTemplateOrMacroSubstitution=*/true)) return FD; + + if (auto RT = FD->getType()->getAs<RecordType>()) + if (const FieldDecl *FD = + FindFlexibleArrayMemberField(CGF, Ctx, RT->getAsRecordDecl())) + return FD; + } + + return nullptr; +} + +/// Calculate the offset of a struct field. It may be embedded inside multiple +/// sub-structs. +static bool GetFieldOffset(ASTContext &Ctx, const RecordDecl *RD, + const FieldDecl *FD, int64_t &Offset) { + if (RD->isImplicit()) + return false; + + // Keep track of the field number ourselves, because the other methods + // (CGRecordLayout::getLLVMFieldNo) aren't always equivalent to how the AST + // is laid out. 
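+  // For example, given 'struct S { struct T { char c; int x; } t; };', the
+  // offset of 'x' accumulates as offsetof(T, x) from the recursive call plus
+  // offsetof(S, t) from this level.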
+  uint32_t FieldNo = 0;
+  const ASTRecordLayout &Layout = Ctx.getASTRecordLayout(RD);
+
+  for (const FieldDecl *Field : RD->fields()) {
+    if (Field == FD) {
+      Offset += Layout.getFieldOffset(FieldNo);
+      return true;
     }
 
-    QualType Ty = FD->getType();
-    if (Ty->isRecordType()) {
-      if (const FieldDecl *Field = FindFlexibleArrayMemberFieldAndOffset(
-              Ctx, Ty->getAsRecordDecl(), FAMDecl, Offset)) {
-        const ASTRecordLayout &Layout = Ctx.getASTRecordLayout(RD);
+    if (auto RT = Field->getType()->getAs<RecordType>()) {
+      if (GetFieldOffset(Ctx, RT->getAsRecordDecl(), FD, Offset)) {
         Offset += Layout.getFieldOffset(FieldNo);
-        return Field;
+        return true;
       }
     }
 
@@ -1083,202 +1151,255 @@ const FieldDecl *CodeGenFunction::FindFlexibleArrayMemberFieldAndOffset(
     ++FieldNo;
   }
 
-  return nullptr;
+  return false;
 }
 
-static unsigned CountCountedByAttrs(const RecordDecl *RD) {
-  unsigned Num = 0;
-
-  for (const FieldDecl *FD : RD->fields()) {
-    if (FD->getType()->isCountAttributedType())
-      return ++Num;
+static std::optional<int64_t>
+GetFieldOffset(ASTContext &Ctx, const RecordDecl *RD, const FieldDecl *FD) {
+  int64_t Offset = 0;
 
-    QualType Ty = FD->getType();
-    if (Ty->isRecordType())
-      Num += CountCountedByAttrs(Ty->getAsRecordDecl());
-  }
+  if (GetFieldOffset(Ctx, RD, FD, Offset))
+    return std::optional<int64_t>(Offset);
 
-  return Num;
+  return std::nullopt;
 }
 
 llvm::Value *
-CodeGenFunction::emitFlexibleArrayMemberSize(const Expr *E, unsigned Type,
-                                             llvm::IntegerType *ResType) {
-  // The code generated here calculates the size of a struct with a flexible
-  // array member that uses the counted_by attribute. There are two instances
-  // we handle:
-  //
-  // struct s {
-  //   unsigned long flags;
-  //   int count;
-  //   int array[] __attribute__((counted_by(count)));
-  // }
-  //
-  // 1) bdos of the flexible array itself:
-  //
-  //     __builtin_dynamic_object_size(p->array, 1) ==
-  //         p->count * sizeof(*p->array)
-  //
-  // 2) bdos of a pointer into the flexible array:
-  //
-  //     __builtin_dynamic_object_size(&p->array[42], 1) ==
-  //         (p->count - 42) * sizeof(*p->array)
+CodeGenFunction::emitCountedByMemberSize(const Expr *E, llvm::Value *EmittedE,
+                                         unsigned Type,
+                                         llvm::IntegerType *ResType) {
+  ASTContext &Ctx = getContext();
+
+  // Note: if the whole struct is specified in the __bdos (i.e. the Visitor
+  // returns a DeclRefExpr), the calculation of the whole size of the
+  // structure with a flexible array member can be done in two ways:
   //
-  // 2) bdos of the whole struct, including the flexible array:
+  //  1) sizeof(struct S) + count * sizeof(typeof(fam))
+  //  2) offsetof(struct S, fam) + count * sizeof(typeof(fam))
   //
-  //     __builtin_dynamic_object_size(p, 1) ==
-  //        max(sizeof(struct s),
-  //            offsetof(struct s, array) + p->count * sizeof(*p->array))
+  // The first will add additional padding after the end of the array
+  // allocation, while the second method is more precise but not quite what
+  // programmers expect. See
+  // https://lore.kernel.org/lkml/ZvV6X5FPBBW7CO1f@archlinux/ for a discussion
+  // of the topic.
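+  // For example, given
+  //   'struct S { int count; short fam[] __counted_by(count); }'
+  // with count == 4, (1) yields sizeof(struct S) + 4 * sizeof(short), while
+  // (2) yields offsetof(struct S, fam) + 4 * sizeof(short).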
// - ASTContext &Ctx = getContext(); - const Expr *Base = E->IgnoreParenImpCasts(); - const Expr *Idx = nullptr; - - if (const auto *UO = dyn_cast<UnaryOperator>(Base); - UO && UO->getOpcode() == UO_AddrOf) { - Expr *SubExpr = UO->getSubExpr()->IgnoreParenImpCasts(); - if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(SubExpr)) { - Base = ASE->getBase()->IgnoreParenImpCasts(); - Idx = ASE->getIdx()->IgnoreParenImpCasts(); - - if (const auto *IL = dyn_cast<IntegerLiteral>(Idx)) { - int64_t Val = IL->getValue().getSExtValue(); - if (Val < 0) - return getDefaultBuiltinObjectSizeResult(Type, ResType); - - if (Val == 0) - // The index is 0, so we don't need to take it into account. - Idx = nullptr; - } - } else { - // Potential pointer to another element in the struct. - Base = SubExpr; - } - } + // GCC isn't (currently) able to calculate __bdos on a pointer to the whole + // structure. Therefore, because of the above issue, we choose to match what + // GCC does for consistency's sake. - // Get the flexible array member Decl. - const RecordDecl *OuterRD = nullptr; - const FieldDecl *FAMDecl = nullptr; - if (const auto *ME = dyn_cast<MemberExpr>(Base)) { - // Check if \p Base is referencing the FAM itself. - const ValueDecl *VD = ME->getMemberDecl(); - OuterRD = VD->getDeclContext()->getOuterLexicalRecordContext(); - FAMDecl = dyn_cast<FieldDecl>(VD); - if (!FAMDecl) - return nullptr; - } else if (const auto *DRE = dyn_cast<DeclRefExpr>(Base)) { - // Check if we're pointing to the whole struct. - QualType Ty = DRE->getDecl()->getType(); - if (Ty->isPointerType()) - Ty = Ty->getPointeeType(); - OuterRD = Ty->getAsRecordDecl(); - - // If we have a situation like this: - // - // struct union_of_fams { - // int flags; - // union { - // signed char normal_field; - // struct { - // int count1; - // int arr1[] __counted_by(count1); - // }; - // struct { - // signed char count2; - // int arr2[] __counted_by(count2); - // }; - // }; - // }; - // - // We don't know which 'count' to use in this scenario: - // - // size_t get_size(struct union_of_fams *p) { - // return __builtin_dynamic_object_size(p, 1); - // } - // - // Instead of calculating a wrong number, we give up. - if (OuterRD && CountCountedByAttrs(OuterRD) > 1) - return nullptr; - } + StructFieldAccess Visitor; + const MemberExpr *ME = Visitor.Visit(E); + if (!ME) + return nullptr; - if (!OuterRD) + const auto *FD = dyn_cast<FieldDecl>(ME->getMemberDecl()); + if (!FD) return nullptr; - // We call FindFlexibleArrayMemberAndOffset even if FAMDecl is non-null to - // get its offset. - uint64_t Offset = 0; - FAMDecl = - FindFlexibleArrayMemberFieldAndOffset(Ctx, OuterRD, FAMDecl, Offset); - Offset = Ctx.toCharUnitsFromBits(Offset).getQuantity(); + const RecordDecl *RD = FD->getDeclContext()->getOuterLexicalRecordContext(); + const FieldDecl *FlexibleArrayMemberFD = nullptr; - if (!FAMDecl || !FAMDecl->getType()->isCountAttributedType()) - // No flexible array member found or it doesn't have the "counted_by" - // attribute. - return nullptr; + if (Decl::isFlexibleArrayMemberLike( + Ctx, FD, FD->getType(), getLangOpts().getStrictFlexArraysLevel(), + /*IgnoreTemplateOrMacroSubstitution=*/true)) + FlexibleArrayMemberFD = FD; + else + FlexibleArrayMemberFD = FindFlexibleArrayMemberField(*this, Ctx, RD); - const FieldDecl *CountedByFD = FAMDecl->findCountedByField(); - if (!CountedByFD) - // Can't find the field referenced by the "counted_by" attribute. 
+ if (!FlexibleArrayMemberFD || + !FlexibleArrayMemberFD->getType()->isCountAttributedType()) return nullptr; - if (isa<DeclRefExpr>(Base)) - // The whole struct is specificed in the __bdos. The calculation of the - // whole size of the structure can be done in two ways: - // - // 1) sizeof(struct S) + count * sizeof(typeof(fam)) - // 2) offsetof(struct S, fam) + count * sizeof(typeof(fam)) - // - // The first will add additional padding after the end of the array, - // allocation while the second method is more precise, but not quite - // expected from programmers. See - // https://lore.kernel.org/lkml/ZvV6X5FPBBW7CO1f@archlinux/ for a - // discussion of the topic. - // - // GCC isn't (currently) able to calculate __bdos on a pointer to the whole - // structure. Therefore, because of the above issue, we'll choose to match - // what GCC does for consistency's sake. + const FieldDecl *CountFD = FlexibleArrayMemberFD->findCountedByField(); + if (!CountFD) + // Can't find the field referenced by the "counted_by" attribute. return nullptr; - // Build a load of the counted_by field. - bool IsSigned = CountedByFD->getType()->isSignedIntegerType(); - Value *CountedByInst = EmitLoadOfCountedByField(Base, FAMDecl, CountedByFD); - if (!CountedByInst) - return getDefaultBuiltinObjectSizeResult(Type, ResType); - - CountedByInst = Builder.CreateIntCast(CountedByInst, ResType, IsSigned); + const Expr *Idx = nullptr; + if (Visitor.ASE) { + Idx = Visitor.ASE->getIdx(); - // Build a load of the index and subtract it from the count. - Value *IdxInst = nullptr; - if (Idx) { - if (Idx->HasSideEffects(getContext())) + if (Idx->HasSideEffects(Ctx)) // We can't have side-effects. return getDefaultBuiltinObjectSizeResult(Type, ResType); + if (const auto *IL = dyn_cast<IntegerLiteral>(Idx)) { + int64_t Val = IL->getValue().getSExtValue(); + if (Val < 0) + return getDefaultBuiltinObjectSizeResult(Type, ResType); + + // The index is 0, so we don't need to take it into account. + if (Val == 0) + Idx = nullptr; + } + } + + // Calculate the flexible array member's object size using these formulae + // (note: if the calculation is negative, we return 0.): + // + // struct p; + // struct s { + // /* ... 
*/
+  //   int count;
+  //   struct p *array[] __attribute__((counted_by(count)));
+  // };
+  //
+  // 1) 'ptr->array':
+  //
+  //    count = ptr->count;
+  //
+  //    flexible_array_member_base_size = sizeof (*ptr->array);
+  //    flexible_array_member_size =
+  //        count * flexible_array_member_base_size;
+  //
+  //    if (flexible_array_member_size < 0)
+  //      return 0;
+  //    return flexible_array_member_size;
+  //
+  // 2) '&ptr->array[idx]':
+  //
+  //    count = ptr->count;
+  //    index = idx;
+  //
+  //    flexible_array_member_base_size = sizeof (*ptr->array);
+  //    flexible_array_member_size =
+  //        count * flexible_array_member_base_size;
+  //
+  //    index_size = index * flexible_array_member_base_size;
+  //
+  //    if (flexible_array_member_size < 0 || index < 0)
+  //      return 0;
+  //    return flexible_array_member_size - index_size;
+  //
+  // 3) '&ptr->field':
+  //
+  //    count = ptr->count;
+  //    sizeof_struct = sizeof (struct s);
+  //
+  //    flexible_array_member_base_size = sizeof (*ptr->array);
+  //    flexible_array_member_size =
+  //        count * flexible_array_member_base_size;
+  //
+  //    field_offset = offsetof (struct s, field);
+  //    offset_diff = sizeof_struct - field_offset;
+  //
+  //    if (flexible_array_member_size < 0)
+  //      return 0;
+  //    return offset_diff + flexible_array_member_size;
+  //
+  // 4) '&ptr->field_array[idx]':
+  //
+  //    count = ptr->count;
+  //    index = idx;
+  //    sizeof_struct = sizeof (struct s);
+  //
+  //    flexible_array_member_base_size = sizeof (*ptr->array);
+  //    flexible_array_member_size =
+  //        count * flexible_array_member_base_size;
+  //
+  //    field_base_size = sizeof (*ptr->field_array);
+  //    field_offset = offsetof (struct s, field);
+  //    field_offset += index * field_base_size;
+  //
+  //    offset_diff = sizeof_struct - field_offset;
+  //
+  //    if (flexible_array_member_size < 0 || index < 0)
+  //      return 0;
+  //    return offset_diff + flexible_array_member_size;
+
+  QualType CountTy = CountFD->getType();
+  bool IsSigned = CountTy->isSignedIntegerType();
+
+  QualType FlexibleArrayMemberTy = FlexibleArrayMemberFD->getType();
+  QualType FieldTy = FD->getType();
+
+  // Explicit cast, because otherwise CharWidth would promote an i32 into a
+  // u64, leading to overflows.
+  int64_t CharWidth = static_cast<int64_t>(CGM.getContext().getCharWidth());
+
+  // size_t field_offset = offsetof (struct s, field);
+  Value *FieldOffset = nullptr;
+  if (FlexibleArrayMemberFD != FD) {
+    std::optional<int64_t> Offset = GetFieldOffset(Ctx, RD, FD);
+    if (!Offset)
+      return nullptr;
+    FieldOffset =
+        llvm::ConstantInt::get(ResType, *Offset / CharWidth, IsSigned);
+  }
+
+  // size_t count = (size_t) ptr->count;
+  Value *Count = EmitLoadOfCountedByField(ME, FlexibleArrayMemberFD, CountFD);
+  if (!Count)
+    return nullptr;
+  Count = Builder.CreateIntCast(Count, ResType, IsSigned, "count");
+
+  // size_t index = (size_t) idx;
+  Value *Index = nullptr;
+  if (Idx) {
     bool IdxSigned = Idx->getType()->isSignedIntegerType();
-    IdxInst = EmitAnyExprToTemp(Idx).getScalarVal();
-    IdxInst = Builder.CreateIntCast(IdxInst, ResType, IdxSigned);
-
-    // We go ahead with the calculation here. If the index turns out to be
-    // negative, we'll catch it at the end.
-    CountedByInst =
-        Builder.CreateSub(CountedByInst, IdxInst, "", !IsSigned, IsSigned);
-  }
-
-  // Calculate how large the flexible array member is in bytes.
-  const ArrayType *ArrayTy = Ctx.getAsArrayType(FAMDecl->getType());
-  CharUnits Size = Ctx.getTypeSizeInChars(ArrayTy->getElementType());
-  llvm::Constant *ElemSize =
-      llvm::ConstantInt::get(ResType, Size.getQuantity(), IsSigned);
-  Value *Res =
-      Builder.CreateMul(CountedByInst, ElemSize, "", !IsSigned, IsSigned);
-  Res = Builder.CreateIntCast(Res, ResType, IsSigned);
-
-  // A negative \p IdxInst or \p CountedByInst means that the index lands
-  // outside of the flexible array member. If that's the case, we want to
-  // return 0.
-  Value *Cmp = Builder.CreateIsNotNeg(CountedByInst);
-  if (IdxInst)
-    Cmp = Builder.CreateAnd(Builder.CreateIsNotNeg(IdxInst), Cmp);
+    Index = EmitScalarExpr(Idx);
+    Index = Builder.CreateIntCast(Index, ResType, IdxSigned, "index");
+  }
+
+  // size_t flexible_array_member_base_size = sizeof (*ptr->array);
+  const ArrayType *ArrayTy = Ctx.getAsArrayType(FlexibleArrayMemberTy);
+  CharUnits BaseSize = Ctx.getTypeSizeInChars(ArrayTy->getElementType());
+  auto *FlexibleArrayMemberBaseSize =
+      llvm::ConstantInt::get(ResType, BaseSize.getQuantity(), IsSigned);
+
+  // size_t flexible_array_member_size =
+  //     count * flexible_array_member_base_size;
+  Value *FlexibleArrayMemberSize =
+      Builder.CreateMul(Count, FlexibleArrayMemberBaseSize,
+                        "flexible_array_member_size", !IsSigned, IsSigned);
+
+  Value *Res = nullptr;
+  if (FlexibleArrayMemberFD == FD) {
+    if (Idx) { // Option (2) '&ptr->array[idx]'
+      // size_t index_size = index * flexible_array_member_base_size;
+      Value *IndexSize = Builder.CreateMul(FlexibleArrayMemberBaseSize, Index,
+                                           "index_size", !IsSigned, IsSigned);
+
+      // return flexible_array_member_size - index_size;
+      Res = Builder.CreateSub(FlexibleArrayMemberSize, IndexSize, "result",
+                              !IsSigned, IsSigned);
+    } else { // Option (1) 'ptr->array'
+      // return flexible_array_member_size;
+      Res = FlexibleArrayMemberSize;
+    }
+  } else {
+    // size_t sizeof_struct = sizeof (struct s);
+    llvm::StructType *StructTy = getTypes().getCGRecordLayout(RD).getLLVMType();
+    const llvm::DataLayout &Layout = CGM.getDataLayout();
+    TypeSize Size = Layout.getTypeSizeInBits(StructTy);
+    Value *SizeofStruct =
+        llvm::ConstantInt::get(ResType, Size.getKnownMinValue() / CharWidth);
+
+    if (Idx) { // Option (4) '&ptr->field_array[idx]'
+      // size_t field_base_size = sizeof (*ptr->field_array);
+      const ArrayType *ArrayTy = Ctx.getAsArrayType(FieldTy);
+      CharUnits BaseSize = Ctx.getTypeSizeInChars(ArrayTy->getElementType());
+      auto *FieldBaseSize =
+          llvm::ConstantInt::get(ResType, BaseSize.getQuantity(), IsSigned);
+
+      // field_offset += index * field_base_size;
+      Value *Mul = Builder.CreateMul(Index, FieldBaseSize, "field_offset",
+                                     !IsSigned, IsSigned);
+      FieldOffset = Builder.CreateAdd(FieldOffset, Mul);
+    }
+    // Option (3) '&ptr->field', and Option (4) continuation.
+
+    // size_t offset_diff = sizeof_struct - field_offset;
+    Value *OffsetDiff = Builder.CreateSub(SizeofStruct, FieldOffset,
+                                          "offset_diff", !IsSigned, IsSigned);
+
+    // return offset_diff + flexible_array_member_size;
+    Res = Builder.CreateAdd(FlexibleArrayMemberSize, OffsetDiff, "result");
+  }
+
+  Value *Cmp = Builder.CreateIsNotNeg(Res);
+  if (Idx)
+    Cmp = Builder.CreateAnd(Builder.CreateIsNotNeg(Index), Cmp);
 
   return Builder.CreateSelect(Cmp, Res, ConstantInt::get(ResType, 0, IsSigned));
 }
@@ -1315,13 +1436,6 @@ CodeGenFunction::emitBuiltinObjectSize(const Expr *E, unsigned Type,
     }
   }
 
-  if (IsDynamic) {
-    // Emit special code for a flexible array member with the "counted_by"
-    // attribute.
- if (Value *V = emitFlexibleArrayMemberSize(E, Type, ResType)) - return V; - } - // LLVM can't handle Type=3 appropriately, and __builtin_object_size shouldn't // evaluate E for side-effects. In either case, we shouldn't lower to // @llvm.objectsize. @@ -1332,6 +1446,12 @@ CodeGenFunction::emitBuiltinObjectSize(const Expr *E, unsigned Type, assert(Ptr->getType()->isPointerTy() && "Non-pointer passed to __builtin_object_size?"); + if (IsDynamic) + // Emit special code for a flexible array member with the "counted_by" + // attribute. + if (Value *V = emitCountedByMemberSize(E, Ptr, Type, ResType)) + return V; + Function *F = CGM.getIntrinsic(Intrinsic::objectsize, {ResType, Ptr->getType()}); @@ -15254,6 +15374,17 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, Function *F = CGM.getIntrinsic(Intrinsic::prefetch, Address->getType()); return Builder.CreateCall(F, {Address, RW, Locality, Data}); } + case X86::BI_m_prefetch: + case X86::BI_m_prefetchw: { + Value *Address = Ops[0]; + // The 'w' suffix implies write. + Value *RW = + ConstantInt::get(Int32Ty, BuiltinID == X86::BI_m_prefetchw ? 1 : 0); + Value *Locality = ConstantInt::get(Int32Ty, 0x3); + Value *Data = ConstantInt::get(Int32Ty, 1); + Function *F = CGM.getIntrinsic(Intrinsic::prefetch, Address->getType()); + return Builder.CreateCall(F, {Address, RW, Locality, Data}); + } case X86::BI_mm_clflush: { return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse2_clflush), Ops[0]); diff --git clang/lib/CodeGen/CGExpr.cpp clang/lib/CodeGen/CGExpr.cpp index 9676e61cf322..bf8df2789f58 100644 --- clang/lib/CodeGen/CGExpr.cpp +++ clang/lib/CodeGen/CGExpr.cpp @@ -3614,29 +3614,33 @@ void CodeGenFunction::EmitCheck( llvm::Value *RecoverableCond = nullptr; llvm::Value *TrapCond = nullptr; bool NoMerge = false; + // Expand checks into: + // (Check1 || !allow_ubsan_check) && (Check2 || !allow_ubsan_check) ... + // We need separate allow_ubsan_check intrinsics because they have separately + // specified cutoffs. + // This expression looks expensive but will be simplified after + // LowerAllowCheckPass. for (auto &[Check, Ord] : Checked) { + llvm::Value *GuardedCheck = Check; + if (ClSanitizeGuardChecks || + (CGM.getCodeGenOpts().SanitizeSkipHotCutoffs[Ord] > 0)) { + llvm::Value *Allow = Builder.CreateCall( + CGM.getIntrinsic(llvm::Intrinsic::allow_ubsan_check), + llvm::ConstantInt::get(CGM.Int8Ty, Ord)); + GuardedCheck = Builder.CreateOr(Check, Builder.CreateNot(Allow)); + } + // -fsanitize-trap= overrides -fsanitize-recover=. llvm::Value *&Cond = CGM.getCodeGenOpts().SanitizeTrap.has(Ord) ? TrapCond : CGM.getCodeGenOpts().SanitizeRecover.has(Ord) ? RecoverableCond : FatalCond; - Cond = Cond ? Builder.CreateAnd(Cond, Check) : Check; + Cond = Cond ? 
Builder.CreateAnd(Cond, GuardedCheck) : GuardedCheck; if (!CGM.getCodeGenOpts().SanitizeMergeHandlers.has(Ord)) NoMerge = true; } - if (ClSanitizeGuardChecks) { - llvm::Value *Allow = - Builder.CreateCall(CGM.getIntrinsic(llvm::Intrinsic::allow_ubsan_check), - llvm::ConstantInt::get(CGM.Int8Ty, CheckHandler)); - - for (llvm::Value **Cond : {&FatalCond, &RecoverableCond, &TrapCond}) { - if (*Cond) - *Cond = Builder.CreateOr(*Cond, Builder.CreateNot(Allow)); - } - } - if (TrapCond) EmitTrapCheck(TrapCond, CheckHandler, NoMerge); if (!FatalCond && !RecoverableCond) diff --git clang/lib/CodeGen/CGOpenCLRuntime.cpp clang/lib/CodeGen/CGOpenCLRuntime.cpp index 115b618056a4..9f8ff488755e 100644 --- clang/lib/CodeGen/CGOpenCLRuntime.cpp +++ clang/lib/CodeGen/CGOpenCLRuntime.cpp @@ -130,10 +130,11 @@ void CGOpenCLRuntime::recordBlockInfo(const BlockExpr *E, assert(!EnqueuedBlockMap.contains(E) && "Block expression emitted twice"); assert(isa<llvm::Function>(InvokeF) && "Invalid invoke function"); assert(Block->getType()->isPointerTy() && "Invalid block literal type"); - EnqueuedBlockMap[E].InvokeFunc = InvokeF; - EnqueuedBlockMap[E].BlockArg = Block; - EnqueuedBlockMap[E].BlockTy = BlockTy; - EnqueuedBlockMap[E].KernelHandle = nullptr; + EnqueuedBlockInfo &BlockInfo = EnqueuedBlockMap[E]; + BlockInfo.InvokeFunc = InvokeF; + BlockInfo.BlockArg = Block; + BlockInfo.BlockTy = BlockTy; + BlockInfo.KernelHandle = nullptr; } llvm::Function *CGOpenCLRuntime::getInvokeFunction(const Expr *E) { @@ -148,17 +149,19 @@ CGOpenCLRuntime::emitOpenCLEnqueuedBlock(CodeGenFunction &CGF, const Expr *E) { // to get the block literal. const BlockExpr *Block = getBlockExpr(E); - assert(EnqueuedBlockMap.contains(Block) && "Block expression not emitted"); + auto It = EnqueuedBlockMap.find(Block); + assert(It != EnqueuedBlockMap.end() && "Block expression not emitted"); + EnqueuedBlockInfo &BlockInfo = It->second; // Do not emit the block wrapper again if it has been emitted. - if (EnqueuedBlockMap[Block].KernelHandle) { - return EnqueuedBlockMap[Block]; + if (BlockInfo.KernelHandle) { + return BlockInfo; } auto *F = CGF.getTargetHooks().createEnqueuedBlockKernel( - CGF, EnqueuedBlockMap[Block].InvokeFunc, EnqueuedBlockMap[Block].BlockTy); + CGF, BlockInfo.InvokeFunc, BlockInfo.BlockTy); // The common part of the post-processing of the kernel goes here. - EnqueuedBlockMap[Block].KernelHandle = F; - return EnqueuedBlockMap[Block]; + BlockInfo.KernelHandle = F; + return BlockInfo; } diff --git clang/lib/CodeGen/CGStmt.cpp clang/lib/CodeGen/CGStmt.cpp index 7c944fe85a35..e2ae1046c084 100644 --- clang/lib/CodeGen/CGStmt.cpp +++ clang/lib/CodeGen/CGStmt.cpp @@ -489,6 +489,8 @@ void CodeGenFunction::EmitStmt(const Stmt *S, ArrayRef<const Attr *> Attrs) { case Stmt::OpenACCUpdateConstructClass: EmitOpenACCUpdateConstruct(cast<OpenACCUpdateConstruct>(*S)); break; + case Stmt::OpenACCAtomicConstructClass: + EmitOpenACCAtomicConstruct(cast<OpenACCAtomicConstruct>(*S)); } } diff --git clang/lib/CodeGen/CodeGenFunction.cpp clang/lib/CodeGen/CodeGenFunction.cpp index bbef277a5244..08165e0b2840 100644 --- clang/lib/CodeGen/CodeGenFunction.cpp +++ clang/lib/CodeGen/CodeGenFunction.cpp @@ -551,14 +551,6 @@ void CodeGenFunction::FinishFunction(SourceLocation EndLoc) { CurFn->addFnAttr("min-legal-vector-width", llvm::utostr(LargestVectorWidth)); - // Add vscale_range attribute if appropriate. 
- std::optional<std::pair<unsigned, unsigned>> VScaleRange = - getContext().getTargetInfo().getVScaleRange(getLangOpts()); - if (VScaleRange) { - CurFn->addFnAttr(llvm::Attribute::getWithVScaleRangeArgs( - getLLVMContext(), VScaleRange->first, VScaleRange->second)); - } - // If we generated an unreachable return block, delete it now. if (ReturnBlock.isValid() && ReturnBlock.getBlock()->use_empty()) { Builder.ClearInsertionPoint(); @@ -1110,6 +1102,15 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy, if (FD && FD->isMain()) Fn->removeFnAttr("zero-call-used-regs"); + // Add vscale_range attribute if appropriate. + std::optional<std::pair<unsigned, unsigned>> VScaleRange = + getContext().getTargetInfo().getVScaleRange( + getLangOpts(), FD ? IsArmStreamingFunction(FD, true) : false); + if (VScaleRange) { + CurFn->addFnAttr(llvm::Attribute::getWithVScaleRangeArgs( + getLLVMContext(), VScaleRange->first, VScaleRange->second)); + } + llvm::BasicBlock *EntryBB = createBasicBlock("entry", CurFn); // Create a marker to make it easy to insert allocas into the entryblock diff --git clang/lib/CodeGen/CodeGenFunction.h clang/lib/CodeGen/CodeGenFunction.h index e978cad43362..ced3484fbd2b 100644 --- clang/lib/CodeGen/CodeGenFunction.h +++ clang/lib/CodeGen/CodeGenFunction.h @@ -3324,20 +3324,11 @@ public: llvm::Value *Index, QualType IndexType, QualType IndexedType, bool Accessed); - // Find a struct's flexible array member and get its offset. It may be - // embedded inside multiple sub-structs, but must still be the last field. - const FieldDecl * - FindFlexibleArrayMemberFieldAndOffset(ASTContext &Ctx, const RecordDecl *RD, - const FieldDecl *FAMDecl, - uint64_t &Offset); - - llvm::Value *GetCountedByFieldExprGEP(const Expr *Base, - const FieldDecl *FAMDecl, + llvm::Value *GetCountedByFieldExprGEP(const Expr *Base, const FieldDecl *FD, const FieldDecl *CountDecl); /// Build an expression accessing the "counted_by" field. - llvm::Value *EmitLoadOfCountedByField(const Expr *Base, - const FieldDecl *FAMDecl, + llvm::Value *EmitLoadOfCountedByField(const Expr *Base, const FieldDecl *FD, const FieldDecl *CountDecl); llvm::Value *EmitScalarPrePostIncDec(const UnaryOperator *E, LValue LV, @@ -4176,6 +4167,13 @@ public: // but in the future we will implement some sort of IR. } + void EmitOpenACCAtomicConstruct(const OpenACCAtomicConstruct &S) { + // TODO OpenACC: Implement this. It is currently implemented as a 'no-op', + // simply emitting its associated stmt, but in the future we will implement + // some sort of IR. + EmitStmt(S.getAssociatedStmt()); + } + //===--------------------------------------------------------------------===// // LValue Expression Emission //===--------------------------------------------------------------------===// @@ -5369,8 +5367,9 @@ private: llvm::Value *EmittedE, bool IsDynamic); - llvm::Value *emitFlexibleArrayMemberSize(const Expr *E, unsigned Type, - llvm::IntegerType *ResType); + llvm::Value *emitCountedByMemberSize(const Expr *E, llvm::Value *EmittedE, + unsigned Type, + llvm::IntegerType *ResType); void emitZeroOrPatternForAutoVarInit(QualType type, const VarDecl &D, Address Loc); diff --git clang/lib/CodeGen/CodeGenModule.cpp clang/lib/CodeGen/CodeGenModule.cpp index a015d64f4065..82002b8d8e4d 100644 --- clang/lib/CodeGen/CodeGenModule.cpp +++ clang/lib/CodeGen/CodeGenModule.cpp @@ -1293,6 +1293,11 @@ void CodeGenModule::Release() { if (LangOpts.EHAsynch) getModule().addModuleFlag(llvm::Module::Warning, "eh-asynch", 1); + // Emit Import Call section. 
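+  // Illustrative sketch of the flag this emits (module-level metadata, using
+  // the Warning merge behavior):
+  //
+  //   !{i32 2, !"import-call-optimization", i32 1}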
+  if (CodeGenOpts.ImportCallOptimization)
+    getModule().addModuleFlag(llvm::Module::Warning, "import-call-optimization",
+                              1);
+
   // Indicate whether this Module was compiled with -fopenmp
   if (getLangOpts().OpenMP && !getLangOpts().OpenMPSimd)
     getModule().addModuleFlag(llvm::Module::Max, "openmp", LangOpts.OpenMP);
@@ -3760,7 +3765,7 @@ ConstantAddress CodeGenModule::GetAddrOfTemplateParamObject(
   auto *GV = new llvm::GlobalVariable(getModule(), Init->getType(),
                                       /*isConstant=*/true, Linkage, Init, Name);
   setGVProperties(GV, TPO);
-  if (supportsCOMDAT())
+  if (supportsCOMDAT() && Linkage == llvm::GlobalValue::LinkOnceODRLinkage)
     GV->setComdat(TheModule.getOrInsertComdat(GV->getName()));
   Emitter.finalize(GV);
diff --git clang/lib/CodeGen/CodeGenTBAA.cpp clang/lib/CodeGen/CodeGenTBAA.cpp
index 75e66bae79af..3f1a24791ddd 100644
--- clang/lib/CodeGen/CodeGenTBAA.cpp
+++ clang/lib/CodeGen/CodeGenTBAA.cpp
@@ -226,6 +226,14 @@ llvm::MDNode *CodeGenTBAA::getTypeInfoHelper(const Type *Ty) {
       PtrDepth++;
       Ty = Ty->getPointeeType()->getBaseElementTypeUnsafe();
     } while (Ty->isPointerType());
+
+    // While there are no special rules in the standards regarding void
+    // pointers and strict aliasing, emitting distinct tags for void pointers
+    // breaks some common idioms, and there is no good way to rewrite the
+    // affected code without strict-aliasing violations.
+    if (Ty->isVoidType())
+      return AnyPtr;
+
     assert(!isa<VariableArrayType>(Ty));
     // When the underlying type is a builtin type, we compute the pointee type
     // string recursively, which is implicitly more forgiving than the standards
diff --git clang/lib/CodeGen/ItaniumCXXABI.cpp clang/lib/CodeGen/ItaniumCXXABI.cpp
index 7c463f51f63d..7375a511809b 100644
--- clang/lib/CodeGen/ItaniumCXXABI.cpp
+++ clang/lib/CodeGen/ItaniumCXXABI.cpp
@@ -2239,8 +2239,8 @@ CGCallee ItaniumCXXABI::getVirtualFunctionPointer(CodeGenFunction &CGF,
 llvm::Value *ItaniumCXXABI::EmitVirtualDestructorCall(
     CodeGenFunction &CGF, const CXXDestructorDecl *Dtor, CXXDtorType DtorType,
     Address This, DeleteOrMemberCallExpr E, llvm::CallBase **CallOrInvoke) {
-  auto *CE = E.dyn_cast<const CXXMemberCallExpr *>();
-  auto *D = E.dyn_cast<const CXXDeleteExpr *>();
+  auto *CE = dyn_cast<const CXXMemberCallExpr *>(E);
+  auto *D = dyn_cast<const CXXDeleteExpr *>(E);
   assert((CE != nullptr) ^ (D != nullptr));
   assert(CE == nullptr || CE->arg_begin() == CE->arg_end());
   assert(DtorType == Dtor_Deleting || DtorType == Dtor_Complete);
diff --git clang/lib/CodeGen/MicrosoftCXXABI.cpp clang/lib/CodeGen/MicrosoftCXXABI.cpp
index 0d53e8cb45fe..4a2630e83b62 100644
--- clang/lib/CodeGen/MicrosoftCXXABI.cpp
+++ clang/lib/CodeGen/MicrosoftCXXABI.cpp
@@ -1996,8 +1996,8 @@ CGCallee MicrosoftCXXABI::getVirtualFunctionPointer(CodeGenFunction &CGF,
 llvm::Value *MicrosoftCXXABI::EmitVirtualDestructorCall(
     CodeGenFunction &CGF, const CXXDestructorDecl *Dtor, CXXDtorType DtorType,
     Address This, DeleteOrMemberCallExpr E, llvm::CallBase **CallOrInvoke) {
-  auto *CE = E.dyn_cast<const CXXMemberCallExpr *>();
-  auto *D = E.dyn_cast<const CXXDeleteExpr *>();
+  auto *CE = dyn_cast<const CXXMemberCallExpr *>(E);
+  auto *D = dyn_cast<const CXXDeleteExpr *>(E);
   assert((CE != nullptr) ^ (D != nullptr));
   assert(CE == nullptr || CE->arg_begin() == CE->arg_end());
   assert(DtorType == Dtor_Deleting || DtorType == Dtor_Complete);
diff --git clang/lib/CodeGen/Targets/ARM.cpp clang/lib/CodeGen/Targets/ARM.cpp
index 2d858fa2f3c3..47e31ceeaf29 100644
--- clang/lib/CodeGen/Targets/ARM.cpp
+++ clang/lib/CodeGen/Targets/ARM.cpp
@@ -71,6 +71,7 @@ private:
                                           unsigned functionCallConv) const;
   ABIArgInfo classifyHomogeneousAggregate(QualType Ty, const Type *Base,
                                           uint64_t Members) const;
+  bool shouldIgnoreEmptyArg(QualType Ty) const;
   ABIArgInfo coerceIllegalVector(QualType Ty) const;
   bool isIllegalVectorType(QualType Ty) const;
   bool containsAnyFP16Vectors(QualType Ty) const;
@@ -328,6 +329,31 @@ ABIArgInfo ARMABIInfo::classifyHomogeneousAggregate(QualType Ty,
   return ABIArgInfo::getDirect(nullptr, 0, nullptr, false, Align);
 }
 
+bool ARMABIInfo::shouldIgnoreEmptyArg(QualType Ty) const {
+  uint64_t Size = getContext().getTypeSize(Ty);
+  assert((isEmptyRecord(getContext(), Ty, true) || Size == 0) &&
+         "Arg is not empty");
+
+  // Empty records are ignored in C mode, and in C++ on WatchOS.
+  if (!getContext().getLangOpts().CPlusPlus ||
+      getABIKind() == ARMABIKind::AAPCS16_VFP)
+    return true;
+
+  // In C++ mode, arguments which have sizeof() == 0 are ignored. This is not a
+  // situation which is defined by any C++ standard or ABI, but this matches
+  // GCC's de facto ABI.
+  if (Size == 0)
+    return true;
+
+  // Clang 19.0 and earlier always ignored empty struct arguments in C++ mode.
+  if (getContext().getLangOpts().getClangABICompat() <=
+      LangOptions::ClangABI::Ver19)
+    return true;
+
+  // Otherwise, they are passed as if they have a size of 1 byte.
+  return false;
+}
+
 ABIArgInfo ARMABIInfo::classifyArgumentType(QualType Ty, bool isVariadic,
                                             unsigned functionCallConv) const {
   // 6.1.2.1 The following argument types are VFP CPRCs:
@@ -366,9 +392,15 @@ ABIArgInfo ARMABIInfo::classifyArgumentType(QualType Ty, bool isVariadic,
     return getNaturalAlignIndirect(Ty, RAA == CGCXXABI::RAA_DirectInMemory);
   }
 
-  // Ignore empty records.
-  if (isEmptyRecord(getContext(), Ty, true))
-    return ABIArgInfo::getIgnore();
+  // Empty records are either ignored completely or passed as if they were a
+  // 1-byte object, depending on the ABI and language standard.
+  if (isEmptyRecord(getContext(), Ty, true) ||
+      getContext().getTypeSize(Ty) == 0) {
+    if (shouldIgnoreEmptyArg(Ty))
+      return ABIArgInfo::getIgnore();
+    else
+      return ABIArgInfo::getDirect(llvm::Type::getInt8Ty(getVMContext()));
+  }
 
   if (IsAAPCS_VFP) {
     // Homogeneous Aggregates need to be expanded when we can fit the aggregate
@@ -588,7 +620,8 @@ ABIArgInfo ARMABIInfo::classifyReturnType(QualType RetTy, bool isVariadic,
 
   // Otherwise this is an AAPCS variant.
 
-  if (isEmptyRecord(getContext(), RetTy, true))
+  if (isEmptyRecord(getContext(), RetTy, true) ||
+      getContext().getTypeSize(RetTy) == 0)
     return ABIArgInfo::getIgnore();
 
   // Check for homogeneous aggregates with AAPCS-VFP.
@@ -752,7 +785,9 @@ RValue ARMABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
   CharUnits SlotSize = CharUnits::fromQuantity(4);
 
   // Empty records are ignored for parameter passing purposes.
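+  // For example, under the rules in shouldIgnoreEmptyArg above, va_arg of
+  // 'struct Empty {};' is still skipped in C mode, but in C++ (post-Clang-19
+  // ABI) it now consumes an argument slot like a 1-byte object.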
- if (isEmptyRecord(getContext(), Ty, true)) + uint64_t Size = getContext().getTypeSize(Ty); + bool IsEmpty = isEmptyRecord(getContext(), Ty, true); + if ((IsEmpty || Size == 0) && shouldIgnoreEmptyArg(Ty)) return Slot.asRValue(); CharUnits TySize = getContext().getTypeSizeInChars(Ty); diff --git clang/lib/CodeGen/Targets/RISCV.cpp clang/lib/CodeGen/Targets/RISCV.cpp index 2b70f2bd3f38..2c48ba37fd20 100644 --- clang/lib/CodeGen/Targets/RISCV.cpp +++ clang/lib/CodeGen/Targets/RISCV.cpp @@ -367,8 +367,8 @@ ABIArgInfo RISCVABIInfo::coerceVLSVector(QualType Ty) const { const auto *VT = Ty->castAs<VectorType>(); assert(VT->getElementType()->isBuiltinType() && "expected builtin type!"); - auto VScale = - getContext().getTargetInfo().getVScaleRange(getContext().getLangOpts()); + auto VScale = getContext().getTargetInfo().getVScaleRange( + getContext().getLangOpts(), false); unsigned NumElts = VT->getNumElements(); llvm::Type *EltType = llvm::Type::getInt1Ty(getVMContext()); diff --git clang/lib/Driver/Driver.cpp clang/lib/Driver/Driver.cpp index 612e44b8c4da..912777a9808b 100644 --- clang/lib/Driver/Driver.cpp +++ clang/lib/Driver/Driver.cpp @@ -4712,23 +4712,7 @@ Driver::getOffloadArchs(Compilation &C, const llvm::opt::DerivedArgList &Args, return KnownArchs.lookup(TC); llvm::DenseSet<StringRef> Archs; - for (auto *Arg : Args) { - // Extract any '--[no-]offload-arch' arguments intended for this toolchain. - std::unique_ptr<llvm::opt::Arg> ExtractedArg = nullptr; - if (Arg->getOption().matches(options::OPT_Xopenmp_target_EQ) && - ToolChain::getOpenMPTriple(Arg->getValue(0)) == TC->getTriple()) { - Arg->claim(); - unsigned Index = Args.getBaseArgs().MakeIndex(Arg->getValue(1)); - unsigned Prev = Index; - ExtractedArg = getOpts().ParseOneArg(Args, Index); - if (!ExtractedArg || Index > Prev + 1) { - TC->getDriver().Diag(diag::err_drv_invalid_Xopenmp_target_with_args) - << Arg->getAsString(Args); - continue; - } - Arg = ExtractedArg.get(); - } - + for (auto *Arg : C.getArgsForToolChain(TC, /*BoundArch=*/"", Kind)) { // Add or remove the seen architectures in order of appearance. If an // invalid architecture is given we simply exit. if (Arg->getOption().matches(options::OPT_offload_arch_EQ)) { @@ -4785,14 +4769,31 @@ Driver::getOffloadArchs(Compilation &C, const llvm::opt::DerivedArgList &Args, return Archs; if (Archs.empty()) { - if (Kind == Action::OFK_Cuda) + if (Kind == Action::OFK_Cuda) { Archs.insert(OffloadArchToString(OffloadArch::CudaDefault)); - else if (Kind == Action::OFK_HIP) + } else if (Kind == Action::OFK_HIP) { Archs.insert(OffloadArchToString(OffloadArch::HIPDefault)); - else if (Kind == Action::OFK_OpenMP) - Archs.insert(StringRef()); - else if (Kind == Action::OFK_SYCL) + } else if (Kind == Action::OFK_SYCL) { Archs.insert(StringRef()); + } else if (Kind == Action::OFK_OpenMP) { + // Accept legacy `-march` device arguments for OpenMP. 
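+      // For example (legacy usage, retained for compatibility):
+      //   clang -fopenmp -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 ...
+      // still selects gfx906 for the matching offload toolchain.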
+ if (auto *Arg = C.getArgsForToolChain(TC, /*BoundArch=*/"", Kind) + .getLastArg(options::OPT_march_EQ)) { + Archs.insert(Arg->getValue()); + } else { + auto ArchsOrErr = TC->getSystemGPUArchs(Args); + if (!ArchsOrErr) { + TC->getDriver().Diag(diag::err_drv_undetermined_gpu_arch) + << llvm::Triple::getArchTypeName(TC->getArch()) + << llvm::toString(ArchsOrErr.takeError()) << "--offload-arch"; + } else if (!ArchsOrErr->empty()) { + for (auto Arch : *ArchsOrErr) + Archs.insert(Args.MakeArgStringRef(Arch)); + } else { + Archs.insert(StringRef()); + } + } + } } else { Args.ClaimAllArgs(options::OPT_offload_arch_EQ); Args.ClaimAllArgs(options::OPT_no_offload_arch_EQ); diff --git clang/lib/Driver/ToolChains/AMDGPU.cpp clang/lib/Driver/ToolChains/AMDGPU.cpp index a8061ffd9321..83f486611bc9 100644 --- clang/lib/Driver/ToolChains/AMDGPU.cpp +++ clang/lib/Driver/ToolChains/AMDGPU.cpp @@ -950,6 +950,11 @@ void ROCMToolChain::addClangTargetOptions( ABIVer)) return; + std::tuple<bool, const SanitizerArgs> GPUSan( + DriverArgs.hasFlag(options::OPT_fgpu_sanitize, + options::OPT_fno_gpu_sanitize, true), + getSanitizerArgs(DriverArgs)); + bool Wave64 = isWave64(DriverArgs, Kind); // TODO: There are way too many flags that change this. Do we need to check @@ -965,21 +970,19 @@ void ROCMToolChain::addClangTargetOptions( DriverArgs.hasArg(options::OPT_cl_fp32_correctly_rounded_divide_sqrt); // Add the OpenCL specific bitcode library. - llvm::SmallVector<std::string, 12> BCLibs; - BCLibs.push_back(RocmInstallation->getOpenCLPath().str()); + llvm::SmallVector<BitCodeLibraryInfo, 12> BCLibs; + BCLibs.emplace_back(RocmInstallation->getOpenCLPath().str()); // Add the generic set of libraries. BCLibs.append(RocmInstallation->getCommonBitcodeLibs( DriverArgs, LibDeviceFile, Wave64, DAZ, FiniteOnly, UnsafeMathOpt, - FastRelaxedMath, CorrectSqrt, ABIVer, false)); + FastRelaxedMath, CorrectSqrt, ABIVer, GPUSan, false)); - if (getSanitizerArgs(DriverArgs).needsAsanRt()) { - CC1Args.push_back("-mlink-bitcode-file"); - CC1Args.push_back( - DriverArgs.MakeArgString(RocmInstallation->getAsanRTLPath())); - } - for (StringRef BCFile : BCLibs) { - CC1Args.push_back("-mlink-builtin-bitcode"); + for (auto [BCFile, Internalize] : BCLibs) { + if (Internalize) + CC1Args.push_back("-mlink-builtin-bitcode"); + else + CC1Args.push_back("-mlink-bitcode-file"); CC1Args.push_back(DriverArgs.MakeArgString(BCFile)); } } @@ -1002,18 +1005,35 @@ bool RocmInstallationDetector::checkCommonBitcodeLibs( return true; } -llvm::SmallVector<std::string, 12> +llvm::SmallVector<ToolChain::BitCodeLibraryInfo, 12> RocmInstallationDetector::getCommonBitcodeLibs( const llvm::opt::ArgList &DriverArgs, StringRef LibDeviceFile, bool Wave64, bool DAZ, bool FiniteOnly, bool UnsafeMathOpt, bool FastRelaxedMath, - bool CorrectSqrt, DeviceLibABIVersion ABIVer, bool isOpenMP = false) const { - llvm::SmallVector<std::string, 12> BCLibs; - - auto AddBCLib = [&](StringRef BCFile) { BCLibs.push_back(BCFile.str()); }; + bool CorrectSqrt, DeviceLibABIVersion ABIVer, + const std::tuple<bool, const SanitizerArgs> &GPUSan, + bool isOpenMP = false) const { + llvm::SmallVector<ToolChain::BitCodeLibraryInfo, 12> BCLibs; + + auto GPUSanEnabled = [GPUSan]() { return std::get<bool>(GPUSan); }; + auto AddBCLib = [&](ToolChain::BitCodeLibraryInfo BCLib, + bool Internalize = true) { + BCLib.ShouldInternalize = Internalize; + BCLibs.emplace_back(BCLib); + }; + auto AddSanBCLibs = [&]() { + if (GPUSanEnabled()) { + auto SanArgs = std::get<const SanitizerArgs>(GPUSan); + if 
(SanArgs.needsAsanRt()) + AddBCLib(getAsanRTLPath(), false); + } + }; + AddSanBCLibs(); AddBCLib(getOCMLPath()); if (!isOpenMP) AddBCLib(getOCKLPath()); + else if (GPUSanEnabled() && isOpenMP) + AddBCLib(getOCKLPath(), false); AddBCLib(getDenormalsAreZeroPath(DAZ)); AddBCLib(getUnsafeMathPath(UnsafeMathOpt || FastRelaxedMath)); AddBCLib(getFiniteOnlyPath(FiniteOnly || FastRelaxedMath)); @@ -1027,7 +1047,7 @@ RocmInstallationDetector::getCommonBitcodeLibs( return BCLibs; } -llvm::SmallVector<std::string, 12> +llvm::SmallVector<ToolChain::BitCodeLibraryInfo, 12> ROCMToolChain::getCommonDeviceLibNames(const llvm::opt::ArgList &DriverArgs, const std::string &GPUArch, bool isOpenMP) const { @@ -1044,6 +1064,10 @@ ROCMToolChain::getCommonDeviceLibNames(const llvm::opt::ArgList &DriverArgs, // If --hip-device-lib is not set, add the default bitcode libraries. // TODO: There are way too many flags that change this. Do we need to check // them all? + std::tuple<bool, const SanitizerArgs> GPUSan( + DriverArgs.hasFlag(options::OPT_fgpu_sanitize, + options::OPT_fno_gpu_sanitize, true), + getSanitizerArgs(DriverArgs)); bool DAZ = DriverArgs.hasFlag(options::OPT_fgpu_flush_denormals_to_zero, options::OPT_fno_gpu_flush_denormals_to_zero, getDefaultDenormsAreZeroForTarget(Kind)); @@ -1061,7 +1085,7 @@ ROCMToolChain::getCommonDeviceLibNames(const llvm::opt::ArgList &DriverArgs, return RocmInstallation->getCommonBitcodeLibs( DriverArgs, LibDeviceFile, Wave64, DAZ, FiniteOnly, UnsafeMathOpt, - FastRelaxedMath, CorrectSqrt, ABIVer, isOpenMP); + FastRelaxedMath, CorrectSqrt, ABIVer, GPUSan, isOpenMP); } bool AMDGPUToolChain::shouldSkipSanitizeOption( diff --git clang/lib/Driver/ToolChains/AMDGPU.h clang/lib/Driver/ToolChains/AMDGPU.h index a9b4552a1f91..aad6bc75dffa 100644 --- clang/lib/Driver/ToolChains/AMDGPU.h +++ clang/lib/Driver/ToolChains/AMDGPU.h @@ -142,7 +142,7 @@ public: Action::OffloadKind DeviceOffloadKind) const override; // Returns a list of device library names shared by different languages - llvm::SmallVector<std::string, 12> + llvm::SmallVector<BitCodeLibraryInfo, 12> getCommonDeviceLibNames(const llvm::opt::ArgList &DriverArgs, const std::string &GPUArch, bool isOpenMP = false) const; diff --git clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp index 3f0b3f2d86b3..24d244ba6f0e 100644 --- clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp +++ clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp @@ -9,7 +9,7 @@ #include "AMDGPUOpenMP.h" #include "AMDGPU.h" #include "CommonArgs.h" -#include "ToolChains/ROCm.h" +#include "ROCm.h" #include "clang/Basic/DiagnosticDriver.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" @@ -71,33 +71,9 @@ llvm::opt::DerivedArgList *AMDGPUOpenMPToolChain::TranslateArgs( const OptTable &Opts = getDriver().getOpts(); - if (DeviceOffloadKind == Action::OFK_OpenMP) { - for (Arg *A : Args) - if (!llvm::is_contained(*DAL, A)) - DAL->append(A); - - if (!DAL->hasArg(options::OPT_march_EQ)) { - StringRef Arch = BoundArch; - if (Arch.empty()) { - auto ArchsOrErr = getSystemGPUArchs(Args); - if (!ArchsOrErr) { - std::string ErrMsg = - llvm::formatv("{0}", llvm::fmt_consume(ArchsOrErr.takeError())); - getDriver().Diag(diag::err_drv_undetermined_gpu_arch) - << llvm::Triple::getArchTypeName(getArch()) << ErrMsg << "-march"; - Arch = OffloadArchToString(OffloadArch::HIPDefault); - } else { - Arch = Args.MakeArgString(ArchsOrErr->front()); - } - } - DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ), Arch); - } - - 
return DAL; - } - for (Arg *A : Args) { - DAL->append(A); + if (!llvm::is_contained(*DAL, A)) + DAL->append(A); } if (!BoundArch.empty()) { @@ -159,11 +135,6 @@ AMDGPUOpenMPToolChain::getDeviceLibs(const llvm::opt::ArgList &Args) const { if (Args.hasArg(options::OPT_nogpulib)) return {}; - if (!RocmInstallation->hasDeviceLibrary()) { - getDriver().Diag(diag::err_drv_no_rocm_device_lib) << 0; - return {}; - } - StringRef GpuArch = getProcessorFromTargetID( getTriple(), Args.getLastArgValue(options::OPT_march_EQ)); diff --git clang/lib/Driver/ToolChains/Arch/X86.cpp clang/lib/Driver/ToolChains/Arch/X86.cpp index b2109e11038f..47c2c3e23f9f 100644 --- clang/lib/Driver/ToolChains/Arch/X86.cpp +++ clang/lib/Driver/ToolChains/Arch/X86.cpp @@ -237,15 +237,18 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple, bool IsNegative = Name.consume_front("no-"); -#ifndef NDEBUG - assert(Name.starts_with("avx10.") && "Invalid AVX10 feature name."); StringRef Version, Width; std::tie(Version, Width) = Name.substr(6).split('-'); + assert(Name.starts_with("avx10.") && "Invalid AVX10 feature name."); assert((Version == "1" || Version == "2") && "Invalid AVX10 feature name."); - assert((Width == "256" || Width == "512") && "Invalid AVX10 feature name."); -#endif - Features.push_back(Args.MakeArgString((IsNegative ? "-" : "+") + Name)); + if (Width == "") { + assert(IsNegative && "Only negative options can omit width."); + Features.push_back(Args.MakeArgString("-" + Name + "-256")); + } else { + assert((Width == "256" || Width == "512") && "Invalid vector length."); + Features.push_back(Args.MakeArgString((IsNegative ? "-" : "+") + Name)); + } } // Now add any that the user explicitly requested on the command line, diff --git clang/lib/Driver/ToolChains/CommonArgs.cpp clang/lib/Driver/ToolChains/CommonArgs.cpp index a213a055956c..0dbc0515ec8c 100644 --- clang/lib/Driver/ToolChains/CommonArgs.cpp +++ clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -1439,13 +1439,12 @@ void tools::linkSanitizerRuntimeDeps(const ToolChain &TC, CmdArgs.push_back("-lm"); // There's no libdl on all OSes. if (!TC.getTriple().isOSFreeBSD() && !TC.getTriple().isOSNetBSD() && - !TC.getTriple().isOSOpenBSD() && + !TC.getTriple().isOSOpenBSD() && !TC.getTriple().isOSDragonFly() && TC.getTriple().getOS() != llvm::Triple::RTEMS) CmdArgs.push_back("-ldl"); // Required for backtrace on some OSes - if (TC.getTriple().isOSFreeBSD() || - TC.getTriple().isOSNetBSD() || - TC.getTriple().isOSOpenBSD()) + if (TC.getTriple().isOSFreeBSD() || TC.getTriple().isOSNetBSD() || + TC.getTriple().isOSOpenBSD() || TC.getTriple().isOSDragonFly()) CmdArgs.push_back("-lexecinfo"); // There is no libresolv on Android, FreeBSD, OpenBSD, etc. On musl // libresolv.a, even if exists, is an empty archive to satisfy POSIX -lresolv diff --git clang/lib/Driver/ToolChains/Cuda.cpp clang/lib/Driver/ToolChains/Cuda.cpp index 0922a97ed7c1..c800e9cfa0a8 100644 --- clang/lib/Driver/ToolChains/Cuda.cpp +++ clang/lib/Driver/ToolChains/Cuda.cpp @@ -969,34 +969,6 @@ CudaToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args, const OptTable &Opts = getDriver().getOpts(); - // For OpenMP device offloading, append derived arguments. Make sure - // flags are not duplicated. - // Also append the compute capability. 
- if (DeviceOffloadKind == Action::OFK_OpenMP) { - for (Arg *A : Args) - if (!llvm::is_contained(*DAL, A)) - DAL->append(A); - - if (!DAL->hasArg(options::OPT_march_EQ)) { - StringRef Arch = BoundArch; - if (Arch.empty()) { - auto ArchsOrErr = getSystemGPUArchs(Args); - if (!ArchsOrErr) { - std::string ErrMsg = - llvm::formatv("{0}", llvm::fmt_consume(ArchsOrErr.takeError())); - getDriver().Diag(diag::err_drv_undetermined_gpu_arch) - << llvm::Triple::getArchTypeName(getArch()) << ErrMsg << "-march"; - Arch = OffloadArchToString(OffloadArch::CudaDefault); - } else { - Arch = Args.MakeArgString(ArchsOrErr->front()); - } - } - DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ), Arch); - } - - return DAL; - } - for (Arg *A : Args) { // Make sure flags are not duplicated. if (!llvm::is_contained(*DAL, A)) { diff --git clang/lib/Driver/ToolChains/HIPAMD.cpp clang/lib/Driver/ToolChains/HIPAMD.cpp index ccee065b5906..158a25207598 100644 --- clang/lib/Driver/ToolChains/HIPAMD.cpp +++ clang/lib/Driver/ToolChains/HIPAMD.cpp @@ -382,7 +382,7 @@ HIPAMDToolChain::getDeviceLibs(const llvm::opt::ArgList &DriverArgs) const { llvm::sys::path::append(Path, BCName); FullName = Path; if (llvm::sys::fs::exists(FullName)) { - BCLibs.push_back(FullName); + BCLibs.emplace_back(FullName); return; } } @@ -396,28 +396,11 @@ HIPAMDToolChain::getDeviceLibs(const llvm::opt::ArgList &DriverArgs) const { StringRef GpuArch = getGPUArch(DriverArgs); assert(!GpuArch.empty() && "Must have an explicit GPU arch."); - // If --hip-device-lib is not set, add the default bitcode libraries. - if (DriverArgs.hasFlag(options::OPT_fgpu_sanitize, - options::OPT_fno_gpu_sanitize, true) && - getSanitizerArgs(DriverArgs).needsAsanRt()) { - auto AsanRTL = RocmInstallation->getAsanRTLPath(); - if (AsanRTL.empty()) { - unsigned DiagID = getDriver().getDiags().getCustomDiagID( - DiagnosticsEngine::Error, - "AMDGPU address sanitizer runtime library (asanrtl) is not found. " - "Please install ROCm device library which supports address " - "sanitizer"); - getDriver().Diag(DiagID); - return {}; - } else - BCLibs.emplace_back(AsanRTL, /*ShouldInternalize=*/false); - } - // Add the HIP specific bitcode library. - BCLibs.push_back(RocmInstallation->getHIPPath()); + BCLibs.emplace_back(RocmInstallation->getHIPPath()); // Add common device libraries like ocml etc. - for (StringRef N : getCommonDeviceLibNames(DriverArgs, GpuArch.str())) + for (auto N : getCommonDeviceLibNames(DriverArgs, GpuArch.str())) BCLibs.emplace_back(N); // Add instrument lib. @@ -426,7 +409,7 @@ HIPAMDToolChain::getDeviceLibs(const llvm::opt::ArgList &DriverArgs) const { if (InstLib.empty()) return BCLibs; if (llvm::sys::fs::exists(InstLib)) - BCLibs.push_back(InstLib); + BCLibs.emplace_back(InstLib); else getDriver().Diag(diag::err_drv_no_such_file) << InstLib; } diff --git clang/lib/Driver/ToolChains/Hexagon.cpp clang/lib/Driver/ToolChains/Hexagon.cpp index 76cedf312d68..7ca5ab9af881 100644 --- clang/lib/Driver/ToolChains/Hexagon.cpp +++ clang/lib/Driver/ToolChains/Hexagon.cpp @@ -802,9 +802,7 @@ bool HexagonToolChain::isAutoHVXEnabled(const llvm::opt::ArgList &Args) { // Returns the default CPU for Hexagon. This is the default compilation target // if no Hexagon processor is selected at the command-line. 
// -StringRef HexagonToolChain::GetDefaultCPU() { - return "hexagonv60"; -} +StringRef HexagonToolChain::GetDefaultCPU() { return "hexagonv68"; } StringRef HexagonToolChain::GetTargetCPUVersion(const ArgList &Args) { Arg *CpuArg = nullptr; diff --git clang/lib/Driver/ToolChains/ROCm.h clang/lib/Driver/ToolChains/ROCm.h index dceb0ab03669..681c242b0678 100644 --- clang/lib/Driver/ToolChains/ROCm.h +++ clang/lib/Driver/ToolChains/ROCm.h @@ -13,6 +13,7 @@ #include "clang/Basic/LLVM.h" #include "clang/Driver/Driver.h" #include "clang/Driver/Options.h" +#include "clang/Driver/SanitizerArgs.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringMap.h" #include "llvm/Option/ArgList.h" @@ -173,12 +174,11 @@ public: /// Get file paths of default bitcode libraries common to AMDGPU based /// toolchains. - llvm::SmallVector<std::string, 12> - getCommonBitcodeLibs(const llvm::opt::ArgList &DriverArgs, - StringRef LibDeviceFile, bool Wave64, bool DAZ, - bool FiniteOnly, bool UnsafeMathOpt, - bool FastRelaxedMath, bool CorrectSqrt, - DeviceLibABIVersion ABIVer, bool isOpenMP) const; + llvm::SmallVector<ToolChain::BitCodeLibraryInfo, 12> getCommonBitcodeLibs( + const llvm::opt::ArgList &DriverArgs, StringRef LibDeviceFile, + bool Wave64, bool DAZ, bool FiniteOnly, bool UnsafeMathOpt, + bool FastRelaxedMath, bool CorrectSqrt, DeviceLibABIVersion ABIVer, + const std::tuple<bool, const SanitizerArgs> &GPUSan, bool isOpenMP) const; /// Check file paths of default bitcode libraries common to AMDGPU based /// toolchains. \returns false if there are invalid or missing files. bool checkCommonBitcodeLibs(StringRef GPUArch, StringRef LibDeviceFile, diff --git clang/lib/Format/ContinuationIndenter.cpp clang/lib/Format/ContinuationIndenter.cpp index c311deaa17bb..6f7d213c0b55 100644 --- clang/lib/Format/ContinuationIndenter.cpp +++ clang/lib/Format/ContinuationIndenter.cpp @@ -349,6 +349,13 @@ bool ContinuationIndenter::canBreak(const LineState &State) { } } + // Allow breaking before the right parens with block indentation if there was + // a break after the left parens, which is tracked by BreakBeforeClosingParen. + if (Style.AlignAfterOpenBracket == FormatStyle::BAS_BlockIndent && + Current.is(tok::r_paren)) { + return CurrentState.BreakBeforeClosingParen; + } + // Don't allow breaking before a closing brace of a block-indented braced list // initializer if there isn't already a break. 
if (Current.is(tok::r_brace) && Current.MatchingParen && diff --git clang/lib/Format/FormatToken.cpp clang/lib/Format/FormatToken.cpp index 963e8f87793f..99bce1f5f098 100644 --- clang/lib/Format/FormatToken.cpp +++ clang/lib/Format/FormatToken.cpp @@ -42,11 +42,12 @@ static SmallVector<StringRef> CppNonKeywordTypes = { }; bool FormatToken::isTypeName(const LangOptions &LangOpts) const { + if (is(TT_TypeName) || Tok.isSimpleTypeSpecifier(LangOpts)) + return true; const bool IsCpp = LangOpts.CXXOperatorNames; - return is(TT_TypeName) || Tok.isSimpleTypeSpecifier(LangOpts) || - (IsCpp && is(tok::identifier) && - std::binary_search(CppNonKeywordTypes.begin(), - CppNonKeywordTypes.end(), TokenText)); + return IsCpp && is(tok::identifier) && + std::binary_search(CppNonKeywordTypes.begin(), + CppNonKeywordTypes.end(), TokenText); } bool FormatToken::isTypeOrIdentifier(const LangOptions &LangOpts) const { diff --git clang/lib/Format/FormatToken.h clang/lib/Format/FormatToken.h index d97b6522f1fe..29aba281ae10 100644 --- clang/lib/Format/FormatToken.h +++ clang/lib/Format/FormatToken.h @@ -44,6 +44,8 @@ namespace format { TYPE(CaseLabelColon) \ TYPE(CastRParen) \ TYPE(ClassLBrace) \ + /* Name of class/struct/union/interface definition. */ \ + TYPE(ClassHeadName) \ TYPE(ClassRBrace) \ TYPE(CompoundRequirementLBrace) \ /* ternary ?: expression */ \ diff --git clang/lib/Format/TokenAnnotator.cpp clang/lib/Format/TokenAnnotator.cpp index a172df5291ae..f25332e3a5f4 100644 --- clang/lib/Format/TokenAnnotator.cpp +++ clang/lib/Format/TokenAnnotator.cpp @@ -1565,7 +1565,8 @@ private: if (const auto *Previous = Tok->Previous; !Previous || (!Previous->isAttribute() && - !Previous->isOneOf(TT_RequiresClause, TT_LeadingJavaAnnotation))) { + !Previous->isOneOf(TT_RequiresClause, TT_LeadingJavaAnnotation, + TT_BinaryOperator))) { Line.MightBeFunctionDecl = true; Tok->MightBeFunctionDeclParen = true; } @@ -2580,9 +2581,14 @@ private: if (Style.isVerilog()) return false; - if (Tok.isNot(tok::identifier) || !Tok.Previous) + if (!Tok.Previous || Tok.isNot(tok::identifier) || Tok.is(TT_ClassHeadName)) return false; + if ((Style.isJavaScript() || Style.Language == FormatStyle::LK_Java) && + Tok.is(Keywords.kw_extends)) { + return false; + } + if (const auto *NextNonComment = Tok.getNextNonComment(); (!NextNonComment && !Line.InMacroBody) || (NextNonComment && diff --git clang/lib/Format/UnwrappedLineParser.cpp clang/lib/Format/UnwrappedLineParser.cpp index 906fc11a07d5..4e040183f2f0 100644 --- clang/lib/Format/UnwrappedLineParser.cpp +++ clang/lib/Format/UnwrappedLineParser.cpp @@ -3632,7 +3632,7 @@ void UnwrappedLineParser::parseRequiresClause(FormatToken *RequiresToken) { // It could be inlined into here. 
parseConstraintExpression(); - if (!InRequiresExpression) + if (!InRequiresExpression && FormatTok->Previous) FormatTok->Previous->ClosesRequiresClause = true; } @@ -4029,7 +4029,7 @@ void UnwrappedLineParser::parseRecord(bool ParseAsExpr) { const FormatToken &InitialToken = *FormatTok; nextToken(); - const FormatToken *ClassName = nullptr; + FormatToken *ClassName = nullptr; bool IsDerived = false; auto IsNonMacroIdentifier = [](const FormatToken *Tok) { return Tok->is(tok::identifier) && Tok->TokenText != Tok->TokenText.upper(); @@ -4059,7 +4059,7 @@ void UnwrappedLineParser::parseRecord(bool ParseAsExpr) { } if (FormatTok->is(tok::l_square) && handleCppAttributes()) continue; - const auto *Previous = FormatTok; + auto *Previous = FormatTok; nextToken(); switch (FormatTok->Tok.getKind()) { case tok::l_paren: @@ -4074,9 +4074,12 @@ void UnwrappedLineParser::parseRecord(bool ParseAsExpr) { case tok::hashhash: break; default: - if (!JSPastExtendsOrImplements && !ClassName && - Previous->is(tok::identifier) && Previous->isNot(TT_AttributeMacro) && - Previous->TokenText != Previous->TokenText.upper()) { + if (JSPastExtendsOrImplements || ClassName || + Previous->isNot(tok::identifier) || Previous->is(TT_AttributeMacro)) { + break; + } + if (const auto Text = Previous->TokenText; + Text.size() == 1 || Text != Text.upper()) { ClassName = Previous; } } @@ -4103,7 +4106,7 @@ void UnwrappedLineParser::parseRecord(bool ParseAsExpr) { if (AngleNestingLevel == 0) { if (FormatTok->is(tok::colon)) { IsDerived = true; - } else if (FormatTok->is(tok::identifier) && + } else if (!IsDerived && FormatTok->is(tok::identifier) && FormatTok->Previous->is(tok::coloncolon)) { ClassName = FormatTok; } else if (FormatTok->is(tok::l_paren) && @@ -4160,6 +4163,8 @@ void UnwrappedLineParser::parseRecord(bool ParseAsExpr) { if (FormatTok->is(tok::l_brace)) { if (IsListInitialization()) return; + if (ClassName) + ClassName->setFinalizedType(TT_ClassHeadName); auto [OpenBraceType, ClosingBraceType] = GetBraceTypes(InitialToken); FormatTok->setFinalizedType(OpenBraceType); if (ParseAsExpr) { diff --git clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp index 079bcd93d616..c8d004163b96 100644 --- clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp +++ clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp @@ -71,7 +71,13 @@ CreateFrontendBaseAction(CompilerInstance &CI) { llvm_unreachable("CIR suppport not built into clang"); #endif case EmitHTML: return std::make_unique<HTMLPrintAction>(); - case EmitLLVM: return std::make_unique<EmitLLVMAction>(); + case EmitLLVM: { +#if CLANG_ENABLE_CIR + if (UseCIR) + return std::make_unique<cir::EmitLLVMAction>(); +#endif + return std::make_unique<EmitLLVMAction>(); + } case EmitLLVMOnly: return std::make_unique<EmitLLVMOnlyAction>(); case EmitCodeGenOnly: return std::make_unique<EmitCodeGenOnlyAction>(); case EmitObj: return std::make_unique<EmitObjAction>(); diff --git clang/lib/Headers/prfchwintrin.h clang/lib/Headers/prfchwintrin.h index eaea5f3cf8fe..8ec55d707371 100644 --- clang/lib/Headers/prfchwintrin.h +++ clang/lib/Headers/prfchwintrin.h @@ -14,6 +14,10 @@ #ifndef __PRFCHWINTRIN_H #define __PRFCHWINTRIN_H +#if defined(__cplusplus) +extern "C" { +#endif + /// Loads a memory sequence containing the specified memory address into /// all data cache levels. /// @@ -26,11 +30,7 @@ /// /// \param __P /// A pointer specifying the memory address to be prefetched. 
-static __inline__ void __attribute__((__always_inline__, __nodebug__)) -_m_prefetch(void *__P) -{ - __builtin_prefetch (__P, 0, 3 /* _MM_HINT_T0 */); -} +void _m_prefetch(void *__P); /// Loads a memory sequence containing the specified memory address into /// the L1 data cache and sets the cache-coherency state to modified. @@ -48,13 +48,10 @@ _m_prefetch(void *__P) /// /// \param __P /// A pointer specifying the memory address to be prefetched. -static __inline__ void __attribute__((__always_inline__, __nodebug__)) -_m_prefetchw(volatile const void *__P) -{ -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wcast-qual" - __builtin_prefetch ((const void*)__P, 1, 3 /* _MM_HINT_T0 */); -#pragma clang diagnostic pop -} +void _m_prefetchw(volatile const void *__P); + +#if defined(__cplusplus) +} // extern "C" +#endif #endif /* __PRFCHWINTRIN_H */ diff --git clang/lib/Headers/xmmintrin.h clang/lib/Headers/xmmintrin.h index 20e66d190113..1fb070bca827 100644 --- clang/lib/Headers/xmmintrin.h +++ clang/lib/Headers/xmmintrin.h @@ -2197,10 +2197,7 @@ _mm_storer_ps(float *__p, __m128 __a) #define _MM_HINT_T2 1 #define _MM_HINT_NTA 0 -#ifndef _MSC_VER -/* FIXME: We have to #define this because "sel" must be a constant integer, and - Sema doesn't do any form of constant propagation yet. */ - +#if 0 /// Loads one cache line of data from the specified address to a location /// closer to the processor. /// @@ -2225,6 +2222,10 @@ _mm_storer_ps(float *__p, __m128 __a) /// be generated. \n /// _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will /// be generated. +/// +/// _mm_prefetch is implemented as a "library builtin" directly in Clang, +/// similar to how it is done in MSVC. Clang will warn if the user doesn't +/// include xmmintrin.h or immintrin.h. #define _mm_prefetch(a, sel) (__builtin_prefetch((const void *)(a), \ ((sel) >> 2) & 1, (sel) & 0x3)) #endif diff --git clang/lib/Index/USRGeneration.cpp clang/lib/Index/USRGeneration.cpp index 493123459a5a..1e54b413dc59 100644 --- clang/lib/Index/USRGeneration.cpp +++ clang/lib/Index/USRGeneration.cpp @@ -763,9 +763,10 @@ void USRGenerator::VisitType(QualType T) { Out << "@BT@OCLReserveID"; break; case BuiltinType::OCLSampler: Out << "@BT@OCLSampler"; break; -#define SVE_TYPE(Name, Id, SingletonId) \ - case BuiltinType::Id: \ - Out << "@BT@" << Name; break; +#define SVE_TYPE(Name, Id, SingletonId) \ + case BuiltinType::Id: \ + Out << "@BT@" << #Name; \ + break; #include "clang/Basic/AArch64SVEACLETypes.def" #define PPC_VECTOR_TYPE(Name, Id, Size) \ case BuiltinType::Id: \ diff --git clang/lib/Parse/ParseOpenACC.cpp clang/lib/Parse/ParseOpenACC.cpp index 98fd61913e5a..d036971d2fc3 100644 --- clang/lib/Parse/ParseOpenACC.cpp +++ clang/lib/Parse/ParseOpenACC.cpp @@ -156,14 +156,14 @@ OpenACCClauseKind getOpenACCClauseKind(Token Tok) { // second part of the directive. 
 OpenACCAtomicKind getOpenACCAtomicKind(Token Tok) {
   if (!Tok.is(tok::identifier))
-    return OpenACCAtomicKind::Invalid;
+    return OpenACCAtomicKind::None;
   return llvm::StringSwitch<OpenACCAtomicKind>(
              Tok.getIdentifierInfo()->getName())
       .Case("read", OpenACCAtomicKind::Read)
       .Case("write", OpenACCAtomicKind::Write)
       .Case("update", OpenACCAtomicKind::Update)
       .Case("capture", OpenACCAtomicKind::Capture)
-      .Default(OpenACCAtomicKind::Invalid);
+      .Default(OpenACCAtomicKind::None);
 }
 
 OpenACCDefaultClauseKind getOpenACCDefaultClauseKind(Token Tok) {
@@ -398,17 +398,16 @@ OpenACCAtomicKind ParseOpenACCAtomicKind(Parser &P) {
   // #pragma acc atomic is equivalent to update:
   if (AtomicClauseToken.isAnnotation())
-    return OpenACCAtomicKind::Update;
+    return OpenACCAtomicKind::None;
 
   OpenACCAtomicKind AtomicKind = getOpenACCAtomicKind(AtomicClauseToken);
 
-  // If we don't know what this is, treat it as 'nothing', and treat the rest of
-  // this as a clause list, which, despite being invalid, is likely what the
-  // user was trying to do.
-  if (AtomicKind == OpenACCAtomicKind::Invalid)
-    return OpenACCAtomicKind::Update;
+  // If this isn't a valid atomic-kind, don't consume the token, and treat the
+  // rest as a clause list; no clauses are permitted on 'atomic', so that list
+  // will be diagnosed accordingly.
+  if (AtomicKind != OpenACCAtomicKind::None)
+    P.ConsumeToken();
 
-  P.ConsumeToken();
   return AtomicKind;
 }
@@ -570,12 +569,19 @@ void SkipUntilEndOfDirective(Parser &P) {
 bool doesDirectiveHaveAssociatedStmt(OpenACCDirectiveKind DirKind) {
   switch (DirKind) {
-  default:
+  case OpenACCDirectiveKind::Routine:
+    // FIXME: Routine MIGHT end up needing to be 'true' here, as it needs a way
+    // to capture a lambda-expression on the next line.
+  case OpenACCDirectiveKind::Cache:
+  case OpenACCDirectiveKind::Declare:
+  case OpenACCDirectiveKind::Set:
   case OpenACCDirectiveKind::EnterData:
   case OpenACCDirectiveKind::ExitData:
   case OpenACCDirectiveKind::Wait:
   case OpenACCDirectiveKind::Init:
   case OpenACCDirectiveKind::Shutdown:
+  case OpenACCDirectiveKind::Update:
+  case OpenACCDirectiveKind::Invalid:
     return false;
   case OpenACCDirectiveKind::Parallel:
   case OpenACCDirectiveKind::Serial:
@@ -586,6 +592,7 @@ bool doesDirectiveHaveAssociatedStmt(OpenACCDirectiveKind DirKind) {
   case OpenACCDirectiveKind::Loop:
   case OpenACCDirectiveKind::Data:
   case OpenACCDirectiveKind::HostData:
+  case OpenACCDirectiveKind::Atomic:
     return true;
   }
   llvm_unreachable("Unhandled directive->assoc stmt");
@@ -1428,6 +1435,7 @@ Parser::ParseOpenACCDirective() {
   SourceLocation DirLoc = getCurToken().getLocation();
   OpenACCDirectiveKind DirKind = ParseOpenACCDirectiveKind(*this);
   Parser::OpenACCWaitParseInfo WaitInfo;
+  OpenACCAtomicKind AtomicKind = OpenACCAtomicKind::None;
 
   getActions().OpenACC().ActOnConstruct(DirKind, DirLoc);
 
@@ -1435,7 +1443,7 @@
   // specifiers that need to be taken care of. Atomic has an 'atomic-clause'
   // that needs to be parsed.
   if (DirKind == OpenACCDirectiveKind::Atomic)
-    ParseOpenACCAtomicKind(*this);
+    AtomicKind = ParseOpenACCAtomicKind(*this);
 
   // We've successfully parsed the construct/directive name, however a few of
   // the constructs have optional parens that contain further details.
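For orientation, the parsing above distinguishes five spellings of the construct. The snippet below is an editor's sketch of what each spelling maps to, not a test from this patch; it assumes standard OpenACC 3.3 syntax compiled with `-fopenacc`, and all variable names are invented.

```cpp
// Illustrative only: the atomic kinds recognized by getOpenACCAtomicKind().
void atomic_kinds(int &v, int &x, int expr) {
#pragma acc atomic read
  v = x; // 'read'    -> OpenACCAtomicKind::Read
#pragma acc atomic write
  x = expr + 1; // 'write'   -> OpenACCAtomicKind::Write
#pragma acc atomic update
  x++; // 'update'  -> OpenACCAtomicKind::Update
#pragma acc atomic
  x += expr; // bare form -> OpenACCAtomicKind::None, handled like 'update'
#pragma acc atomic capture
  {
    v = x;
    x += expr;
  } // 'capture'  -> OpenACCAtomicKind::Capture
}
```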
@@ -1490,6 +1498,7 @@ Parser::ParseOpenACCDirective() { T.getCloseLocation(), /*EndLoc=*/SourceLocation{}, WaitInfo.QueuesLoc, + AtomicKind, WaitInfo.getAllExprs(), ParseOpenACCClauseList(DirKind)}; @@ -1538,11 +1547,12 @@ StmtResult Parser::ParseOpenACCDirectiveStmt() { ParseScope ACCScope(this, getOpenACCScopeFlags(DirInfo.DirKind)); AssocStmt = getActions().OpenACC().ActOnAssociatedStmt( - DirInfo.StartLoc, DirInfo.DirKind, DirInfo.Clauses, ParseStatement()); + DirInfo.StartLoc, DirInfo.DirKind, DirInfo.AtomicKind, DirInfo.Clauses, + ParseStatement()); } return getActions().OpenACC().ActOnEndStmtDirective( DirInfo.DirKind, DirInfo.StartLoc, DirInfo.DirLoc, DirInfo.LParenLoc, - DirInfo.MiscLoc, DirInfo.Exprs, DirInfo.RParenLoc, DirInfo.EndLoc, - DirInfo.Clauses, AssocStmt); + DirInfo.MiscLoc, DirInfo.Exprs, DirInfo.AtomicKind, DirInfo.RParenLoc, + DirInfo.EndLoc, DirInfo.Clauses, AssocStmt); } diff --git clang/lib/Sema/CMakeLists.txt clang/lib/Sema/CMakeLists.txt index 19cf3a2db00f..1a351684d133 100644 --- clang/lib/Sema/CMakeLists.txt +++ clang/lib/Sema/CMakeLists.txt @@ -71,6 +71,7 @@ add_clang_library(clangSema SemaObjC.cpp SemaObjCProperty.cpp SemaOpenACC.cpp + SemaOpenACCAtomic.cpp SemaOpenACCClause.cpp SemaOpenCL.cpp SemaOpenMP.cpp diff --git clang/lib/Sema/HeuristicResolver.cpp clang/lib/Sema/HeuristicResolver.cpp index 0c57250e63df..36e5b44b8b12 100644 --- clang/lib/Sema/HeuristicResolver.cpp +++ clang/lib/Sema/HeuristicResolver.cpp @@ -482,7 +482,8 @@ std::vector<const NamedDecl *> HeuristicResolverImpl::resolveDependentMember( if (!Filter(ND)) return false; if (const auto *MD = dyn_cast<CXXMethodDecl>(ND)) { - return MD->getMethodQualifiers().compatiblyIncludes(QT.getQualifiers(), + return !MD->isInstance() || + MD->getMethodQualifiers().compatiblyIncludes(QT.getQualifiers(), Ctx); } return true; diff --git clang/lib/Sema/Sema.cpp clang/lib/Sema/Sema.cpp index 9507d7602aa4..15c18f9a4525 100644 --- clang/lib/Sema/Sema.cpp +++ clang/lib/Sema/Sema.cpp @@ -478,8 +478,8 @@ void Sema::Initialize() { if (Context.getTargetInfo().hasAArch64SVETypes() || (Context.getAuxTargetInfo() && Context.getAuxTargetInfo()->hasAArch64SVETypes())) { -#define SVE_TYPE(Name, Id, SingletonId) \ - addImplicitTypedef(Name, Context.SingletonId); +#define SVE_TYPE(Name, Id, SingletonId) \ + addImplicitTypedef(#Name, Context.SingletonId); #include "clang/Basic/AArch64SVEACLETypes.def" } diff --git clang/lib/Sema/SemaCodeComplete.cpp clang/lib/Sema/SemaCodeComplete.cpp index f10f7f4768f8..80ae87e7c572 100644 --- clang/lib/Sema/SemaCodeComplete.cpp +++ clang/lib/Sema/SemaCodeComplete.cpp @@ -5796,24 +5796,11 @@ QualType getApproximateType(const Expr *E, HeuristicResolver &Resolver) { return QualType(Common, 0); } } - // A dependent member: approximate-resolve the base, then lookup. + // A dependent member: resolve using HeuristicResolver. if (const auto *CDSME = llvm::dyn_cast<CXXDependentScopeMemberExpr>(E)) { - QualType Base = CDSME->isImplicitAccess() - ? CDSME->getBaseType() - : getApproximateType(CDSME->getBase(), Resolver); - if (CDSME->isArrow() && !Base.isNull()) - Base = Base->getPointeeType(); // could handle unique_ptr etc here? - auto *RD = - Base.isNull() - ? nullptr - : llvm::dyn_cast_or_null<CXXRecordDecl>(getAsRecordDecl(Base)); - if (RD && RD->isCompleteDefinition()) { - // Look up member heuristically, including in bases. 
- for (const auto *Member : RD->lookupDependentName( - CDSME->getMember(), [](const NamedDecl *Member) { - return llvm::isa<ValueDecl>(Member); - })) { - return llvm::cast<ValueDecl>(Member)->getType().getNonReferenceType(); + for (const auto *Member : Resolver.resolveMemberExpr(CDSME)) { + if (const auto *VD = dyn_cast<ValueDecl>(Member)) { + return VD->getType().getNonReferenceType(); } } } diff --git clang/lib/Sema/SemaDecl.cpp clang/lib/Sema/SemaDecl.cpp index 3cad9827fdab..74e0fcec2d91 100644 --- clang/lib/Sema/SemaDecl.cpp +++ clang/lib/Sema/SemaDecl.cpp @@ -6681,7 +6681,10 @@ Sema::ActOnTypedefDeclarator(Scope* S, Declarator& D, DeclContext* DC, DiagnoseFunctionSpecifiers(D.getDeclSpec()); if (D.getDeclSpec().isInlineSpecified()) - Diag(D.getDeclSpec().getInlineSpecLoc(), diag::err_inline_non_function) + Diag(D.getDeclSpec().getInlineSpecLoc(), + (getLangOpts().MSVCCompat && !getLangOpts().CPlusPlus) + ? diag::warn_ms_inline_non_function + : diag::err_inline_non_function) << getLangOpts().CPlusPlus17; if (D.getDeclSpec().hasConstexprSpecifier()) Diag(D.getDeclSpec().getConstexprSpecLoc(), diag::err_invalid_constexpr) @@ -13377,6 +13380,62 @@ void Sema::checkNonTrivialCUnion(QualType QT, SourceLocation Loc, .visit(QT, nullptr, false); } +bool Sema::GloballyUniqueObjectMightBeAccidentallyDuplicated( + const VarDecl *Dcl) { + if (!getLangOpts().CPlusPlus) + return false; + + // We only need to warn if the definition is in a header file, so wait to + // diagnose until we've seen the definition. + if (!Dcl->isThisDeclarationADefinition()) + return false; + + // If an object is defined in a source file, its definition can't get + // duplicated since it will never appear in more than one TU. + if (Dcl->getASTContext().getSourceManager().isInMainFile(Dcl->getLocation())) + return false; + + // If the variable we're looking at is a static local, then we actually care + // about the properties of the function containing it. + const ValueDecl *Target = Dcl; + // VarDecls and FunctionDecls have different functions for checking + // inline-ness, so we have to do it manually. + bool TargetIsInline = Dcl->isInline(); + + // Update the Target and TargetIsInline property if necessary + if (Dcl->isStaticLocal()) { + const DeclContext *Ctx = Dcl->getDeclContext(); + if (!Ctx) + return false; + + const FunctionDecl *FunDcl = + dyn_cast_if_present<FunctionDecl>(Ctx->getNonClosureAncestor()); + if (!FunDcl) + return false; + + Target = FunDcl; + // IsInlined() checks for the C++ inline property + TargetIsInline = FunDcl->isInlined(); + } + + // Non-inline variables can only legally appear in one TU + // FIXME: This also applies to templated variables, but that can rarely lead + // to false positives so templates are disabled for now. + if (!TargetIsInline) + return false; + + // If the object isn't hidden, the dynamic linker will prevent duplication. + clang::LinkageInfo Lnk = Target->getLinkageAndVisibility(); + if (Lnk.getVisibility() != HiddenVisibility) + return false; + + // If the obj doesn't have external linkage, it's supposed to be duplicated. + if (!isExternalFormalLinkage(Lnk.getLinkage())) + return false; + + return true; +} + void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) { // If there is no declaration, there was an error parsing it. Just ignore // the initializer. 
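The new GloballyUniqueObjectMightBeAccidentallyDuplicated check above targets a linking pitfall that is easy to hit from a header. A minimal sketch of the problematic pattern (invented names, not from the patch): with `-fvisibility=hidden`, an inline variable with external linkage gets one hidden copy per shared library that includes the header, and those copies are never deduplicated by the dynamic linker.

```cpp
// my_header.h -- illustrative only.

// Mutable: each shared library gets its own hidden copy, and the copies
// silently diverge; this is what the new warning flags.
inline int GlobalCounter = 0;

// Immutable but initialized with side effects: RegisterEverything() runs
// once per copy instead of once per program.
int RegisterEverything();
inline const int RegistrationToken = RegisterEverything();

// Harmless: immutable and side-effect free, so duplication only costs space.
inline constexpr int Version = 42;
```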
@@ -14783,6 +14842,51 @@ void Sema::FinalizeDeclaration(Decl *ThisDecl) { if (DC->getRedeclContext()->isFileContext() && VD->isExternallyVisible()) AddPushedVisibilityAttribute(VD); + // If this object has external linkage and hidden visibility, it might be + // duplicated when built into a shared library, which causes problems if it's + // mutable (since the copies won't be in sync) or its initialization has side + // effects (since it will run once per copy instead of once globally) + // FIXME: Windows uses dllexport/dllimport instead of visibility, and we don't + // handle that yet. Disable the warning on Windows for now. + // FIXME: Checking templates can cause false positives if the template in + // question is never instantiated (e.g. only specialized templates are used). + if (!Context.getTargetInfo().shouldDLLImportComdatSymbols() && + !VD->isTemplated() && + GloballyUniqueObjectMightBeAccidentallyDuplicated(VD)) { + // Check mutability. For pointers, ensure that both the pointer and the + // pointee are (recursively) const. + QualType Type = VD->getType().getNonReferenceType(); + if (!Type.isConstant(VD->getASTContext())) { + Diag(VD->getLocation(), diag::warn_possible_object_duplication_mutable) + << VD; + } else { + while (Type->isPointerType()) { + Type = Type->getPointeeType(); + if (Type->isFunctionType()) + break; + if (!Type.isConstant(VD->getASTContext())) { + Diag(VD->getLocation(), + diag::warn_possible_object_duplication_mutable) + << VD; + break; + } + } + } + + // To keep false positives low, only warn if we're certain that the + // initializer has side effects. Don't warn on operator new, since a mutable + // pointer will trigger the previous warning, and an immutable pointer + // getting duplicated just results in a little extra memory usage. + const Expr *Init = VD->getAnyInitializer(); + if (Init && + Init->HasSideEffects(VD->getASTContext(), + /*IncludePossibleEffects=*/false) && + !isa<CXXNewExpr>(Init->IgnoreParenImpCasts())) { + Diag(Init->getExprLoc(), diag::warn_possible_object_duplication_init) + << VD; + } + } + // FIXME: Warn on unused var template partial specializations. if (VD->isFileVarDecl() && !isa<VarTemplatePartialSpecializationDecl>(VD)) MarkUnusedFileScopedDecl(VD); @@ -19966,7 +20070,7 @@ static void CheckForDuplicateEnumValues(Sema &S, ArrayRef<Decl *> Elements, continue; DeclOrVector& Entry = Iter->second; - if (EnumConstantDecl *D = Entry.dyn_cast<EnumConstantDecl*>()) { + if (EnumConstantDecl *D = dyn_cast<EnumConstantDecl *>(Entry)) { // Ensure constants are different. 
if (D == ECD) continue; diff --git clang/lib/Sema/SemaDeclObjC.cpp clang/lib/Sema/SemaDeclObjC.cpp index f97f17e8c965..e665d0293dc8 100644 --- clang/lib/Sema/SemaDeclObjC.cpp +++ clang/lib/Sema/SemaDeclObjC.cpp @@ -1584,7 +1584,7 @@ void SemaObjC::actOnObjCTypeArgsOrProtocolQualifiers( const char* prevSpec; // unused unsigned diagID; // unused QualType type; - if (auto *actualTypeDecl = typeDecl.dyn_cast<TypeDecl *>()) + if (auto *actualTypeDecl = dyn_cast<TypeDecl *>(typeDecl)) type = Context.getTypeDeclType(actualTypeDecl); else type = Context.getObjCInterfaceType(cast<ObjCInterfaceDecl *>(typeDecl)); diff --git clang/lib/Sema/SemaExceptionSpec.cpp clang/lib/Sema/SemaExceptionSpec.cpp index 7b08a066d3cc..77a1bbcc74e5 100644 --- clang/lib/Sema/SemaExceptionSpec.cpp +++ clang/lib/Sema/SemaExceptionSpec.cpp @@ -1425,6 +1425,7 @@ CanThrowResult Sema::canThrow(const Stmt *S) { case Stmt::OpenACCCombinedConstructClass: case Stmt::OpenACCDataConstructClass: case Stmt::OpenACCHostDataConstructClass: + case Stmt::OpenACCAtomicConstructClass: case Stmt::AttributedStmtClass: case Stmt::BreakStmtClass: case Stmt::CapturedStmtClass: diff --git clang/lib/Sema/SemaInit.cpp clang/lib/Sema/SemaInit.cpp index b95cbbf42220..f206cd57eca8 100644 --- clang/lib/Sema/SemaInit.cpp +++ clang/lib/Sema/SemaInit.cpp @@ -4573,7 +4573,9 @@ static void TryConstructorInitialization(Sema &S, CXXConstructorDecl *CtorDecl = cast<CXXConstructorDecl>(Best->Function); if (Result != OR_Deleted) { - if (!IsListInit && Kind.getKind() == InitializationKind::IK_Default && + if (!IsListInit && + (Kind.getKind() == InitializationKind::IK_Default || + Kind.getKind() == InitializationKind::IK_Direct) && DestRecordDecl != nullptr && DestRecordDecl->isAggregate() && DestRecordDecl->hasUninitializedExplicitInitFields()) { S.Diag(Kind.getLocation(), diag::warn_field_requires_explicit_init) @@ -9146,6 +9148,17 @@ bool InitializationSequence::Diagnose(Sema &S, << (Msg ? Msg->getString() : StringRef()) << ArgsRange; } + // If it's a default constructed member, but it's not in the + // constructor's initializer list, explicitly note where the member is + // declared so the user can see which member is erroneously initialized + // with a deleted default constructor. + if (Kind.getKind() == InitializationKind::IK_Default && + (Entity.getKind() == InitializedEntity::EK_Member || + Entity.getKind() == InitializedEntity::EK_ParenAggInitMember)) { + S.Diag(Entity.getDecl()->getLocation(), + diag::note_default_constructed_field) + << Entity.getDecl(); + } S.NoteDeletedFunction(Best->Function); break; } diff --git clang/lib/Sema/SemaOpenACC.cpp clang/lib/Sema/SemaOpenACC.cpp index f5edc0ed36a9..2d2f8ddf4652 100644 --- clang/lib/Sema/SemaOpenACC.cpp +++ clang/lib/Sema/SemaOpenACC.cpp @@ -30,18 +30,23 @@ bool diagnoseConstructAppertainment(SemaOpenACC &S, OpenACCDirectiveKind K, // Nothing to do here, both invalid and unimplemented don't really need to // do anything. 
break; - case OpenACCDirectiveKind::ParallelLoop: - case OpenACCDirectiveKind::SerialLoop: - case OpenACCDirectiveKind::KernelsLoop: case OpenACCDirectiveKind::Parallel: + case OpenACCDirectiveKind::ParallelLoop: case OpenACCDirectiveKind::Serial: + case OpenACCDirectiveKind::SerialLoop: case OpenACCDirectiveKind::Kernels: + case OpenACCDirectiveKind::KernelsLoop: case OpenACCDirectiveKind::Loop: case OpenACCDirectiveKind::Data: case OpenACCDirectiveKind::EnterData: case OpenACCDirectiveKind::ExitData: case OpenACCDirectiveKind::HostData: case OpenACCDirectiveKind::Wait: + case OpenACCDirectiveKind::Update: + case OpenACCDirectiveKind::Init: + case OpenACCDirectiveKind::Shutdown: + case OpenACCDirectiveKind::Cache: + case OpenACCDirectiveKind::Atomic: if (!IsStmt) return S.Diag(StartLoc, diag::err_acc_construct_appertainment) << K; break; @@ -73,6 +78,7 @@ bool PreserveLoopRAIIDepthInAssociatedStmtRAII(OpenACCDirectiveKind DK) { return false; case OpenACCDirectiveKind::Data: case OpenACCDirectiveKind::HostData: + case OpenACCDirectiveKind::Atomic: return true; case OpenACCDirectiveKind::EnterData: case OpenACCDirectiveKind::ExitData: @@ -327,6 +333,7 @@ void SemaOpenACC::ActOnConstruct(OpenACCDirectiveKind K, case OpenACCDirectiveKind::Shutdown: case OpenACCDirectiveKind::Set: case OpenACCDirectiveKind::Update: + case OpenACCDirectiveKind::Atomic: // Nothing to do here, there is no real legalization that needs to happen // here as these constructs do not take any arguments. break; @@ -1518,8 +1525,9 @@ bool SemaOpenACC::ActOnStartStmtDirective( StmtResult SemaOpenACC::ActOnEndStmtDirective( OpenACCDirectiveKind K, SourceLocation StartLoc, SourceLocation DirLoc, SourceLocation LParenLoc, SourceLocation MiscLoc, ArrayRef<Expr *> Exprs, - SourceLocation RParenLoc, SourceLocation EndLoc, - ArrayRef<OpenACCClause *> Clauses, StmtResult AssocStmt) { + OpenACCAtomicKind AtomicKind, SourceLocation RParenLoc, + SourceLocation EndLoc, ArrayRef<OpenACCClause *> Clauses, + StmtResult AssocStmt) { switch (K) { default: return StmtEmpty(); @@ -1583,13 +1591,20 @@ StmtResult SemaOpenACC::ActOnEndStmtDirective( return OpenACCUpdateConstruct::Create(getASTContext(), StartLoc, DirLoc, EndLoc, Clauses); } + case OpenACCDirectiveKind::Atomic: { + assert(Clauses.empty() && "Atomic doesn't allow clauses"); + return OpenACCAtomicConstruct::Create( + getASTContext(), StartLoc, DirLoc, AtomicKind, EndLoc, + AssocStmt.isUsable() ? 
AssocStmt.get() : nullptr);
+  }
   }
   llvm_unreachable("Unhandled case in directive handling?");
 }
 
 StmtResult SemaOpenACC::ActOnAssociatedStmt(
     SourceLocation DirectiveLoc, OpenACCDirectiveKind K,
-    ArrayRef<const OpenACCClause *> Clauses, StmtResult AssocStmt) {
+    OpenACCAtomicKind AtKind, ArrayRef<const OpenACCClause *> Clauses,
+    StmtResult AssocStmt) {
   switch (K) {
   default:
     llvm_unreachable("Unimplemented associated statement application");
@@ -1601,6 +1616,8 @@ StmtResult SemaOpenACC::ActOnAssociatedStmt(
   case OpenACCDirectiveKind::Set:
     llvm_unreachable(
         "these don't have associated statements, so shouldn't get here");
+  case OpenACCDirectiveKind::Atomic:
+    return CheckAtomicAssociatedStmt(DirectiveLoc, AtKind, AssocStmt);
   case OpenACCDirectiveKind::Parallel:
   case OpenACCDirectiveKind::Serial:
   case OpenACCDirectiveKind::Kernels:
diff --git clang/lib/Sema/SemaOpenACCAtomic.cpp clang/lib/Sema/SemaOpenACCAtomic.cpp
new file mode 100644
index 000000000000..68cf338c0711
--- /dev/null
+++ clang/lib/Sema/SemaOpenACCAtomic.cpp
@@ -0,0 +1,736 @@
+//== SemaOpenACCAtomic.cpp - Semantic Analysis for OpenACC Atomic Construct===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements semantic analysis for the OpenACC atomic construct.
+///
+//===----------------------------------------------------------------------===//
+
+#include "clang/AST/ExprCXX.h"
+#include "clang/Basic/DiagnosticSema.h"
+#include "clang/Sema/SemaOpenACC.h"
+
+#include <optional>
+#include <variant>
+
+using namespace clang;
+
+namespace {
+
+class AtomicOperandChecker {
+  SemaOpenACC &SemaRef;
+  OpenACCAtomicKind AtKind;
+  SourceLocation AtomicDirLoc;
+  StmtResult AssocStmt;
+
+  // Emit a diagnostic, which sets the correct error, then display the passed
+  // note.
+  bool DiagnoseInvalidAtomic(SourceLocation Loc, PartialDiagnostic NoteDiag) {
+    SemaRef.Diag(AtomicDirLoc, diag::err_acc_invalid_atomic)
+        << (AtKind != OpenACCAtomicKind::None) << AtKind;
+    SemaRef.Diag(Loc, NoteDiag);
+    return true;
+  }
+
+  // Create a replacement recovery expr in case we find an error here. This
+  // allows us to ignore this during template instantiation so we only get a
+  // single error.
+  StmtResult getRecoveryExpr() {
+    if (!AssocStmt.isUsable())
+      return AssocStmt;
+
+    if (!SemaRef.getASTContext().getLangOpts().RecoveryAST)
+      return StmtError();
+
+    Expr *E = dyn_cast<Expr>(AssocStmt.get());
+    QualType T = E ? E->getType() : SemaRef.getASTContext().DependentTy;
+
+    return RecoveryExpr::Create(SemaRef.getASTContext(), T,
+                                AssocStmt.get()->getBeginLoc(),
+                                AssocStmt.get()->getEndLoc(),
+                                E ? ArrayRef<Expr *>{E} : ArrayRef<Expr *>{});
+  }
+
+  // OpenACC 3.3 2.12: 'expr' is an expression with scalar type.
+  bool CheckOperandExpr(const Expr *E, PartialDiagnostic PD) {
+    QualType ExprTy = E->getType();
+
+    // Scalar allowed, plus we allow instantiation dependent to support
+    // templates.
+    if (ExprTy->isInstantiationDependentType() || ExprTy->isScalarType())
+      return false;
+
+    return DiagnoseInvalidAtomic(E->getExprLoc(),
+                                 PD << diag::OACCLValScalar::Scalar << ExprTy);
+  }
+
+  // OpenACC 3.3 2.12: 'x' and 'v' (as applicable) are both l-value expressions
+  // with scalar type.
+  bool CheckOperandVariable(const Expr *E, PartialDiagnostic PD) {
+    if (CheckOperandExpr(E, PD))
+      return true;
+
+    if (E->isLValue())
+      return false;
+
+    return DiagnoseInvalidAtomic(E->getExprLoc(),
+                                 PD << diag::OACCLValScalar::LVal);
+  }
+
+  Expr *RequireExpr(Stmt *Stmt, PartialDiagnostic ExpectedNote) {
+    if (Expr *E = dyn_cast<Expr>(Stmt))
+      return E->IgnoreImpCasts();
+
+    DiagnoseInvalidAtomic(Stmt->getBeginLoc(), ExpectedNote);
+    return nullptr;
+  }
+
+  // A struct to hold the inner components of any operands, which allows for
+  // compound checking.
+  struct BinaryOpInfo {
+    const Expr *FoundExpr = nullptr;
+    const Expr *LHS = nullptr;
+    const Expr *RHS = nullptr;
+    BinaryOperatorKind Operator;
+  };
+
+  struct UnaryOpInfo {
+    const Expr *FoundExpr = nullptr;
+    const Expr *SubExpr = nullptr;
+    UnaryOperatorKind Operator;
+
+    bool IsIncrementOp() {
+      return Operator == UO_PostInc || Operator == UO_PreInc;
+    }
+  };
+
+  std::optional<UnaryOpInfo> GetUnaryOperatorInfo(const Expr *E) {
+    // If this is a simple unary operator, just return its details.
+    if (const auto *UO = dyn_cast<UnaryOperator>(E))
+      return UnaryOpInfo{UO, UO->getSubExpr()->IgnoreImpCasts(),
+                         UO->getOpcode()};
+
+    // This might be an overloaded operator or a dependent context, so make sure
+    // we can get as many details out of this as we can.
+    if (const auto *OpCall = dyn_cast<CXXOperatorCallExpr>(E)) {
+      UnaryOpInfo Inf;
+      Inf.FoundExpr = OpCall;
+
+      switch (OpCall->getOperator()) {
+      default:
+        return std::nullopt;
+      case OO_PlusPlus:
+        Inf.Operator = OpCall->getNumArgs() == 1 ? UO_PreInc : UO_PostInc;
+        break;
+      case OO_MinusMinus:
+        Inf.Operator = OpCall->getNumArgs() == 1 ? UO_PreDec : UO_PostDec;
+        break;
+      case OO_Amp:
+        Inf.Operator = UO_AddrOf;
+        break;
+      case OO_Star:
+        Inf.Operator = UO_Deref;
+        break;
+      case OO_Plus:
+        Inf.Operator = UO_Plus;
+        break;
+      case OO_Minus:
+        Inf.Operator = UO_Minus;
+        break;
+      case OO_Tilde:
+        Inf.Operator = UO_Not;
+        break;
+      case OO_Exclaim:
+        Inf.Operator = UO_LNot;
+        break;
+      case OO_Coawait:
+        Inf.Operator = UO_Coawait;
+        break;
+      }
+
+      // Some of the above can be both binary and unary operations, so make sure
+      // we get the right one.
+      if (Inf.Operator != UO_PostInc && Inf.Operator != UO_PostDec &&
+          OpCall->getNumArgs() != 1)
+        return std::nullopt;
+
+      Inf.SubExpr = OpCall->getArg(0);
+      return Inf;
+    }
+    return std::nullopt;
+  }
+
+  // Get a normalized version of a binary operator.
+  std::optional<BinaryOpInfo> GetBinaryOperatorInfo(const Expr *E) {
+    if (const auto *BO = dyn_cast<BinaryOperator>(E))
+      return BinaryOpInfo{BO, BO->getLHS()->IgnoreImpCasts(),
+                          BO->getRHS()->IgnoreImpCasts(), BO->getOpcode()};
+
+    // In case this is an operator-call, which allows us to support overloaded
+    // operators and dependent expressions.
+    if (const auto *OpCall = dyn_cast<CXXOperatorCallExpr>(E)) {
+      BinaryOpInfo Inf;
+      Inf.FoundExpr = OpCall;
+
+      switch (OpCall->getOperator()) {
+      default:
+        return std::nullopt;
+      case OO_Plus:
+        Inf.Operator = BO_Add;
+        break;
+      case OO_Minus:
+        Inf.Operator = BO_Sub;
+        break;
+      case OO_Star:
+        Inf.Operator = BO_Mul;
+        break;
+      case OO_Slash:
+        Inf.Operator = BO_Div;
+        break;
+      case OO_Percent:
+        Inf.Operator = BO_Rem;
+        break;
+      case OO_Caret:
+        Inf.Operator = BO_Xor;
+        break;
+      case OO_Amp:
+        Inf.Operator = BO_And;
+        break;
+      case OO_Pipe:
+        Inf.Operator = BO_Or;
+        break;
+      case OO_Equal:
+        Inf.Operator = BO_Assign;
+        break;
+      case OO_Spaceship:
+        Inf.Operator = BO_Cmp;
+        break;
+      case OO_Less:
+        Inf.Operator = BO_LT;
+        break;
+      case OO_Greater:
+        Inf.Operator = BO_GT;
+        break;
+      case OO_PlusEqual:
+        Inf.Operator = BO_AddAssign;
+        break;
+      case OO_MinusEqual:
+        Inf.Operator = BO_SubAssign;
+        break;
+      case OO_StarEqual:
+        Inf.Operator = BO_MulAssign;
+        break;
+      case OO_SlashEqual:
+        Inf.Operator = BO_DivAssign;
+        break;
+      case OO_PercentEqual:
+        Inf.Operator = BO_RemAssign;
+        break;
+      case OO_CaretEqual:
+        Inf.Operator = BO_XorAssign;
+        break;
+      case OO_AmpEqual:
+        Inf.Operator = BO_AndAssign;
+        break;
+      case OO_PipeEqual:
+        Inf.Operator = BO_OrAssign;
+        break;
+      case OO_LessLess:
+        Inf.Operator = BO_Shl;
+        break;
+      case OO_GreaterGreater:
+        Inf.Operator = BO_Shr;
+        break;
+      case OO_LessLessEqual:
+        Inf.Operator = BO_ShlAssign;
+        break;
+      case OO_GreaterGreaterEqual:
+        Inf.Operator = BO_ShrAssign;
+        break;
+      case OO_EqualEqual:
+        Inf.Operator = BO_EQ;
+        break;
+      case OO_ExclaimEqual:
+        Inf.Operator = BO_NE;
+        break;
+      case OO_LessEqual:
+        Inf.Operator = BO_LE;
+        break;
+      case OO_GreaterEqual:
+        Inf.Operator = BO_GE;
+        break;
+      case OO_AmpAmp:
+        Inf.Operator = BO_LAnd;
+        break;
+      case OO_PipePipe:
+        Inf.Operator = BO_LOr;
+        break;
+      case OO_Comma:
+        Inf.Operator = BO_Comma;
+        break;
+      case OO_ArrowStar:
+        Inf.Operator = BO_PtrMemI;
+        break;
+      }
+
+      // This isn't a binary operator unless there are two arguments.
+      if (OpCall->getNumArgs() != 2)
+        return std::nullopt;
+
+      // Callee is the call-operator, so we only need to extract the two
+      // arguments here.
+      Inf.LHS = OpCall->getArg(0)->IgnoreImpCasts();
+      Inf.RHS = OpCall->getArg(1)->IgnoreImpCasts();
+      return Inf;
+    }
+
+    return std::nullopt;
+  }
+
+  // Checks a required assignment operation, but doesn't check the LHS or RHS;
+  // callers have to do that themselves.
+  std::optional<BinaryOpInfo> CheckAssignment(const Expr *E) {
+    std::optional<BinaryOpInfo> Inf = GetBinaryOperatorInfo(E);
+
+    if (!Inf) {
+      DiagnoseInvalidAtomic(E->getExprLoc(),
+                            SemaRef.PDiag(diag::note_acc_atomic_expr_must_be)
+                                << diag::OACCAtomicExpr::Assign);
+      return std::nullopt;
+    }
+
+    if (Inf->Operator != BO_Assign) {
+      DiagnoseInvalidAtomic(Inf->FoundExpr->getExprLoc(),
+                            SemaRef.PDiag(diag::note_acc_atomic_expr_must_be)
+                                << diag::OACCAtomicExpr::Assign);
+      return std::nullopt;
+    }
+
+    // Assignment always requires an lvalue/scalar on the LHS.
+    if (CheckOperandVariable(
+            Inf->LHS, SemaRef.PDiag(diag::note_acc_atomic_operand_lvalue_scalar)
+                          << /*left=*/0 << diag::OACCAtomicOpKind::Assign))
+      return std::nullopt;
+
+    return Inf;
+  }
+
+  struct IDACInfo {
+    bool Failed = false;
+    enum ExprKindTy {
+      Invalid,
+      // increment/decrement ops.
+      Unary,
+      // v = x
+      SimpleAssign,
+      // x = expr
+      ExprAssign,
+      // x binop= expr
+      CompoundAssign,
+      // x = x binop expr
+      // x = expr binop x
+      AssignBinOp
+    } ExprKind;
+
+    // The variable referred to as 'x' in the grammar; compound-statement
+    // checking of 'capture' needs it to compare the 'x' named by the two
+    // expressions.
+    const Expr *X_Var = nullptr;
+
+    static IDACInfo Fail() { return IDACInfo{true, Invalid, nullptr}; };
+  };
+
+  // Helper for CheckIncDecAssignCompoundAssign, does the checks for inc/dec.
+  IDACInfo CheckIncDec(UnaryOpInfo Inf) {
+
+    if (!UnaryOperator::isIncrementDecrementOp(Inf.Operator)) {
+      DiagnoseInvalidAtomic(
+          Inf.FoundExpr->getExprLoc(),
+          SemaRef.PDiag(diag::note_acc_atomic_unsupported_unary_operator));
+      return IDACInfo::Fail();
+    }
+    bool Failed = CheckOperandVariable(
+        Inf.SubExpr,
+        SemaRef.PDiag(diag::note_acc_atomic_operand_lvalue_scalar)
+            << /*none=*/2
+            << (Inf.IsIncrementOp() ? diag::OACCAtomicOpKind::Inc
+                                    : diag::OACCAtomicOpKind::Dec));
+    // For increment/decrements, the subexpr is the 'x' (x++, ++x, etc).
+    return IDACInfo{Failed, IDACInfo::Unary, Inf.SubExpr};
+  }
+
+  enum class SimpleAssignKind { None, Var, Expr };
+
+  // Check an assignment, and ensure the RHS is either x binop expr or expr
+  // binop x.
+  // If SAK is not None, also allows v = x;
+  IDACInfo CheckAssignmentWithBinOpOnRHS(BinaryOpInfo AssignInf,
+                                         SimpleAssignKind SAK) {
+    PartialDiagnostic PD =
+        SemaRef.PDiag(diag::note_acc_atomic_operand_lvalue_scalar)
+        << /*left=*/0 << diag::OACCAtomicOpKind::Assign;
+    if (CheckOperandVariable(AssignInf.LHS, PD))
+      return IDACInfo::Fail();
+
+    std::optional<BinaryOpInfo> BinInf = GetBinaryOperatorInfo(AssignInf.RHS);
+
+    if (!BinInf) {
+
+      // Capture in a compound statement allows v = x assignment. So make sure
+      // we permit that here.
+      if (SAK != SimpleAssignKind::None) {
+        PartialDiagnostic PD =
+            SemaRef.PDiag(diag::note_acc_atomic_operand_lvalue_scalar)
+            << /*right=*/1 << diag::OACCAtomicOpKind::Assign;
+        if (SAK == SimpleAssignKind::Var) {
+          // In the var version, everywhere we allow v = x;, X is the RHS.
+          return IDACInfo{CheckOperandVariable(AssignInf.RHS, PD),
+                          IDACInfo::SimpleAssign, AssignInf.RHS};
+        }
+        assert(SAK == SimpleAssignKind::Expr);
+        // In the expression version, supported by v=x; x = expr;, we need to
+        // set to the LHS here.
+        return IDACInfo{CheckOperandExpr(AssignInf.RHS, PD),
+                        IDACInfo::ExprAssign, AssignInf.LHS};
+      }
+
+      DiagnoseInvalidAtomic(
+          AssignInf.RHS->getExprLoc(),
+          SemaRef.PDiag(diag::note_acc_atomic_expected_binop));
+
+      return IDACInfo::Fail();
+    }
+    switch (BinInf->Operator) {
+    default:
+      DiagnoseInvalidAtomic(
+          BinInf->FoundExpr->getExprLoc(),
+          SemaRef.PDiag(diag::note_acc_atomic_unsupported_binary_operator));
+      return IDACInfo::Fail();
+    // binop is one of +, *, -, /, &, ^, |, <<, or >>
+    case BO_Add:
+    case BO_Mul:
+    case BO_Sub:
+    case BO_Div:
+    case BO_And:
+    case BO_Xor:
+    case BO_Or:
+    case BO_Shl:
+    case BO_Shr:
+      // Handle these outside of the switch.
+      break;
+    }
+
+    llvm::FoldingSetNodeID LHS_ID, InnerLHS_ID, InnerRHS_ID;
+    AssignInf.LHS->Profile(LHS_ID, SemaRef.getASTContext(),
+                           /*Canonical=*/true);
+    BinInf->LHS->Profile(InnerLHS_ID, SemaRef.getASTContext(),
+                         /*Canonical=*/true);
+
+    // This is X = X binop expr;
+    // Check the RHS is an expression.
+    if (LHS_ID == InnerLHS_ID)
+      return IDACInfo{
+          CheckOperandExpr(
+              BinInf->RHS,
+              SemaRef.PDiag(diag::note_acc_atomic_operand_lvalue_scalar)
+                  << /*right=*/1
+                  << diag::OACCAtomicOpKind::CompoundAssign),
+          IDACInfo::AssignBinOp, AssignInf.LHS};
+
+    BinInf->RHS->Profile(InnerRHS_ID, SemaRef.getASTContext(),
+                         /*Canonical=*/true);
+    // This is X = expr binop X;
+    // Check the LHS is an expression
+    if (LHS_ID == InnerRHS_ID)
+      return IDACInfo{
+          CheckOperandExpr(
+              BinInf->LHS,
+              SemaRef.PDiag(diag::note_acc_atomic_operand_lvalue_scalar)
+                  << /*left=*/0 << diag::OACCAtomicOpKind::CompoundAssign),
+          IDACInfo::AssignBinOp, AssignInf.LHS};
+
+    // If nothing matches, error out.
+    DiagnoseInvalidAtomic(BinInf->FoundExpr->getExprLoc(),
+                          SemaRef.PDiag(diag::note_acc_atomic_mismatch_operand)
+                              << const_cast<Expr *>(AssignInf.LHS)
+                              << const_cast<Expr *>(BinInf->LHS)
+                              << const_cast<Expr *>(BinInf->RHS));
+    return IDACInfo::Fail();
+  }
+
+  // Ensures that the expression is an increment/decrement, an assignment, or a
+  // compound assignment. If it's an assignment, allows the x = x binop expr and
+  // x = expr binop x syntax. If it is a compound assignment, allows any expr on
+  // the RHS.
+  IDACInfo CheckIncDecAssignCompoundAssign(const Expr *E,
+                                           SimpleAssignKind SAK) {
+    std::optional<UnaryOpInfo> UInf = GetUnaryOperatorInfo(E);
+
+    // If this is a unary operator, only increment/decrement are allowed, so
+    // get the unary operator info, then check everything we can.
+    if (UInf)
+      return CheckIncDec(*UInf);
+
+    std::optional<BinaryOpInfo> BinInf = GetBinaryOperatorInfo(E);
+
+    // Unary or binary operator were the only choices, so error here.
+    if (!BinInf) {
+      DiagnoseInvalidAtomic(E->getExprLoc(),
+                            SemaRef.PDiag(diag::note_acc_atomic_expr_must_be)
+                                << diag::OACCAtomicExpr::UnaryCompAssign);
+      return IDACInfo::Fail();
+    }
+
+    switch (BinInf->Operator) {
+    default:
+      DiagnoseInvalidAtomic(
+          BinInf->FoundExpr->getExprLoc(),
+          SemaRef.PDiag(
+              diag::note_acc_atomic_unsupported_compound_binary_operator));
+      return IDACInfo::Fail();
+    case BO_Assign:
+      return CheckAssignmentWithBinOpOnRHS(*BinInf, SAK);
+    case BO_AddAssign:
+    case BO_MulAssign:
+    case BO_SubAssign:
+    case BO_DivAssign:
+    case BO_AndAssign:
+    case BO_XorAssign:
+    case BO_OrAssign:
+    case BO_ShlAssign:
+    case BO_ShrAssign: {
+      PartialDiagnostic LPD =
+          SemaRef.PDiag(diag::note_acc_atomic_operand_lvalue_scalar)
+          << /*left=*/0 << diag::OACCAtomicOpKind::CompoundAssign;
+      PartialDiagnostic RPD =
+          SemaRef.PDiag(diag::note_acc_atomic_operand_lvalue_scalar)
+          << /*right=*/1 << diag::OACCAtomicOpKind::CompoundAssign;
+      // Nothing to do other than check the variable expressions and record
+      // success or failure.
+      bool Failed = CheckOperandVariable(BinInf->LHS, LPD) ||
+                    CheckOperandExpr(BinInf->RHS, RPD);
+
+      return IDACInfo{Failed, IDACInfo::CompoundAssign, BinInf->LHS};
+    }
+    }
+    llvm_unreachable("all binary operator kinds should be checked above");
+  }
+
+  StmtResult CheckRead() {
+    Expr *AssocExpr = RequireExpr(
+        AssocStmt.get(), SemaRef.PDiag(diag::note_acc_atomic_expr_must_be)
+                             << diag::OACCAtomicExpr::Assign);
+
+    if (!AssocExpr)
+      return getRecoveryExpr();
+
+    std::optional<BinaryOpInfo> AssignRes = CheckAssignment(AssocExpr);
+    if (!AssignRes)
+      return getRecoveryExpr();
+
+    PartialDiagnostic PD =
+        SemaRef.PDiag(diag::note_acc_atomic_operand_lvalue_scalar)
+        << /*right=*/1 << diag::OACCAtomicOpKind::Assign;
+
+    // Finally, check the RHS.
+    if (CheckOperandVariable(AssignRes->RHS, PD))
+      return getRecoveryExpr();
+
+    return AssocStmt;
+  }
+
+  StmtResult CheckWrite() {
+    Expr *AssocExpr = RequireExpr(
+        AssocStmt.get(), SemaRef.PDiag(diag::note_acc_atomic_expr_must_be)
+                             << diag::OACCAtomicExpr::Assign);
+
+    if (!AssocExpr)
+      return getRecoveryExpr();
+
+    std::optional<BinaryOpInfo> AssignRes = CheckAssignment(AssocExpr);
+    if (!AssignRes)
+      return getRecoveryExpr();
+
+    PartialDiagnostic PD =
+        SemaRef.PDiag(diag::note_acc_atomic_operand_lvalue_scalar)
+        << /*right=*/1 << diag::OACCAtomicOpKind::Assign;
+
+    // Finally, check the RHS.
+    if (CheckOperandExpr(AssignRes->RHS, PD))
+      return getRecoveryExpr();
+
+    return AssocStmt;
+  }
+
+  StmtResult CheckUpdate() {
+    Expr *AssocExpr = RequireExpr(
+        AssocStmt.get(), SemaRef.PDiag(diag::note_acc_atomic_expr_must_be)
+                             << diag::OACCAtomicExpr::UnaryCompAssign);
+
+    if (!AssocExpr ||
+        CheckIncDecAssignCompoundAssign(AssocExpr, SimpleAssignKind::None)
+            .Failed)
+      return getRecoveryExpr();
+
+    return AssocStmt;
+  }
+
+  bool CheckVarRefsSame(IDACInfo::ExprKindTy FirstKind, const Expr *FirstX,
+                        IDACInfo::ExprKindTy SecondKind, const Expr *SecondX) {
+    llvm::FoldingSetNodeID First_ID, Second_ID;
+    FirstX->Profile(First_ID, SemaRef.getASTContext(), /*Canonical=*/true);
+    SecondX->Profile(Second_ID, SemaRef.getASTContext(), /*Canonical=*/true);
+
+    if (First_ID == Second_ID)
+      return false;
+
+    PartialDiagnostic PD =
+        SemaRef.PDiag(diag::note_acc_atomic_mismatch_compound_operand)
+        << FirstKind << const_cast<Expr *>(FirstX) << SecondKind
+        << const_cast<Expr *>(SecondX);
+
+    return DiagnoseInvalidAtomic(SecondX->getExprLoc(), PD);
+  }
+
+  StmtResult CheckCapture() {
+    if (const auto *CmpdStmt = dyn_cast<CompoundStmt>(AssocStmt.get())) {
+      auto *const *BodyItr = CmpdStmt->body().begin();
+      PartialDiagnostic PD = SemaRef.PDiag(diag::note_acc_atomic_expr_must_be)
+                             << diag::OACCAtomicExpr::UnaryCompAssign;
+      // If we don't have at least 1 statement, error.
+      if (BodyItr == CmpdStmt->body().end()) {
+        DiagnoseInvalidAtomic(CmpdStmt->getBeginLoc(), PD);
+        return getRecoveryExpr();
+      }
+
+      // First Expr can be inc/dec, assign, or compound assign.
+      Expr *FirstExpr = RequireExpr(*BodyItr, PD);
+      if (!FirstExpr)
+        return getRecoveryExpr();
+
+      IDACInfo FirstExprResults =
+          CheckIncDecAssignCompoundAssign(FirstExpr, SimpleAssignKind::Var);
+      if (FirstExprResults.Failed)
+        return getRecoveryExpr();
+
+      ++BodyItr;
+
+      // If we don't have a second statement, error.
+      if (BodyItr == CmpdStmt->body().end()) {
+        DiagnoseInvalidAtomic(CmpdStmt->getEndLoc(), PD);
+        return getRecoveryExpr();
+      }
+
+      Expr *SecondExpr = RequireExpr(*BodyItr, PD);
+      if (!SecondExpr)
+        return getRecoveryExpr();
+
+      assert(FirstExprResults.ExprKind != IDACInfo::Invalid);
+
+      switch (FirstExprResults.ExprKind) {
+      case IDACInfo::Invalid:
+      case IDACInfo::ExprAssign:
+        llvm_unreachable("Should have errored out by now");
+      case IDACInfo::Unary:
+      case IDACInfo::CompoundAssign:
+      case IDACInfo::AssignBinOp: {
+        // Everything but simple-assign can only be followed by a simple
+        // assignment.
+ std::optional<BinaryOpInfo> AssignRes = CheckAssignment(SecondExpr); + if (!AssignRes) + return getRecoveryExpr(); + + PartialDiagnostic PD = + SemaRef.PDiag(diag::note_acc_atomic_operand_lvalue_scalar) + << /*right=*/1 << diag::OACCAtomicOpKind::Assign; + + if (CheckOperandVariable(AssignRes->RHS, PD)) + return getRecoveryExpr(); + + if (CheckVarRefsSame(FirstExprResults.ExprKind, FirstExprResults.X_Var, + IDACInfo::SimpleAssign, AssignRes->RHS)) + return getRecoveryExpr(); + break; + } + case IDACInfo::SimpleAssign: { + // If the first was v = x, anything but simple expression is allowed. + IDACInfo SecondExprResults = + CheckIncDecAssignCompoundAssign(SecondExpr, SimpleAssignKind::Expr); + if (SecondExprResults.Failed) + return getRecoveryExpr(); + + if (CheckVarRefsSame(FirstExprResults.ExprKind, FirstExprResults.X_Var, + SecondExprResults.ExprKind, + SecondExprResults.X_Var)) + return getRecoveryExpr(); + break; + } + } + ++BodyItr; + if (BodyItr != CmpdStmt->body().end()) { + DiagnoseInvalidAtomic( + (*BodyItr)->getBeginLoc(), + SemaRef.PDiag(diag::note_acc_atomic_too_many_stmts)); + return getRecoveryExpr(); + } + } else { + // This check doesn't need to happen if it is a compound stmt. + Expr *AssocExpr = RequireExpr( + AssocStmt.get(), SemaRef.PDiag(diag::note_acc_atomic_expr_must_be) + << diag::OACCAtomicExpr::Assign); + if (!AssocExpr) + return getRecoveryExpr(); + + // First, we require an assignment. + std::optional<BinaryOpInfo> AssignRes = CheckAssignment(AssocExpr); + + if (!AssignRes) + return getRecoveryExpr(); + + if (CheckIncDecAssignCompoundAssign(AssignRes->RHS, + SimpleAssignKind::None) + .Failed) + return getRecoveryExpr(); + } + + return AssocStmt; + } + +public: + AtomicOperandChecker(SemaOpenACC &S, OpenACCAtomicKind AtKind, + SourceLocation DirLoc, StmtResult AssocStmt) + : SemaRef(S), AtKind(AtKind), AtomicDirLoc(DirLoc), AssocStmt(AssocStmt) { + } + + StmtResult Check() { + + switch (AtKind) { + case OpenACCAtomicKind::Read: + return CheckRead(); + case OpenACCAtomicKind::Write: + return CheckWrite(); + case OpenACCAtomicKind::None: + case OpenACCAtomicKind::Update: + return CheckUpdate(); + case OpenACCAtomicKind::Capture: + return CheckCapture(); + } + llvm_unreachable("Unhandled atomic kind?"); + } +}; +} // namespace + +StmtResult SemaOpenACC::CheckAtomicAssociatedStmt(SourceLocation AtomicDirLoc, + OpenACCAtomicKind AtKind, + StmtResult AssocStmt) { + if (!AssocStmt.isUsable()) + return AssocStmt; + + if (isa<RecoveryExpr>(AssocStmt.get())) + return AssocStmt; + + AtomicOperandChecker Checker{*this, AtKind, AtomicDirLoc, AssocStmt}; + return Checker.Check(); +} diff --git clang/lib/Sema/SemaOpenACCClause.cpp clang/lib/Sema/SemaOpenACCClause.cpp index 000934225402..1e74f126c31c 100644 --- clang/lib/Sema/SemaOpenACCClause.cpp +++ clang/lib/Sema/SemaOpenACCClause.cpp @@ -589,7 +589,6 @@ bool checkValidAfterDeviceType( // construct has been implemented. 
bool isDirectiveKindImplemented(OpenACCDirectiveKind DK) { return DK != OpenACCDirectiveKind::Declare && - DK != OpenACCDirectiveKind::Atomic && DK != OpenACCDirectiveKind::Routine; } diff --git clang/lib/Sema/SemaTemplateDeduction.cpp clang/lib/Sema/SemaTemplateDeduction.cpp index 6aaf86a6a6ff..137942f0c30b 100644 --- clang/lib/Sema/SemaTemplateDeduction.cpp +++ clang/lib/Sema/SemaTemplateDeduction.cpp @@ -4074,7 +4074,22 @@ TemplateDeductionResult Sema::FinishTemplateArgumentDeduction( if (FunctionTemplate->getFriendObjectKind()) Owner = FunctionTemplate->getLexicalDeclContext(); FunctionDecl *FD = FunctionTemplate->getTemplatedDecl(); - + // additional check for inline friend, + // ``` + // template <class F1> int foo(F1 X); + // template <int A1> struct A { + // template <class F1> friend int foo(F1 X) { return A1; } + // }; + // template struct A<1>; + // int a = foo(1.0); + // ``` + const FunctionDecl *FDFriend; + if (FD->getFriendObjectKind() == Decl::FriendObjectKind::FOK_None && + FD->isDefined(FDFriend, /*CheckForPendingFriendDefinition*/ true) && + FDFriend->getFriendObjectKind() != Decl::FriendObjectKind::FOK_None) { + FD = const_cast<FunctionDecl *>(FDFriend); + Owner = FD->getLexicalDeclContext(); + } MultiLevelTemplateArgumentList SubstArgs( FunctionTemplate, CanonicalDeducedArgumentList->asArray(), /*Final=*/false); diff --git clang/lib/Sema/SemaTemplateDeductionGuide.cpp clang/lib/Sema/SemaTemplateDeductionGuide.cpp index 00c5dfd3d7a4..0d079677eecc 100644 --- clang/lib/Sema/SemaTemplateDeductionGuide.cpp +++ clang/lib/Sema/SemaTemplateDeductionGuide.cpp @@ -1227,11 +1227,14 @@ void DeclareImplicitDeductionGuidesForTypeAlias( NewParam->setScopeInfo(0, I); FPTL.setParam(I, NewParam); } - auto *Transformed = cast<FunctionDecl>(buildDeductionGuide( + auto *Transformed = cast<CXXDeductionGuideDecl>(buildDeductionGuide( SemaRef, AliasTemplate, /*TemplateParams=*/nullptr, /*Constructor=*/nullptr, DG->getExplicitSpecifier(), FunctionType, AliasTemplate->getBeginLoc(), AliasTemplate->getLocation(), AliasTemplate->getEndLoc(), DG->isImplicit())); + Transformed->setSourceDeductionGuide(DG); + Transformed->setSourceDeductionGuideKind( + CXXDeductionGuideDecl::SourceDeductionGuideKind::Alias); // FIXME: Here the synthesized deduction guide is not a templated // function. Per [dcl.decl]p4, the requires-clause shall be present only @@ -1246,6 +1249,7 @@ void DeclareImplicitDeductionGuidesForTypeAlias( Constraint = Conjunction.getAs<Expr>(); } Transformed->setTrailingRequiresClause(Constraint); + continue; } FunctionTemplateDecl *F = dyn_cast<FunctionTemplateDecl>(G); if (!F) diff --git clang/lib/Sema/SemaTemplateInstantiate.cpp clang/lib/Sema/SemaTemplateInstantiate.cpp index 12e98a33d078..dc3bfa97eff3 100644 --- clang/lib/Sema/SemaTemplateInstantiate.cpp +++ clang/lib/Sema/SemaTemplateInstantiate.cpp @@ -479,6 +479,9 @@ MultiLevelTemplateArgumentList Sema::getTemplateInstantiationArgs( using namespace TemplateInstArgsHelpers; const Decl *CurDecl = ND; + if (!CurDecl) + CurDecl = Decl::castFromDeclContext(DC); + if (Innermost) { Result.addOuterTemplateArguments(const_cast<NamedDecl *>(ND), *Innermost, Final); @@ -492,10 +495,8 @@ MultiLevelTemplateArgumentList Sema::getTemplateInstantiationArgs( // has a depth of 0. if (const auto *TTP = dyn_cast<TemplateTemplateParmDecl>(CurDecl)) HandleDefaultTempArgIntoTempTempParam(TTP, Result); - CurDecl = DC ? 
Decl::castFromDeclContext(DC) - : Response::UseNextDecl(CurDecl).NextDecl; - } else if (!CurDecl) - CurDecl = Decl::castFromDeclContext(DC); + CurDecl = Response::UseNextDecl(CurDecl).NextDecl; + } while (!CurDecl->isFileContextDecl()) { Response R; diff --git clang/lib/Sema/SemaTemplateInstantiateDecl.cpp clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index d530ed0847ae..4855e8a23689 100644 --- clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -12,7 +12,6 @@ #include "TreeTransform.h" #include "clang/AST/ASTConsumer.h" #include "clang/AST/ASTContext.h" -#include "clang/AST/ASTLambda.h" #include "clang/AST/ASTMutationListener.h" #include "clang/AST/DeclTemplate.h" #include "clang/AST/DependentDiagnostic.h" @@ -5277,26 +5276,9 @@ void Sema::InstantiateFunctionDefinition(SourceLocation PointOfInstantiation, RebuildTypeSourceInfoForDefaultSpecialMembers(); SetDeclDefaulted(Function, PatternDecl->getLocation()); } else { - NamedDecl *ND = Function; - DeclContext *DC = ND->getLexicalDeclContext(); - std::optional<ArrayRef<TemplateArgument>> Innermost; - if (auto *Primary = Function->getPrimaryTemplate(); - Primary && - !isGenericLambdaCallOperatorOrStaticInvokerSpecialization(Function) && - Function->getTemplateSpecializationKind() != - TSK_ExplicitSpecialization) { - auto It = llvm::find_if(Primary->redecls(), - [](const RedeclarableTemplateDecl *RTD) { - return cast<FunctionTemplateDecl>(RTD) - ->isCompatibleWithDefinition(); - }); - assert(It != Primary->redecls().end() && - "Should't get here without a definition"); - DC = (*It)->getLexicalDeclContext(); - Innermost.emplace(Function->getTemplateSpecializationArgs()->asArray()); - } MultiLevelTemplateArgumentList TemplateArgs = getTemplateInstantiationArgs( - Function, DC, /*Final=*/false, Innermost, false, PatternDecl); + Function, Function->getLexicalDeclContext(), /*Final=*/false, + /*Innermost=*/std::nullopt, false, PatternDecl); // Substitute into the qualifier; we can get a substitution failure here // through evil use of alias templates. diff --git clang/lib/Sema/SemaType.cpp clang/lib/Sema/SemaType.cpp index 33d5378944dd..1fa5239a597c 100644 --- clang/lib/Sema/SemaType.cpp +++ clang/lib/Sema/SemaType.cpp @@ -1826,7 +1826,8 @@ QualType Sema::BuildPointerType(QualType T, if (checkQualifiedFunction(*this, T, Loc, QFK_Pointer)) return QualType(); - assert(!T->isObjCObjectType() && "Should build ObjCObjectPointerType"); + if (T->isObjCObjectType()) + return Context.getObjCObjectPointerType(T); // In ARC, it is forbidden to build pointers to unqualified pointers. if (getLangOpts().ObjCAutoRefCount) @@ -8491,7 +8492,8 @@ static void HandleRISCVRVVVectorBitsTypeAttr(QualType &CurType, return; } - auto VScale = S.Context.getTargetInfo().getVScaleRange(S.getLangOpts()); + auto VScale = + S.Context.getTargetInfo().getVScaleRange(S.getLangOpts(), false); if (!VScale || !VScale->first || VScale->first != VScale->second) { S.Diag(Attr.getLoc(), diag::err_attribute_riscv_rvv_bits_unsupported) << Attr; @@ -9807,8 +9809,7 @@ QualType Sema::BuiltinAddPointer(QualType BaseType, SourceLocation Loc) { } QualType Sema::BuiltinRemovePointer(QualType BaseType, SourceLocation Loc) { - // We don't want block pointers or ObjectiveC's id type. 
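Two related SemaType.cpp cleanups meet here: `BuildPointerType` above replaces a hard assert with graceful handling, forming the `ObjCObjectPointerType` it should have produced instead of crashing, and the `BuiltinRemovePointer` hunk in progress drops the Objective-C `id` carve-out, so `__remove_pointer` now unwraps `id` to its pointee like any other pointer. A hedged sketch of the first pattern (a standalone hypothetical helper, not the actual Sema entry point):

```
#include "clang/AST/ASTContext.h"

// Hypothetical helper illustrating the recovery: a request to build a
// pointer to an ObjC object type folds into the ObjC object pointer type.
clang::QualType buildPointerTo(clang::ASTContext &Ctx, clang::QualType T) {
  if (T->isObjCObjectType())
    return Ctx.getObjCObjectPointerType(T); // e.g. 'NSString' -> 'NSString *'
  return Ctx.getPointerType(T);
}
```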
- if (!BaseType->isAnyPointerType() || BaseType->isObjCIdType()) + if (!BaseType->isAnyPointerType()) return BaseType; return BaseType->getPointeeType(); diff --git clang/lib/Sema/TreeTransform.h clang/lib/Sema/TreeTransform.h index 808b56448e1e..60100d77c22a 100644 --- clang/lib/Sema/TreeTransform.h +++ clang/lib/Sema/TreeTransform.h @@ -4211,6 +4211,17 @@ public: Exprs, RParenLoc, EndLoc, Clauses, {}); } + StmtResult RebuildOpenACCAtomicConstruct(SourceLocation BeginLoc, + SourceLocation DirLoc, + OpenACCAtomicKind AtKind, + SourceLocation EndLoc, + StmtResult AssociatedStmt) { + return getSema().OpenACC().ActOnEndStmtDirective( + OpenACCDirectiveKind::Atomic, BeginLoc, DirLoc, SourceLocation{}, + SourceLocation{}, {}, AtKind, SourceLocation{}, EndLoc, {}, + AssociatedStmt); + } + ExprResult RebuildOpenACCAsteriskSizeExpr(SourceLocation AsteriskLoc) { return getSema().OpenACC().ActOnOpenACCAsteriskSizeExpr(AsteriskLoc); } @@ -12613,6 +12624,29 @@ TreeTransform<Derived>::TransformOpenACCWaitConstruct(OpenACCWaitConstruct *C) { QueueIdExprs, C->getRParenLoc(), C->getEndLoc(), TransformedClauses); } +template <typename Derived> +StmtResult TreeTransform<Derived>::TransformOpenACCAtomicConstruct( + OpenACCAtomicConstruct *C) { + getSema().OpenACC().ActOnConstruct(C->getDirectiveKind(), C->getBeginLoc()); + + if (getSema().OpenACC().ActOnStartStmtDirective(C->getDirectiveKind(), + C->getBeginLoc(), {})) + return StmtError(); + + // Transform Associated Stmt. + SemaOpenACC::AssociatedStmtRAII AssocStmtRAII( + getSema().OpenACC(), C->getDirectiveKind(), C->getDirectiveLoc(), {}, {}); + + StmtResult AssocStmt = getDerived().TransformStmt(C->getAssociatedStmt()); + AssocStmt = getSema().OpenACC().ActOnAssociatedStmt( + C->getBeginLoc(), C->getDirectiveKind(), C->getAtomicKind(), {}, + AssocStmt); + + return getDerived().RebuildOpenACCAtomicConstruct( + C->getBeginLoc(), C->getDirectiveLoc(), C->getAtomicKind(), + C->getEndLoc(), AssocStmt); +} + template <typename Derived> ExprResult TreeTransform<Derived>::TransformOpenACCAsteriskSizeExpr( OpenACCAsteriskSizeExpr *E) { diff --git clang/lib/Serialization/ASTReader.cpp clang/lib/Serialization/ASTReader.cpp index f524251c48dd..24acd6e297e7 100644 --- clang/lib/Serialization/ASTReader.cpp +++ clang/lib/Serialization/ASTReader.cpp @@ -10186,12 +10186,12 @@ void ASTReader::visitTopLevelModuleMaps( } void ASTReader::finishPendingActions() { - while ( - !PendingIdentifierInfos.empty() || !PendingDeducedFunctionTypes.empty() || - !PendingDeducedVarTypes.empty() || !PendingIncompleteDeclChains.empty() || - !PendingDeclChains.empty() || !PendingMacroIDs.empty() || - !PendingDeclContextInfos.empty() || !PendingUpdateRecords.empty() || - !PendingObjCExtensionIvarRedeclarations.empty()) { + while (!PendingIdentifierInfos.empty() || + !PendingDeducedFunctionTypes.empty() || + !PendingDeducedVarTypes.empty() || !PendingDeclChains.empty() || + !PendingMacroIDs.empty() || !PendingDeclContextInfos.empty() || + !PendingUpdateRecords.empty() || + !PendingObjCExtensionIvarRedeclarations.empty()) { // If any identifiers with corresponding top-level declarations have // been loaded, load those declarations now. using TopLevelDeclsMap = @@ -10239,13 +10239,6 @@ void ASTReader::finishPendingActions() { } PendingDeducedVarTypes.clear(); - // For each decl chain that we wanted to complete while deserializing, mark - // it as "still needs to be completed". 
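The ASTReader hunk being edited here moves the "mark incomplete decl chains" pass from the top of `finishPendingActions` to its very end, and drops `PendingIncompleteDeclChains` from the while-condition, presumably so chains that later pending work completes are not prematurely re-marked and newly appended entries no longer keep the loop spinning. A toy model of the new ordering (simplified types, not the real ASTReader API):

```
#include <vector>

struct Decl { bool MarkedIncomplete = false; };

void markIncompleteDeclChain(Decl *D) { D->MarkedIncomplete = true; }

// Toy model: drain all pending work first (it may complete decl chains),
// and only then re-mark whatever is still pending as incomplete; the old
// code did the marking at the top of each iteration, before later pending
// actions had a chance to run.
void finishPendingActionsSketch(std::vector<Decl *> &PendingWork,
                                std::vector<Decl *> &PendingIncomplete) {
  while (!PendingWork.empty())
    PendingWork.pop_back(); // stand-in for identifiers/chains/update records
  for (Decl *D : PendingIncomplete)
    markIncompleteDeclChain(D);
  PendingIncomplete.clear();
}
```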
- for (unsigned I = 0; I != PendingIncompleteDeclChains.size(); ++I) { - markIncompleteDeclChain(PendingIncompleteDeclChains[I]); - } - PendingIncompleteDeclChains.clear(); - // Load pending declaration chains. for (unsigned I = 0; I != PendingDeclChains.size(); ++I) loadPendingDeclChain(PendingDeclChains[I].first, @@ -10483,6 +10476,12 @@ void ASTReader::finishPendingActions() { for (auto *ND : PendingMergedDefinitionsToDeduplicate) getContext().deduplicateMergedDefinitonsFor(ND); PendingMergedDefinitionsToDeduplicate.clear(); + + // For each decl chain that we wanted to complete while deserializing, mark + // it as "still needs to be completed". + for (Decl *D : PendingIncompleteDeclChains) + markIncompleteDeclChain(D); + PendingIncompleteDeclChains.clear(); } void ASTReader::diagnoseOdrViolations() { diff --git clang/lib/Serialization/ASTReaderDecl.cpp clang/lib/Serialization/ASTReaderDecl.cpp index 7a15e60d87d8..8210eb2143ac 100644 --- clang/lib/Serialization/ASTReaderDecl.cpp +++ clang/lib/Serialization/ASTReaderDecl.cpp @@ -1064,7 +1064,6 @@ void ASTDeclReader::VisitFunctionDecl(FunctionDecl *FD) { FD->setHasImplicitReturnZero(FunctionDeclBits.getNextBit()); FD->setIsMultiVersion(FunctionDeclBits.getNextBit()); FD->setLateTemplateParsed(FunctionDeclBits.getNextBit()); - FD->setInstantiatedFromMemberTemplate(FunctionDeclBits.getNextBit()); FD->setFriendConstraintRefersToEnclosingTemplate( FunctionDeclBits.getNextBit()); FD->setUsesSEHTry(FunctionDeclBits.getNextBit()); diff --git clang/lib/Serialization/ASTReaderStmt.cpp clang/lib/Serialization/ASTReaderStmt.cpp index b15eca87993a..dc953ddeee85 100644 --- clang/lib/Serialization/ASTReaderStmt.cpp +++ clang/lib/Serialization/ASTReaderStmt.cpp @@ -2923,6 +2923,15 @@ void ASTStmtReader::VisitOpenACCWaitConstruct(OpenACCWaitConstruct *S) { } } +void ASTStmtReader::VisitOpenACCAtomicConstruct(OpenACCAtomicConstruct *S) { + VisitStmt(S); + S->Kind = Record.readEnum<OpenACCDirectiveKind>(); + S->Range = Record.readSourceRange(); + S->DirectiveLoc = Record.readSourceLocation(); + S->AtomicKind = Record.readEnum<OpenACCAtomicKind>(); + S->setAssociatedStmt(Record.readSubStmt()); +} + //===----------------------------------------------------------------------===// // HLSL Constructs/Directives. //===----------------------------------------------------------------------===// @@ -4454,6 +4463,10 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) { S = OpenACCUpdateConstruct::CreateEmpty(Context, NumClauses); break; } + case STMT_OPENACC_ATOMIC_CONSTRUCT: { + S = OpenACCAtomicConstruct::CreateEmpty(Context); + break; + } case EXPR_REQUIRES: { unsigned numLocalParameters = Record[ASTStmtReader::NumExprFields]; unsigned numRequirement = Record[ASTStmtReader::NumExprFields + 1]; diff --git clang/lib/Serialization/ASTWriter.cpp clang/lib/Serialization/ASTWriter.cpp index e81a441d753b..ef8ee5bc94d0 100644 --- clang/lib/Serialization/ASTWriter.cpp +++ clang/lib/Serialization/ASTWriter.cpp @@ -5359,7 +5359,7 @@ ASTWriter::WriteAST(llvm::PointerUnion<Sema *, Preprocessor *> Subject, llvm::TimeTraceScope scope("WriteAST", OutputFile); WritingAST = true; - Sema *SemaPtr = Subject.dyn_cast<Sema *>(); + Sema *SemaPtr = dyn_cast<Sema *>(Subject); Preprocessor &PPRef = SemaPtr ? 
SemaPtr->getPreprocessor() : *cast<Preprocessor *>(Subject); diff --git clang/lib/Serialization/ASTWriterDecl.cpp clang/lib/Serialization/ASTWriterDecl.cpp index 6a79444bdb98..fa2294da95de 100644 --- clang/lib/Serialization/ASTWriterDecl.cpp +++ clang/lib/Serialization/ASTWriterDecl.cpp @@ -679,7 +679,7 @@ void ASTDeclWriter::VisitDeclaratorDecl(DeclaratorDecl *D) { } void ASTDeclWriter::VisitFunctionDecl(FunctionDecl *D) { - static_assert(DeclContext::NumFunctionDeclBits == 45, + static_assert(DeclContext::NumFunctionDeclBits == 44, "You need to update the serializer after you change the " "FunctionDeclBits"); @@ -785,7 +785,6 @@ void ASTDeclWriter::VisitFunctionDecl(FunctionDecl *D) { FunctionDeclBits.addBit(D->hasImplicitReturnZero()); FunctionDeclBits.addBit(D->isMultiVersion()); FunctionDeclBits.addBit(D->isLateTemplateParsed()); - FunctionDeclBits.addBit(D->isInstantiatedFromMemberTemplate()); FunctionDeclBits.addBit(D->FriendConstraintRefersToEnclosingTemplate()); FunctionDeclBits.addBit(D->usesSEHTry()); Record.push_back(FunctionDeclBits); diff --git clang/lib/Serialization/ASTWriterStmt.cpp clang/lib/Serialization/ASTWriterStmt.cpp index e6701c586e02..e5caf3debc02 100644 --- clang/lib/Serialization/ASTWriterStmt.cpp +++ clang/lib/Serialization/ASTWriterStmt.cpp @@ -3007,6 +3007,17 @@ void ASTStmtWriter::VisitOpenACCWaitConstruct(OpenACCWaitConstruct *S) { Code = serialization::STMT_OPENACC_WAIT_CONSTRUCT; } +void ASTStmtWriter::VisitOpenACCAtomicConstruct(OpenACCAtomicConstruct *S) { + VisitStmt(S); + Record.writeEnum(S->Kind); + Record.AddSourceRange(S->Range); + Record.AddSourceLocation(S->DirectiveLoc); + Record.writeEnum(S->getAtomicKind()); + Record.AddStmt(S->getAssociatedStmt()); + + Code = serialization::STMT_OPENACC_ATOMIC_CONSTRUCT; +} + //===----------------------------------------------------------------------===// // HLSL Constructs/Directives. //===----------------------------------------------------------------------===// diff --git clang/lib/StaticAnalyzer/Checkers/ExprInspectionChecker.cpp clang/lib/StaticAnalyzer/Checkers/ExprInspectionChecker.cpp index 5534ef86a7be..28898bb37082 100644 --- clang/lib/StaticAnalyzer/Checkers/ExprInspectionChecker.cpp +++ clang/lib/StaticAnalyzer/Checkers/ExprInspectionChecker.cpp @@ -227,10 +227,11 @@ void ExprInspectionChecker::analyzerWarnIfReached(const CallExpr *CE, void ExprInspectionChecker::analyzerNumTimesReached(const CallExpr *CE, CheckerContext &C) const { - ++ReachedStats[CE].NumTimesReached; - if (!ReachedStats[CE].ExampleNode) { + ReachedStat &Stat = ReachedStats[CE]; + ++Stat.NumTimesReached; + if (!Stat.ExampleNode) { // Later, in checkEndAnalysis, we'd throw a report against it. - ReachedStats[CE].ExampleNode = C.generateNonFatalErrorNode(); + Stat.ExampleNode = C.generateNonFatalErrorNode(); } } diff --git clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp index a9b4dbb39b5b..a6142063895d 100644 --- clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp +++ clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp @@ -1198,7 +1198,10 @@ static bool isInitializationOfVar(const ExplodedNode *N, const VarRegion *VR) { // If we ever directly evaluate global DeclStmts, this assertion will be // invalid, but this still seems preferable to silently accepting an // initialization that may be for a path-sensitive variable. 
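On the assertion being relaxed just below: local `extern` declarations also produce a `VarRegion` outside the stack memory space, so the old `isStaticLocal()`-only assert could fire on valid code; the new null-deref-path-notes.cpp cases later in this patch (GH124975) exercise exactly these shapes. A minimal illustration of the locals in question:

```
// Block-scope variables that are *not* stack-allocated: each gets a
// VarRegion in a non-stack memory space, so the relaxed assertion must
// accept all three.
void locals_off_the_stack() {
  static int S = 0;   // static local: accepted before and after
  extern int E;       // local extern: the newly accepted case
  thread_local int T; // implicitly static, covered by isStaticLocal()
  (void)S; (void)E; (void)T;
}
```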
- assert(VR->getDecl()->isStaticLocal() && "non-static stackless VarRegion"); + [[maybe_unused]] bool IsLocalStaticOrLocalExtern = + VR->getDecl()->isStaticLocal() || VR->getDecl()->isLocalExternDecl(); + assert(IsLocalStaticOrLocalExtern && + "Declared a variable on the stack without Stack memspace?"); return true; } diff --git clang/lib/StaticAnalyzer/Core/ExplodedGraph.cpp clang/lib/StaticAnalyzer/Core/ExplodedGraph.cpp index 55bcb6e220e1..7b2cccce93cf 100644 --- clang/lib/StaticAnalyzer/Core/ExplodedGraph.cpp +++ clang/lib/StaticAnalyzer/Core/ExplodedGraph.cpp @@ -488,15 +488,17 @@ ExplodedGraph::trim(ArrayRef<const NodeTy *> Sinks, while (!WL2.empty()) { const ExplodedNode *N = WL2.pop_back_val(); + auto [Place, Inserted] = Pass2.try_emplace(N); + // Skip this node if we have already processed it. - if (Pass2.contains(N)) + if (!Inserted) continue; // Create the corresponding node in the new graph and record the mapping // from the old node to the new node. ExplodedNode *NewN = G->createUncachedNode(N->getLocation(), N->State, N->getID(), N->isSink()); - Pass2[N] = NewN; + Place->second = NewN; // Also record the reverse mapping from the new node to the old node. if (InverseMap) (*InverseMap)[NewN] = N; diff --git clang/lib/StaticAnalyzer/Core/ExprEngine.cpp clang/lib/StaticAnalyzer/Core/ExprEngine.cpp index 2b1872f8386a..9545ce5f2569 100644 --- clang/lib/StaticAnalyzer/Core/ExprEngine.cpp +++ clang/lib/StaticAnalyzer/Core/ExprEngine.cpp @@ -1836,6 +1836,7 @@ void ExprEngine::Visit(const Stmt *S, ExplodedNode *Pred, case Stmt::OpenACCShutdownConstructClass: case Stmt::OpenACCSetConstructClass: case Stmt::OpenACCUpdateConstructClass: + case Stmt::OpenACCAtomicConstructClass: case Stmt::OMPUnrollDirectiveClass: case Stmt::OMPMetaDirectiveClass: case Stmt::HLSLOutArgExprClass: { diff --git clang/lib/StaticAnalyzer/Core/ExprEngineObjC.cpp clang/lib/StaticAnalyzer/Core/ExprEngineObjC.cpp index f075df3ab5e4..9426e0afd65a 100644 --- clang/lib/StaticAnalyzer/Core/ExprEngineObjC.cpp +++ clang/lib/StaticAnalyzer/Core/ExprEngineObjC.cpp @@ -124,24 +124,26 @@ void ExprEngine::VisitObjCForCollectionStmt(const ObjCForCollectionStmt *S, bool isContainerNull = state->isNull(collectionV).isConstrainedTrue(); - ExplodedNodeSet dstLocation; - evalLocation(dstLocation, S, elem, Pred, state, elementV, false); + ExplodedNodeSet DstLocation; // states in `DstLocation` may differ from `Pred` + evalLocation(DstLocation, S, elem, Pred, state, elementV, false); - ExplodedNodeSet Tmp; - StmtNodeBuilder Bldr(Pred, Tmp, *currBldrCtx); + for (ExplodedNode *dstLocation : DstLocation) { + ExplodedNodeSet DstLocationSingleton{dstLocation}, Tmp; + StmtNodeBuilder Bldr(dstLocation, Tmp, *currBldrCtx); - if (!isContainerNull) - populateObjCForDestinationSet(dstLocation, svalBuilder, S, elem, elementV, - SymMgr, currBldrCtx, Bldr, - /*hasElements=*/true); + if (!isContainerNull) + populateObjCForDestinationSet(DstLocationSingleton, svalBuilder, S, elem, + elementV, SymMgr, currBldrCtx, Bldr, + /*hasElements=*/true); - populateObjCForDestinationSet(dstLocation, svalBuilder, S, elem, elementV, - SymMgr, currBldrCtx, Bldr, - /*hasElements=*/false); + populateObjCForDestinationSet(DstLocationSingleton, svalBuilder, S, elem, + elementV, SymMgr, currBldrCtx, Bldr, + /*hasElements=*/false); - // Finally, run any custom checkers. - // FIXME: Eventually all pre- and post-checks should live in VisitStmt. - getCheckerManager().runCheckersForPostStmt(Dst, Tmp, S, *this); + // Finally, run any custom checkers. 
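The `ExplodedGraph::trim` hunk above is the standard DenseMap single-probe idiom: `try_emplace` performs the "seen already?" test and the insertion in one hash lookup, and the mapped value is then filled in through the returned iterator, where the old code paid for a `contains` check plus a separate `Pass2[N] = NewN` insertion. The idiom in isolation (toy value type, `makeNode` is an illustrative stand-in for `createUncachedNode`):

```
#include "llvm/ADT/DenseMap.h"

static int makeNode(int N) { return N * 2; } // stand-in for node creation

int internNode(llvm::DenseMap<int, int> &Pass2, int N) {
  auto [Place, Inserted] = Pass2.try_emplace(N); // one probe: test + insert
  if (Inserted)
    Place->second = makeNode(N); // only materialize genuinely new nodes
  return Place->second;
}
```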
+    // FIXME: Eventually all pre- and post-checks should live in VisitStmt.
+    getCheckerManager().runCheckersForPostStmt(Dst, Tmp, S, *this);
+  }
 }
 
 void ExprEngine::VisitObjCMessage(const ObjCMessageExpr *ME,
diff --git clang/test/AST/ByteCode/literals.cpp clang/test/AST/ByteCode/literals.cpp
index b75ca2b19a96..a80ee7ad84fc 100644
--- clang/test/AST/ByteCode/literals.cpp
+++ clang/test/AST/ByteCode/literals.cpp
@@ -914,12 +914,18 @@ namespace TypeTraits {
 }
 
 #if __cplusplus >= 201402L
+namespace SomeNS {
+  using MyInt = int;
+}
+
 constexpr int ignoredDecls() {
   static_assert(true, "");
   struct F { int a; };
   enum E { b };
   using A = int;
   typedef int Z;
+  namespace NewNS = SomeNS;
+  using NewNS::MyInt;
 
   return F{12}.a;
 }
diff --git clang/test/AST/ByteCode/unions.cpp clang/test/AST/ByteCode/unions.cpp
index e90b123c90de..b1fbb0c4dfc0 100644
--- clang/test/AST/ByteCode/unions.cpp
+++ clang/test/AST/ByteCode/unions.cpp
@@ -402,7 +402,6 @@ namespace UnionInBase {
   static_assert(return_uninit().a.x == 2);
 }
 
-/// FIXME: Our diagnostic here is a little off.
 namespace One {
   struct A { long x; };
 
@@ -421,4 +420,47 @@ namespace One {
                // both-note {{constinit}}
 }
 
+namespace CopyAssign {
+  union A {
+    int a;
+    int b;
+  };
+
+  constexpr int f() {
+    A a{12};
+    A b{13};
+
+    b.b = 32;
+    b = a;
+    return b.a;
+  }
+  static_assert(f() == 12);
+
+
+  constexpr int f2() {
+    A a{12};
+    A b{13};
+
+    b.b = 32;
+    b = a;
+    return b.b; // both-note {{read of member 'b' of union with active member 'a'}}
+  }
+  static_assert(f2() == 12); // both-error {{not an integral constant expression}} \
+                             // both-note {{in call to}}
+}
+
+namespace MoveAssign {
+  union A {
+    int a;
+    int b;
+  };
+
+  constexpr int f() {
+    A b{13};
+
+    b = A{12};
+    return b.a;
+  }
+  static_assert(f() == 12);
+}
 #endif
diff --git clang/test/AST/ast-print-openacc-atomic-construct.cpp clang/test/AST/ast-print-openacc-atomic-construct.cpp
new file mode 100644
index 000000000000..572f2ea4842d
--- /dev/null
+++ clang/test/AST/ast-print-openacc-atomic-construct.cpp
@@ -0,0 +1,33 @@
+// RUN: %clang_cc1 -fopenacc -ast-print %s -o - | FileCheck %s
+
+void foo(int v, int x) {
+// CHECK: #pragma acc atomic read
+// CHECK-NEXT: v = x;
+#pragma acc atomic read
+  v = x;
+// CHECK-NEXT: #pragma acc atomic write
+// CHECK-NEXT: v = x + 1;
+#pragma acc atomic write
+  v = x + 1;
+// CHECK-NEXT: #pragma acc atomic update
+// CHECK-NEXT: x++;
+#pragma acc atomic update
+  x++;
+// CHECK-NEXT: #pragma acc atomic
+// CHECK-NEXT: x--;
+#pragma acc atomic
+  x--;
+// CHECK-NEXT: #pragma acc atomic capture
+// CHECK-NEXT: v = x++;
+#pragma acc atomic capture
+  v = x++;
+
+// CHECK-NEXT: #pragma acc atomic capture
+// CHECK-NEXT: {
+// CHECK-NEXT: x--;
+// CHECK-NEXT: v = x;
+// CHECK-NEXT: }
+#pragma acc atomic capture
+  { x--; v = x; }
+
+}
diff --git clang/test/AST/attr-print-emit.cpp clang/test/AST/attr-print-emit.cpp
index a9bca6778d0f..77826f8f9af0 100644
--- clang/test/AST/attr-print-emit.cpp
+++ clang/test/AST/attr-print-emit.cpp
@@ -91,3 +91,8 @@ ANNOTATE_ATTR NONNULL_ATTR void fn_non_null_annotated_attr(int *) __attribute__(
 
 [[gnu::nonnull(1)]] [[gnu::always_inline]] void cxx11_attr(int*) ANNOTATE_ATTR;
 // CHECK: {{\[\[}}gnu::nonnull(1)]] {{\[\[}}gnu::always_inline]] void cxx11_attr(int *) __attribute__((annotate("Annotated")));
+
+struct Foo;
+
+// CHECK: void as_member_fn_ptr(int *(Foo::*member)(int) __attribute__((alloc_size(1))));
+void as_member_fn_ptr(int* (Foo::*member)(int) __attribute__((alloc_size(1))));
diff --git clang/test/Analysis/bugfix-124477.m
clang/test/Analysis/bugfix-124477.m
new file mode 100644
index 000000000000..80820f4c9344
--- /dev/null
+++ clang/test/Analysis/bugfix-124477.m
@@ -0,0 +1,39 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,apiModeling,nullability.NullableDereferenced,nullability.NullabilityBase -x objective-c %s
+/*
+  This test is reduced from a static analyzer crash. The bug causing
+  the crash is explained in #124477. It can only be triggered in some
+  rare cases, so please do not modify this reproducer.
+*/
+
+#pragma clang assume_nonnull begin
+# 15 "some-sys-header.h" 1 3
+@class NSArray, NSObject;
+
+@interface Base
+@property (readonly, copy) NSArray *array;
+@end
+
+#pragma clang assume_nonnull end
+# 8 "this-file.m" 2
+
+
+@interface Test : Base
+
+@property (readwrite, copy, nullable) NSObject *label;
+@property (readwrite, strong, nullable) Test * field;
+
+- (void)f;
+
+@end
+
+@implementation Test
+- (void)f
+{
+  NSObject * X;
+
+  for (NSObject *ele in self.field.array) {}
+  self.label = X;
+}
+@end
+
+
diff --git clang/test/Analysis/null-deref-path-notes.cpp clang/test/Analysis/null-deref-path-notes.cpp
index c7b0619e297b..a37bbfe41a2c 100644
--- clang/test/Analysis/null-deref-path-notes.cpp
+++ clang/test/Analysis/null-deref-path-notes.cpp
@@ -23,3 +23,38 @@ void c::f(B &g, int &i) {
     f(h, b); // expected-note{{Calling 'c::f'}}
   }
 }
+
+namespace GH124975 {
+void no_crash_in_br_visitors(int *p) {
+  if (p) {}
+  // expected-note@-1 {{Assuming 'p' is null}}
+  // expected-note@-2 {{Taking false branch}}
+
+  extern bool ExternLocalCoin;
+  // expected-note@+2 {{Assuming 'ExternLocalCoin' is false}}
+  // expected-note@+1 {{Taking false branch}}
+  if (ExternLocalCoin)
+    return;
+
+  *p = 4;
+  // expected-warning@-1 {{Dereference of null pointer (loaded from variable 'p')}}
+  // expected-note@-2 {{Dereference of null pointer (loaded from variable 'p')}}
+}
+
+// Thread local variables are implicitly static, so let's test them too.
+void thread_local_alternative(int *p) {
+  if (p) {}
+  // expected-note@-1 {{Assuming 'p' is null}}
+  // expected-note@-2 {{Taking false branch}}
+
+  thread_local bool ThreadLocalCoin;
+  // expected-note@+2 {{'ThreadLocalCoin' is false}}
+  // expected-note@+1 {{Taking false branch}}
+  if (ThreadLocalCoin)
+    return;
+
+  *p = 4;
+  // expected-warning@-1 {{Dereference of null pointer (loaded from variable 'p')}}
+  // expected-note@-2 {{Dereference of null pointer (loaded from variable 'p')}}
+}
+} // namespace GH124975
diff --git clang/test/CIR/Lowering/global-var-simple.cpp clang/test/CIR/Lowering/global-var-simple.cpp
new file mode 100644
index 000000000000..06050e409d54
--- /dev/null
+++ clang/test/CIR/Lowering/global-var-simple.cpp
@@ -0,0 +1,81 @@
+// Global variables of integral types
+// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o - | FileCheck %s
+
+// Note: Currently unsupported features include default zero-initialization
+// and alignment. The fact that "external" is only printed for globals
+// without an initializer is a quirk of the LLVM AsmWriter.
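To make the AsmWriter quirk in the note concrete: an `llvm::GlobalVariable` created with a null initializer is, as far as LLVM is concerned, only a declaration, and the AsmWriter prints declarations with the `external` keyword. A minimal sketch against the LLVM C++ API (a standalone illustration, not the CIR lowering code itself):

```
#include "llvm/IR/Constants.h"
#include "llvm/IR/Module.h"

void addGlobals(llvm::Module &M) {
  auto &Ctx = M.getContext();
  // No initializer: this is a *declaration*, printed as
  //   @c = external dso_local global i8
  auto *C = new llvm::GlobalVariable(
      M, llvm::Type::getInt8Ty(Ctx), /*isConstant=*/false,
      llvm::GlobalValue::ExternalLinkage, /*Initializer=*/nullptr, "c");
  C->setDSOLocal(true);

  // With an initializer it becomes a definition:
  //   @si = dso_local global i32 42
  auto *SI = new llvm::GlobalVariable(
      M, llvm::Type::getInt32Ty(Ctx), /*isConstant=*/false,
      llvm::GlobalValue::ExternalLinkage,
      llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 42), "si");
  SI->setDSOLocal(true);
}
```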
+ +char c; +// CHECK: @c = external dso_local global i8 + +signed char sc; +// CHECK: @sc = external dso_local global i8 + +unsigned char uc; +// CHECK: @uc = external dso_local global i8 + +short ss; +// CHECK: @ss = external dso_local global i16 + +unsigned short us = 100; +// CHECK: @us = dso_local global i16 100 + +int si = 42; +// CHECK: @si = dso_local global i32 42 + +unsigned ui; +// CHECK: @ui = external dso_local global i32 + +long sl; +// CHECK: @sl = external dso_local global i64 + +unsigned long ul; +// CHECK: @ul = external dso_local global i64 + +long long sll; +// CHECK: @sll = external dso_local global i64 + +unsigned long long ull = 123456; +// CHECK: @ull = dso_local global i64 123456 + +__int128 s128; +// CHECK: @s128 = external dso_local global i128 + +unsigned __int128 u128; +// CHECK: @u128 = external dso_local global i128 + +wchar_t wc; +// CHECK: @wc = external dso_local global i32 + +char8_t c8; +// CHECK: @c8 = external dso_local global i8 + +char16_t c16; +// CHECK: @c16 = external dso_local global i16 + +char32_t c32; +// CHECK: @c32 = external dso_local global i32 + +_BitInt(20) sb20; +// CHECK: @sb20 = external dso_local global i20 + +unsigned _BitInt(48) ub48; +// CHECK: @ub48 = external dso_local global i48 + +_Float16 f16; +// CHECK: @f16 = external dso_local global half + +__bf16 bf16; +// CHECK: @bf16 = external dso_local global bfloat + +float f; +// CHECK: @f = external dso_local global float + +double d = 1.25; +// CHECK: @d = dso_local global double 1.250000e+00 + +long double ld; +// CHECK: @ld = external dso_local global x86_fp80 + +__float128 f128; +// CHECK: @f128 = external dso_local global fp128 diff --git clang/test/CIR/Lowering/hello.c clang/test/CIR/Lowering/hello.c new file mode 100644 index 000000000000..ff78b6e6f6a5 --- /dev/null +++ clang/test/CIR/Lowering/hello.c @@ -0,0 +1,10 @@ +// Smoke test for ClangIR-to-LLVM IR code generation +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o - | FileCheck %s + +int a; + +// CHECK: @a = external dso_local global i32 + +int b = 2; + +// CHECK: @b = dso_local global i32 2 diff --git clang/test/CXX/class/class.init/p1.cpp clang/test/CXX/class/class.init/p1.cpp new file mode 100644 index 000000000000..717dfba89763 --- /dev/null +++ clang/test/CXX/class/class.init/p1.cpp @@ -0,0 +1,14 @@ +// RUN: %clang_cc1 -fsyntax-only -verify %s + +namespace test_deleted_ctor_note { +struct A { + int a; + A() = delete; // expected-note {{'A' has been explicitly marked deleted here}} + A(int a_) : a(a_) { } +}; + +struct B { + A a1, a2, a3; // expected-note {{default constructed field 'a2' declared here}} + B(int a_) : a1(a_), a3(a_) { } // expected-error{{call to deleted constructor of 'A'}} +}; +} diff --git clang/test/CXX/dcl.decl/dcl.init/p14-0x.cpp clang/test/CXX/dcl.decl/dcl.init/p14-0x.cpp index e7f501352168..d548f9c8c2fd 100644 --- clang/test/CXX/dcl.decl/dcl.init/p14-0x.cpp +++ clang/test/CXX/dcl.decl/dcl.init/p14-0x.cpp @@ -27,7 +27,7 @@ class Friend { class S { - NoDefault nd1; + NoDefault nd1; // expected-note {{default constructed field 'nd1' declared here}} NoDefault nd2 = 42; Explicit e1; // expected-note {{here}} Explicit e2 = 42; // expected-error {{no viable conversion}} diff --git clang/test/CodeCompletion/member-access.cpp clang/test/CodeCompletion/member-access.cpp index bf35f7ad021f..b181466cdb62 100644 --- clang/test/CodeCompletion/member-access.cpp +++ clang/test/CodeCompletion/member-access.cpp @@ -417,3 +417,21 @@ void f() { // CHECK-DEPENDENT-NESTEDCLASS: 
[#int#]field } } + +namespace template_alias { +struct A { + int b; +}; +template <typename T> +struct S { + A a; +}; +template <typename T> +using Alias = S<T>; +template <typename T> +void f(Alias<T> s) { + s.a.b; + // RUN: %clang_cc1 -fsyntax-only -code-completion-at=%s:433:7 %s -o - | FileCheck -check-prefix=CHECK-TEMPLATE-ALIAS %s + // CHECK-TEMPLATE-ALIAS: [#int#]b +} +} diff --git clang/test/CodeGen/AArch64/sme-attributes-member-function-pointer.cpp clang/test/CodeGen/AArch64/sme-attributes-member-function-pointer.cpp new file mode 100644 index 000000000000..ee784c816a06 --- /dev/null +++ clang/test/CodeGen/AArch64/sme-attributes-member-function-pointer.cpp @@ -0,0 +1,37 @@ +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -x c++ -std=c++20 -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK + +struct TestStruct; + +__arm_new("za", "zt0") void test(TestStruct& TS, + void (TestStruct::*streaming_member_ptr)() __arm_streaming, + void (TestStruct::*streaming_compat_member)() __arm_streaming_compatible, + void (TestStruct::*arm_in_member)() __arm_in("za", "zt0"), + void (TestStruct::*arm_inout_member)() __arm_inout("za", "zt0"), + void (TestStruct::*arm_preserves_member)() __arm_preserves("za", "zt0"), + void (TestStruct::*arm_agnostic_member)() __arm_agnostic("sme_za_state")) { + + // CHECK: call void %{{.*}} [[STREAMING_MEMBER_CALL_ATTRS:#.+]] + (TS.*streaming_member_ptr)(); + + // CHECK: call void %{{.*}} [[STREAMING_COMPAT_MEMBER_CALL_ATTRS:#.+]] + (TS.*streaming_compat_member)(); + + // CHECK: call void %{{.*}} [[ARM_IN_MEMBER_CALL_ATTRS:#.+]] + (TS.*arm_in_member)(); + + // CHECK: call void %{{.*}} [[ARM_INOUT_MEMBER_CALL_ATTRS:#.+]] + (TS.*arm_inout_member)(); + + // CHECK: call void %{{.*}} [[ARM_PRESERVES_MEMBER_CALL_ATTRS:#.+]] + (TS.*arm_preserves_member)(); + + // CHECK: call void %{{.*}} [[ARM_AGNOSTIC_MEMBER_CALL_ATTRS:#.+]] + (TS.*arm_agnostic_member)(); +} + +// CHECK: attributes [[STREAMING_MEMBER_CALL_ATTRS]] = { "aarch64_pstate_sm_enabled" } +// CHECK: attributes [[STREAMING_COMPAT_MEMBER_CALL_ATTRS]] = { "aarch64_pstate_sm_compatible" } +// CHECK: attributes [[ARM_IN_MEMBER_CALL_ATTRS]] = { "aarch64_in_za" "aarch64_in_zt0" } +// CHECK: attributes [[ARM_INOUT_MEMBER_CALL_ATTRS]] = { "aarch64_inout_za" "aarch64_inout_zt0" } +// CHECK: attributes [[ARM_PRESERVES_MEMBER_CALL_ATTRS]] = { "aarch64_preserves_za" "aarch64_preserves_zt0" } +// CHECK: attributes [[ARM_AGNOSTIC_MEMBER_CALL_ATTRS]] = { "aarch64_za_state_agnostic" } diff --git clang/test/CodeGen/AArch64/sme-intrinsics/aarch64-sme-attrs.cpp clang/test/CodeGen/AArch64/sme-intrinsics/aarch64-sme-attrs.cpp index 54762c8b4141..c734c6953e5d 100644 --- clang/test/CodeGen/AArch64/sme-intrinsics/aarch64-sme-attrs.cpp +++ clang/test/CodeGen/AArch64/sme-intrinsics/aarch64-sme-attrs.cpp @@ -300,12 +300,12 @@ int test_variadic_template() __arm_inout("za") { preserves_za_decl); } -// CHECK: attributes #[[SM_ENABLED]] = { mustprogress noinline nounwind "aarch64_pstate_sm_enabled" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[SM_ENABLED]] = { mustprogress noinline nounwind vscale_range(1,16) "aarch64_pstate_sm_enabled" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } // CHECK: attributes #[[NORMAL_DECL]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } // CHECK: attributes #[[SM_ENABLED_DECL]] = { "aarch64_pstate_sm_enabled" 
"no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } // CHECK: attributes #[[SM_COMPATIBLE]] = { mustprogress noinline nounwind "aarch64_pstate_sm_compatible" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } // CHECK: attributes #[[SM_COMPATIBLE_DECL]] = { "aarch64_pstate_sm_compatible" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } -// CHECK: attributes #[[SM_BODY]] = { mustprogress noinline nounwind "aarch64_pstate_sm_body" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[SM_BODY]] = { mustprogress noinline nounwind vscale_range(1,16) "aarch64_pstate_sm_body" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } // CHECK: attributes #[[ZA_SHARED]] = { mustprogress noinline nounwind "aarch64_inout_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } // CHECK: attributes #[[ZA_SHARED_DECL]] = { "aarch64_inout_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } // CHECK: attributes #[[ZA_PRESERVED]] = { mustprogress noinline nounwind "aarch64_preserves_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } diff --git clang/test/CodeGen/allow-ubsan-check-inline.c clang/test/CodeGen/allow-ubsan-check-inline.c index 1de24ab90dac..eed48cf15ecc 100644 --- clang/test/CodeGen/allow-ubsan-check-inline.c +++ clang/test/CodeGen/allow-ubsan-check-inline.c @@ -1,3 +1,8 @@ +// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -emit-llvm -o - %s -fsanitize=signed-integer-overflow -fsanitize-skip-hot-cutoff=signed-integer-overflow=0.000001 -O3 -mllvm -lower-allow-check-random-rate=1 -Rpass=lower-allow-check -Rpass-missed=lower-allow-check -fno-inline 2>&1 | FileCheck %s --check-prefixes=NOINL --implicit-check-not="remark:" +// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -emit-llvm -o - %s -fsanitize=signed-integer-overflow -fsanitize-skip-hot-cutoff=signed-integer-overflow=0.000001 -O3 -mllvm -lower-allow-check-random-rate=1 -Rpass=lower-allow-check -Rpass-missed=lower-allow-check 2>&1 | FileCheck %s --check-prefixes=INLINE --implicit-check-not="remark:" +// +// -ubsan-guard-checks is deprecated and will be removed in the future; +// use -fsanitize-skip-hot-cutoff, as shown above. // RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -emit-llvm -o - %s -fsanitize=signed-integer-overflow -mllvm -ubsan-guard-checks -O3 -mllvm -lower-allow-check-random-rate=1 -Rpass=lower-allow-check -Rpass-missed=lower-allow-check -fno-inline 2>&1 | FileCheck %s --check-prefixes=NOINL --implicit-check-not="remark:" // RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -emit-llvm -o - %s -fsanitize=signed-integer-overflow -mllvm -ubsan-guard-checks -O3 -mllvm -lower-allow-check-random-rate=1 -Rpass=lower-allow-check -Rpass-missed=lower-allow-check 2>&1 | FileCheck %s --check-prefixes=INLINE --implicit-check-not="remark:" diff --git clang/test/CodeGen/allow-ubsan-check.c clang/test/CodeGen/allow-ubsan-check.c index b88c1f9cb220..0cd81a77f5cc 100644 --- clang/test/CodeGen/allow-ubsan-check.c +++ clang/test/CodeGen/allow-ubsan-check.c @@ -1,4 +1,7 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// +// We can't use -fsanitize-skip-hot-cutoff because that includes both -ubsan-guard-checks and +//-lower-allow-check-percentile-cutoff. 
// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -emit-llvm -O1 -o - %s -fsanitize=signed-integer-overflow,integer-divide-by-zero,null,local-bounds -mllvm -ubsan-guard-checks | FileCheck %s // RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -emit-llvm -O1 -o - %s -fsanitize=signed-integer-overflow,integer-divide-by-zero,null,local-bounds -mllvm -ubsan-guard-checks -fsanitize-trap=signed-integer-overflow,integer-divide-by-zero,null,local-bounds | FileCheck %s --check-prefixes=TR // RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -emit-llvm -O1 -o - %s -fsanitize=signed-integer-overflow,integer-divide-by-zero,null,local-bounds -mllvm -ubsan-guard-checks -fsanitize-recover=signed-integer-overflow,integer-divide-by-zero,null,local-bounds | FileCheck %s --check-prefixes=REC @@ -7,18 +10,26 @@ // CHECK-LABEL: define dso_local noundef i32 @div( // CHECK-SAME: i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[Y]], 0, !nosanitize [[META2:![0-9]+]] -// CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[X]], -2147483648, !nosanitize [[META2]] -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[Y]], -1, !nosanitize [[META2]] -// CHECK-NEXT: [[OR_NOT5:%.*]] = and i1 [[TMP1]], [[TMP2]] -// CHECK-NEXT: [[DOTNOT3:%.*]] = or i1 [[TMP0]], [[OR_NOT5]] -// CHECK-NEXT: [[TMP3:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 3), !nosanitize [[META2]] -// CHECK-NEXT: [[DOTNOT1:%.*]] = and i1 [[DOTNOT3]], [[TMP3]] -// CHECK-NEXT: br i1 [[DOTNOT1]], label %[[HANDLER_DIVREM_OVERFLOW:.*]], label %[[CONT:.*]], !prof [[PROF3:![0-9]+]], !nosanitize [[META2]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp ne i32 [[Y]], 0, !nosanitize [[META2:![0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne i32 [[X]], -2147483648, !nosanitize [[META2]] +// CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[Y]], -1, !nosanitize [[META2]] +// CHECK-NEXT: [[OR:%.*]] = or i1 [[TMP1]], [[TMP2]], !nosanitize [[META2]] +// +// 27 == SO_IntegerDivideByZero +// CHECK-NEXT: [[TMP3:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 27), !nosanitize [[META2]] +// CHECK-NEXT: [[TMP4:%.*]] = xor i1 [[TMP3]], true, !nosanitize [[META2]] +// CHECK-NEXT: [[TMP5:%.*]] = or i1 [[TMP0]], [[TMP4]], !nosanitize [[META2]] +// +// 41 == SO_SignedIntegerOverflow +// CHECK-NEXT: [[TMP6:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 41), !nosanitize [[META2]] +// CHECK-NEXT: [[TMP7:%.*]] = xor i1 [[TMP6]], true, !nosanitize [[META2]] +// CHECK-NEXT: [[TMP8:%.*]] = or i1 [[OR]], [[TMP7]], !nosanitize [[META2]] +// CHECK-NEXT: [[TMP9:%.*]] = and i1 [[TMP5]], [[TMP8]], !nosanitize [[META2]] +// CHECK-NEXT: br i1 [[TMP9]], label %[[CONT:.*]], label %[[HANDLER_DIVREM_OVERFLOW:.*]], !prof [[PROF3:![0-9]+]], !nosanitize [[META2]] // CHECK: [[HANDLER_DIVREM_OVERFLOW]]: -// CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[X]] to i64, !nosanitize [[META2]] -// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[Y]] to i64, !nosanitize [[META2]] -// CHECK-NEXT: tail call void @__ubsan_handle_divrem_overflow_abort(ptr nonnull @[[GLOB1:[0-9]+]], i64 [[TMP4]], i64 [[TMP5]]) #[[ATTR6:[0-9]+]], !nosanitize [[META2]] +// CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[X]] to i64, !nosanitize [[META2]] +// CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[Y]] to i64, !nosanitize [[META2]] +// CHECK-NEXT: tail call void @__ubsan_handle_divrem_overflow_abort(ptr nonnull @[[GLOB1:[0-9]+]], i64 [[TMP10]], i64 [[TMP11]]) #[[ATTR6:[0-9]+]], !nosanitize [[META2]] // CHECK-NEXT: unreachable, !nosanitize [[META2]] // CHECK: [[CONT]]: // CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[X]], 
[[Y]] @@ -27,14 +38,18 @@ // TR-LABEL: define dso_local noundef i32 @div( // TR-SAME: i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // TR-NEXT: [[ENTRY:.*:]] -// TR-NEXT: [[TMP0:%.*]] = icmp eq i32 [[Y]], 0, !nosanitize [[META2:![0-9]+]] -// TR-NEXT: [[TMP1:%.*]] = icmp eq i32 [[X]], -2147483648, !nosanitize [[META2]] -// TR-NEXT: [[TMP2:%.*]] = icmp eq i32 [[Y]], -1, !nosanitize [[META2]] -// TR-NEXT: [[OR_NOT5:%.*]] = and i1 [[TMP1]], [[TMP2]] -// TR-NEXT: [[DOTNOT3:%.*]] = or i1 [[TMP0]], [[OR_NOT5]] -// TR-NEXT: [[TMP3:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 3), !nosanitize [[META2]] -// TR-NEXT: [[DOTNOT1:%.*]] = and i1 [[DOTNOT3]], [[TMP3]] -// TR-NEXT: br i1 [[DOTNOT1]], label %[[TRAP:.*]], label %[[CONT:.*]], !nosanitize [[META2]] +// TR-NEXT: [[TMP0:%.*]] = icmp ne i32 [[Y]], 0, !nosanitize [[META2:![0-9]+]] +// TR-NEXT: [[TMP1:%.*]] = icmp ne i32 [[X]], -2147483648, !nosanitize [[META2]] +// TR-NEXT: [[TMP2:%.*]] = icmp ne i32 [[Y]], -1, !nosanitize [[META2]] +// TR-NEXT: [[OR:%.*]] = or i1 [[TMP1]], [[TMP2]], !nosanitize [[META2]] +// TR-NEXT: [[TMP3:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 27), !nosanitize [[META2]] +// TR-NEXT: [[TMP4:%.*]] = xor i1 [[TMP3]], true, !nosanitize [[META2]] +// TR-NEXT: [[TMP5:%.*]] = or i1 [[TMP0]], [[TMP4]], !nosanitize [[META2]] +// TR-NEXT: [[TMP6:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 41), !nosanitize [[META2]] +// TR-NEXT: [[TMP7:%.*]] = xor i1 [[TMP6]], true, !nosanitize [[META2]] +// TR-NEXT: [[TMP8:%.*]] = or i1 [[OR]], [[TMP7]], !nosanitize [[META2]] +// TR-NEXT: [[TMP9:%.*]] = and i1 [[TMP5]], [[TMP8]], !nosanitize [[META2]] +// TR-NEXT: br i1 [[TMP9]], label %[[CONT:.*]], label %[[TRAP:.*]], !nosanitize [[META2]] // TR: [[TRAP]]: // TR-NEXT: tail call void @llvm.ubsantrap(i8 3) #[[ATTR5:[0-9]+]], !nosanitize [[META2]] // TR-NEXT: unreachable, !nosanitize [[META2]] @@ -45,18 +60,22 @@ // REC-LABEL: define dso_local noundef i32 @div( // REC-SAME: i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // REC-NEXT: [[ENTRY:.*:]] -// REC-NEXT: [[TMP0:%.*]] = icmp eq i32 [[Y]], 0, !nosanitize [[META2:![0-9]+]] -// REC-NEXT: [[TMP1:%.*]] = icmp eq i32 [[X]], -2147483648, !nosanitize [[META2]] -// REC-NEXT: [[TMP2:%.*]] = icmp eq i32 [[Y]], -1, !nosanitize [[META2]] -// REC-NEXT: [[OR_NOT5:%.*]] = and i1 [[TMP1]], [[TMP2]] -// REC-NEXT: [[DOTNOT3:%.*]] = or i1 [[TMP0]], [[OR_NOT5]] -// REC-NEXT: [[TMP3:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 3), !nosanitize [[META2]] -// REC-NEXT: [[DOTNOT1:%.*]] = and i1 [[DOTNOT3]], [[TMP3]] -// REC-NEXT: br i1 [[DOTNOT1]], label %[[HANDLER_DIVREM_OVERFLOW:.*]], label %[[CONT:.*]], !prof [[PROF3:![0-9]+]], !nosanitize [[META2]] +// REC-NEXT: [[TMP0:%.*]] = icmp ne i32 [[Y]], 0, !nosanitize [[META2:![0-9]+]] +// REC-NEXT: [[TMP1:%.*]] = icmp ne i32 [[X]], -2147483648, !nosanitize [[META2]] +// REC-NEXT: [[TMP2:%.*]] = icmp ne i32 [[Y]], -1, !nosanitize [[META2]] +// REC-NEXT: [[OR:%.*]] = or i1 [[TMP1]], [[TMP2]], !nosanitize [[META2]] +// REC-NEXT: [[TMP3:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 27), !nosanitize [[META2]] +// REC-NEXT: [[TMP4:%.*]] = xor i1 [[TMP3]], true, !nosanitize [[META2]] +// REC-NEXT: [[TMP5:%.*]] = or i1 [[TMP0]], [[TMP4]], !nosanitize [[META2]] +// REC-NEXT: [[TMP6:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 41), !nosanitize [[META2]] +// REC-NEXT: [[TMP7:%.*]] = xor i1 [[TMP6]], true, !nosanitize [[META2]] +// REC-NEXT: [[TMP8:%.*]] = or i1 [[OR]], [[TMP7]], !nosanitize 
[[META2]] +// REC-NEXT: [[TMP9:%.*]] = and i1 [[TMP5]], [[TMP8]], !nosanitize [[META2]] +// REC-NEXT: br i1 [[TMP9]], label %[[CONT:.*]], label %[[HANDLER_DIVREM_OVERFLOW:.*]], !prof [[PROF3:![0-9]+]], !nosanitize [[META2]] // REC: [[HANDLER_DIVREM_OVERFLOW]]: -// REC-NEXT: [[TMP4:%.*]] = zext i32 [[X]] to i64, !nosanitize [[META2]] -// REC-NEXT: [[TMP5:%.*]] = zext i32 [[Y]] to i64, !nosanitize [[META2]] -// REC-NEXT: tail call void @__ubsan_handle_divrem_overflow(ptr nonnull @[[GLOB1:[0-9]+]], i64 [[TMP4]], i64 [[TMP5]]) #[[ATTR6:[0-9]+]], !nosanitize [[META2]] +// REC-NEXT: [[TMP10:%.*]] = zext i32 [[X]] to i64, !nosanitize [[META2]] +// REC-NEXT: [[TMP11:%.*]] = zext i32 [[Y]] to i64, !nosanitize [[META2]] +// REC-NEXT: tail call void @__ubsan_handle_divrem_overflow(ptr nonnull @[[GLOB1:[0-9]+]], i64 [[TMP10]], i64 [[TMP11]]) #[[ATTR6:[0-9]+]], !nosanitize [[META2]] // REC-NEXT: br label %[[CONT]], !nosanitize [[META2]] // REC: [[CONT]]: // REC-NEXT: [[DIV:%.*]] = sdiv i32 [[X]], [[Y]] @@ -70,21 +89,23 @@ int div(int x, int y) { // CHECK-SAME: ptr noundef readonly [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = icmp eq ptr [[X]], null, !nosanitize [[META2]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 22), !nosanitize [[META2]] +// +// 29 == SO_Null +// CHECK-NEXT: [[TMP1:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 29), !nosanitize [[META2]] // CHECK-NEXT: [[DOTNOT1:%.*]] = and i1 [[TMP0]], [[TMP1]] -// CHECK-NEXT: br i1 [[DOTNOT1]], label %[[HANDLER_TYPE_MISMATCH:.*]], label %[[CONT:.*]], !prof [[PROF3]], !nosanitize [[META2]] +// CHECK-NEXT: br i1 [[DOTNOT1]], label %[[HANDLER_TYPE_MISMATCH:.*]], label %[[CONT:.*]], !prof [[PROF4:![0-9]+]], !nosanitize [[META2]] // CHECK: [[HANDLER_TYPE_MISMATCH]]: // CHECK-NEXT: tail call void @__ubsan_handle_type_mismatch_v1_abort(ptr nonnull @[[GLOB2:[0-9]+]], i64 0) #[[ATTR6]], !nosanitize [[META2]] // CHECK-NEXT: unreachable, !nosanitize [[META2]] // CHECK: [[CONT]]: -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA4:![0-9]+]] +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA5:![0-9]+]] // CHECK-NEXT: ret i32 [[TMP2]] // // TR-LABEL: define dso_local i32 @null( // TR-SAME: ptr noundef readonly [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { // TR-NEXT: [[ENTRY:.*:]] // TR-NEXT: [[TMP0:%.*]] = icmp eq ptr [[X]], null, !nosanitize [[META2]] -// TR-NEXT: [[TMP1:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 22), !nosanitize [[META2]] +// TR-NEXT: [[TMP1:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 29), !nosanitize [[META2]] // TR-NEXT: [[DOTNOT1:%.*]] = and i1 [[TMP0]], [[TMP1]] // TR-NEXT: br i1 [[DOTNOT1]], label %[[TRAP:.*]], label %[[CONT:.*]], !nosanitize [[META2]] // TR: [[TRAP]]: @@ -98,14 +119,14 @@ int div(int x, int y) { // REC-SAME: ptr noundef readonly [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { // REC-NEXT: [[ENTRY:.*:]] // REC-NEXT: [[TMP0:%.*]] = icmp eq ptr [[X]], null, !nosanitize [[META2]] -// REC-NEXT: [[TMP1:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 22), !nosanitize [[META2]] +// REC-NEXT: [[TMP1:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 29), !nosanitize [[META2]] // REC-NEXT: [[DOTNOT1:%.*]] = and i1 [[TMP0]], [[TMP1]] -// REC-NEXT: br i1 [[DOTNOT1]], label %[[HANDLER_TYPE_MISMATCH:.*]], label %[[CONT:.*]], !prof [[PROF3]], !nosanitize [[META2]] +// REC-NEXT: br i1 [[DOTNOT1]], label %[[HANDLER_TYPE_MISMATCH:.*]], label %[[CONT:.*]], !prof [[PROF4:![0-9]+]], !nosanitize [[META2]] // REC: 
[[HANDLER_TYPE_MISMATCH]]: // REC-NEXT: tail call void @__ubsan_handle_type_mismatch_v1(ptr nonnull @[[GLOB2:[0-9]+]], i64 0) #[[ATTR6]], !nosanitize [[META2]] // REC-NEXT: br label %[[CONT]], !nosanitize [[META2]] // REC: [[CONT]]: -// REC-NEXT: [[TMP2:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA4:![0-9]+]] +// REC-NEXT: [[TMP2:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA5:![0-9]+]] // REC-NEXT: ret i32 [[TMP2]] // int null(int* x) { @@ -117,9 +138,11 @@ int null(int* x) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[X]], i32 [[Y]]), !nosanitize [[META2]] // CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, i1 } [[TMP0]], 1, !nosanitize [[META2]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 0), !nosanitize [[META2]] +// +// 41 == SO_SignedIntegerOverflow +// CHECK-NEXT: [[TMP2:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 41), !nosanitize [[META2]] // CHECK-NEXT: [[DOTDEMORGAN:%.*]] = and i1 [[TMP1]], [[TMP2]] -// CHECK-NEXT: br i1 [[DOTDEMORGAN]], label %[[HANDLER_ADD_OVERFLOW:.*]], label %[[CONT:.*]], !prof [[PROF3]], !nosanitize [[META2]] +// CHECK-NEXT: br i1 [[DOTDEMORGAN]], label %[[HANDLER_ADD_OVERFLOW:.*]], label %[[CONT:.*]], !prof [[PROF4]], !nosanitize [[META2]] // CHECK: [[HANDLER_ADD_OVERFLOW]]: // CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[X]] to i64, !nosanitize [[META2]] // CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[Y]] to i64, !nosanitize [[META2]] @@ -134,7 +157,7 @@ int null(int* x) { // TR-NEXT: [[ENTRY:.*:]] // TR-NEXT: [[TMP0:%.*]] = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[X]], i32 [[Y]]), !nosanitize [[META2]] // TR-NEXT: [[TMP1:%.*]] = extractvalue { i32, i1 } [[TMP0]], 1, !nosanitize [[META2]] -// TR-NEXT: [[TMP2:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 0), !nosanitize [[META2]] +// TR-NEXT: [[TMP2:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 41), !nosanitize [[META2]] // TR-NEXT: [[DOTDEMORGAN:%.*]] = and i1 [[TMP1]], [[TMP2]] // TR-NEXT: br i1 [[DOTDEMORGAN]], label %[[TRAP:.*]], label %[[CONT:.*]], !nosanitize [[META2]] // TR: [[TRAP]]: @@ -149,9 +172,9 @@ int null(int* x) { // REC-NEXT: [[ENTRY:.*:]] // REC-NEXT: [[TMP0:%.*]] = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[X]], i32 [[Y]]), !nosanitize [[META2]] // REC-NEXT: [[TMP1:%.*]] = extractvalue { i32, i1 } [[TMP0]], 1, !nosanitize [[META2]] -// REC-NEXT: [[TMP2:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 0), !nosanitize [[META2]] +// REC-NEXT: [[TMP2:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 41), !nosanitize [[META2]] // REC-NEXT: [[DOTDEMORGAN:%.*]] = and i1 [[TMP1]], [[TMP2]] -// REC-NEXT: br i1 [[DOTDEMORGAN]], label %[[HANDLER_ADD_OVERFLOW:.*]], label %[[CONT:.*]], !prof [[PROF3]], !nosanitize [[META2]] +// REC-NEXT: br i1 [[DOTDEMORGAN]], label %[[HANDLER_ADD_OVERFLOW:.*]], label %[[CONT:.*]], !prof [[PROF4]], !nosanitize [[META2]] // REC: [[HANDLER_ADD_OVERFLOW]]: // REC-NEXT: [[TMP3:%.*]] = zext i32 [[X]] to i64, !nosanitize [[META2]] // REC-NEXT: [[TMP4:%.*]] = zext i32 [[Y]] to i64, !nosanitize [[META2]] @@ -175,12 +198,14 @@ void use(double*); // CHECK-NEXT: call void @use(ptr noundef nonnull [[VLA]]) #[[ATTR7:[0-9]+]] // CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[I]] to i64 // CHECK-NEXT: [[TMP1:%.*]] = icmp ule i64 [[TMP0]], [[IDXPROM]] +// +// 71 == SO_LocalBounds // CHECK-NEXT: [[TMP2:%.*]] = call i1 @llvm.allow.ubsan.check(i8 71), !nosanitize [[META2]] // CHECK-NEXT: [[TMP3:%.*]] = and i1 [[TMP1]], [[TMP2]], !nosanitize [[META2]] // CHECK-NEXT: br 
i1 [[TMP3]], label %[[TRAP:.*]], label %[[BB4:.*]] // CHECK: [[BB4]]: // CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[VLA]], i64 [[IDXPROM]] -// CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA8:![0-9]+]] +// CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA9:![0-9]+]] // CHECK-NEXT: ret double [[TMP5]] // CHECK: [[TRAP]]: // CHECK-NEXT: call void @__ubsan_handle_local_out_of_bounds_abort() #[[ATTR6]], !nosanitize [[META2]] @@ -218,7 +243,7 @@ void use(double*); // REC-NEXT: br i1 [[TMP3]], label %[[TRAP:.*]], label %[[BB4:.*]] // REC: [[BB4]]: // REC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[VLA]], i64 [[IDXPROM]] -// REC-NEXT: [[TMP5:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA8:![0-9]+]] +// REC-NEXT: [[TMP5:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA9:![0-9]+]] // REC-NEXT: ret double [[TMP5]] // REC: [[TRAP]]: // REC-NEXT: call void @__ubsan_handle_local_out_of_bounds() #[[ATTR6]], !nosanitize [[META2]] @@ -232,13 +257,14 @@ double lbounds(int b, int i) { //. // CHECK: [[META2]] = !{} -// CHECK: [[PROF3]] = !{!"branch_weights", i32 1, i32 1048575} -// CHECK: [[TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} -// CHECK: [[META5]] = !{!"int", [[META6:![0-9]+]], i64 0} -// CHECK: [[META6]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0} -// CHECK: [[META7]] = !{!"Simple C/C++ TBAA"} -// CHECK: [[TBAA8]] = !{[[META9:![0-9]+]], [[META9]], i64 0} -// CHECK: [[META9]] = !{!"double", [[META6]], i64 0} +// CHECK: [[PROF3]] = !{!"branch_weights", i32 1048575, i32 1} +// CHECK: [[PROF4]] = !{!"branch_weights", i32 1, i32 1048575} +// CHECK: [[TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0} +// CHECK: [[META6]] = !{!"int", [[META7:![0-9]+]], i64 0} +// CHECK: [[META7]] = !{!"omnipotent char", [[META8:![0-9]+]], i64 0} +// CHECK: [[META8]] = !{!"Simple C/C++ TBAA"} +// CHECK: [[TBAA9]] = !{[[META10:![0-9]+]], [[META10]], i64 0} +// CHECK: [[META10]] = !{!"double", [[META7]], i64 0} //. // TR: [[META2]] = !{} // TR: [[TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} @@ -249,11 +275,12 @@ double lbounds(int b, int i) { // TR: [[META8]] = !{!"double", [[META5]], i64 0} //. // REC: [[META2]] = !{} -// REC: [[PROF3]] = !{!"branch_weights", i32 1, i32 1048575} -// REC: [[TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} -// REC: [[META5]] = !{!"int", [[META6:![0-9]+]], i64 0} -// REC: [[META6]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0} -// REC: [[META7]] = !{!"Simple C/C++ TBAA"} -// REC: [[TBAA8]] = !{[[META9:![0-9]+]], [[META9]], i64 0} -// REC: [[META9]] = !{!"double", [[META6]], i64 0} +// REC: [[PROF3]] = !{!"branch_weights", i32 1048575, i32 1} +// REC: [[PROF4]] = !{!"branch_weights", i32 1, i32 1048575} +// REC: [[TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0} +// REC: [[META6]] = !{!"int", [[META7:![0-9]+]], i64 0} +// REC: [[META7]] = !{!"omnipotent char", [[META8:![0-9]+]], i64 0} +// REC: [[META8]] = !{!"Simple C/C++ TBAA"} +// REC: [[TBAA9]] = !{[[META10:![0-9]+]], [[META10]], i64 0} +// REC: [[META10]] = !{!"double", [[META7]], i64 0} //. 
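Before the new ARM test that follows: the language facts it leans on are that a C++ empty class still has size one (so by default it consumes an argument slot), that C ignores empty structs for argument passing, and that a struct whose only member is a zero-length array has size zero as a GNU legacy extension, which is why `SuperEmpty` stays ignored even in C++. A standalone size check under those assumptions (GNU mode, hypothetical file, not part of the patch):

```
// Compile with -std=gnu++17: zero-length arrays are a GNU extension.
struct Empty {};
struct SuperEmpty { int arr[0]; };

static_assert(sizeof(Empty) == 1, "C++ empty class occupies one byte");
static_assert(sizeof(SuperEmpty) == 0, "zero-length array member: legacy sizeof of 0");
```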
diff --git clang/test/CodeGen/arm-empty-args.cpp clang/test/CodeGen/arm-empty-args.cpp new file mode 100644 index 000000000000..4e61c78b73ab --- /dev/null +++ clang/test/CodeGen/arm-empty-args.cpp @@ -0,0 +1,131 @@ +// RUN: %clang_cc1 -triple armv7a-linux-gnueabi -emit-llvm -o - -x c %s | FileCheck %s --check-prefixes=CHECK,C +// RUN: %clang_cc1 -triple armv7a-linux-gnueabi -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,CXX +// RUN: %clang_cc1 -triple armv7a-linux-gnueabi -emit-llvm -o - %s -fclang-abi-compat=19 | FileCheck %s --check-prefixes=CHECK,CXXCLANG19 +// RUN: %clang_cc1 -triple thumbv7k-apple-watchos2.0 -target-abi aapcs16 -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,WATCHOS + +// Empty structs are ignored for PCS purposes on WatchOS and in C mode +// elsewhere. In C++ mode they consume a register slot though. Functions are +// slightly bigger than minimal to make confirmation against actual GCC +// behaviour easier. + +#if __cplusplus +#define EXTERNC extern "C" +#else +#define EXTERNC +#endif + +struct Empty {}; + +// C: define{{.*}} i32 @empty_arg(i32 noundef %a) +// CXX: define{{.*}} i32 @empty_arg(i8 %e.coerce, i32 noundef %a) +// CXXCLANG19: define{{.*}} i32 @empty_arg(i32 noundef %a) +// WATCHOS: define{{.*}} i32 @empty_arg(i32 noundef %a) +EXTERNC int empty_arg(struct Empty e, int a) { + return a; +} + +// C: define{{.*}} void @empty_ret() +// CXX: define{{.*}} void @empty_ret() +// CXXCLANG19: define{{.*}} void @empty_ret() +// WATCHOS: define{{.*}} void @empty_ret() +EXTERNC struct Empty empty_ret(void) { + struct Empty e; + return e; +} + +// However, what counts as "empty" is a baroque mess. This is super-empty, it's +// ignored even in C++ mode. It also has sizeof == 0, violating C++, but that's +// legacy for you: + +struct SuperEmpty { + int arr[0]; +}; + +// C: define{{.*}} i32 @super_empty_arg(i32 noundef %a) +// CXX: define{{.*}} i32 @super_empty_arg(i32 noundef %a) +// CXXCLANG19: define{{.*}} i32 @super_empty_arg(i32 noundef %a) +// WATCHOS: define{{.*}} i32 @super_empty_arg(i32 noundef %a) +EXTERNC int super_empty_arg(struct SuperEmpty e, int a) { + return a; +} + +struct SortOfEmpty { + struct SuperEmpty e; +}; + +// C: define{{.*}} i32 @sort_of_empty_arg(i32 noundef %a) +// CXX: define{{.*}} i32 @sort_of_empty_arg(i8 %e.coerce, i32 noundef %a) +// CXXCLANG19: define{{.*}} i32 @sort_of_empty_arg(i32 noundef %a) +// WATCHOS: define{{.*}} i32 @sort_of_empty_arg(i32 noundef %a) +EXTERNC int sort_of_empty_arg(struct Empty e, int a) { + return a; +} + +// C: define{{.*}} void @sort_of_empty_ret() +// CXX: define{{.*}} void @sort_of_empty_ret() +// CXXCLANG19: define{{.*}} void @sort_of_empty_ret() +// WATCHOS: define{{.*}} void @sort_of_empty_ret() +EXTERNC struct SortOfEmpty sort_of_empty_ret(void) { + struct SortOfEmpty e; + return e; +} + +#include <stdarg.h> + +// va_arg matches the above rules, consuming an incoming argument in cases +// where one would be passed, and not doing so when the argument should be +// ignored. + +EXTERNC int empty_arg_variadic(int a, ...) 
{ +// CHECK-LABEL: @empty_arg_variadic( +// C: %argp.next = getelementptr inbounds i8, ptr %argp.cur, i32 4 +// C-NOT: {{ getelementptr }} +// CXX: %argp.next = getelementptr inbounds i8, ptr %argp.cur, i32 4 +// CXX: %argp.next2 = getelementptr inbounds i8, ptr %argp.cur1, i32 4 +// CXXCLANG19: %argp.next = getelementptr inbounds i8, ptr %argp.cur, i32 4 +// CXXCLANG19-NOT: {{ getelementptr }} +// WATCHOS: %argp.next = getelementptr inbounds i8, ptr %argp.cur, i32 4 +// WATCHOS-NOT: {{ getelementptr }} + va_list vl; + va_start(vl, a); + struct Empty b = va_arg(vl, struct Empty); + int c = va_arg(vl, int); + va_end(vl); + return c; +} + +EXTERNC int super_empty_arg_variadic(int a, ...) { +// CHECK-LABEL: @super_empty_arg_variadic( +// C: %argp.next = getelementptr inbounds i8, ptr %argp.cur, i32 4 +// C-NOT: {{ getelementptr }} +// CXX: %argp.next = getelementptr inbounds i8, ptr %argp.cur, i32 4 +// CXX-NOT: {{ getelementptr }} +// CXXCLANG19: %argp.next = getelementptr inbounds i8, ptr %argp.cur, i32 4 +// CXXCLANG19-NOT: {{ getelementptr }} +// WATCHOS: %argp.next = getelementptr inbounds i8, ptr %argp.cur, i32 4 +// WATCHOS-NOT: {{ getelementptr }} + va_list vl; + va_start(vl, a); + struct SuperEmpty b = va_arg(vl, struct SuperEmpty); + int c = va_arg(vl, int); + va_end(vl); + return c; +} + +EXTERNC int sort_of_empty_arg_variadic(int a, ...) { +// CHECK-LABEL: @sort_of_empty_arg_variadic( +// C: %argp.next = getelementptr inbounds i8, ptr %argp.cur, i32 4 +// C-NOT: {{ getelementptr }} +// CXX: %argp.next = getelementptr inbounds i8, ptr %argp.cur, i32 4 +// CXX-NOT: {{ getelementptr }} +// CXXCLANG19: %argp.next = getelementptr inbounds i8, ptr %argp.cur, i32 4 +// CXXCLANG19-NOT: {{ getelementptr }} +// WATCHOS: %argp.next = getelementptr inbounds i8, ptr %argp.cur, i32 4 +// WATCHOS-NOT: {{ getelementptr }} + va_list vl; + va_start(vl, a); + struct SortOfEmpty b = va_arg(vl, struct SortOfEmpty); + int c = va_arg(vl, int); + va_end(vl); + return c; +} diff --git clang/test/CodeGen/attr-counted-by.c clang/test/CodeGen/attr-counted-by.c index 71fc6c5da2b8..feb6f1543985 100644 --- clang/test/CodeGen/attr-counted-by.c +++ clang/test/CodeGen/attr-counted-by.c @@ -9,11 +9,13 @@ #endif #ifdef COUNTED_BY -#define __counted_by(member) __attribute__((__counted_by__(member))) +#define __counted_by(member) __attribute__((__counted_by__(member))) #else #define __counted_by(member) #endif +#define __bdos(P) __builtin_dynamic_object_size(P, 0) + #define DECLARE_FLEX_ARRAY(TYPE, NAME) \ struct { \ struct { } __empty_ ## NAME; \ @@ -66,7 +68,7 @@ struct anon_struct { // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3:![0-9]+]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB1:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8:[0-9]+]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB1:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR7:[0-9]+]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont3: // SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 @@ -112,11 +114,11 @@ void test1(struct annotated *p, int index, int val) { // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = 
load i32, ptr [[COUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[COUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[INDEX]], [[TMP0]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT6:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB3:[0-9]+]], i64 [[INDEX]]) #[[ATTR8]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB3:[0-9]+]], i64 [[INDEX]]) #[[ATTR7]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: cont3: +// SANITIZE-WITH-ATTR: cont6: // SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] // SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = tail call i32 @llvm.smax.i32(i32 [[COUNTED_BY_LOAD]], i32 0) @@ -127,11 +129,11 @@ void test1(struct annotated *p, int index, int val) { // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test2( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef captures(none) [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.smax.i32(i32 [[COUNTED_BY_LOAD]], i32 0) // NO-SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = shl i32 [[TMP0]], 2 -// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] // NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // NO-SANITIZE-WITH-ATTR-NEXT: ret void @@ -153,30 +155,30 @@ void test1(struct annotated *p, int index, int val) { // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void // void test2(struct annotated *p, size_t index) { - p->array[index] = __builtin_dynamic_object_size(p->array, 1); + p->array[index] = __bdos(p->array); } // SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -8589934592, 8589934589) i64 @test2_bdos( -// SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { +// SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITH-ATTR-NEXT: entry: // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 -// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64 -// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 2 -// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[COUNTED_BY_LOAD]], -1 -// SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = 
select i1 [[TMP2]], i64 [[TMP1]], i64 0 -// SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP3]] +// SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64 +// SANITIZE-WITH-ATTR-NEXT: [[FLEXIBLE_ARRAY_MEMBER_SIZE:%.*]] = shl nsw i64 [[COUNT]], 2 +// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp sgt i32 [[COUNTED_BY_LOAD]], -1 +// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = select i1 [[TMP0]], i64 [[FLEXIBLE_ARRAY_MEMBER_SIZE]], i64 0 +// SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP1]] // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -8589934592, 8589934589) i64 @test2_bdos( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 2 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[COUNTED_BY_LOAD]], -1 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 [[TMP1]], i64 0 -// NO-SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP3]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64 +// NO-SANITIZE-WITH-ATTR-NEXT: [[FLEXIBLE_ARRAY_MEMBER_SIZE:%.*]] = shl nsw i64 [[COUNT]], 2 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp sgt i32 [[COUNTED_BY_LOAD]], -1 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = select i1 [[TMP0]], i64 [[FLEXIBLE_ARRAY_MEMBER_SIZE]], i64 0 +// NO-SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP1]] // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test2_bdos( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { @@ -189,7 +191,7 @@ void test2(struct annotated *p, size_t index) { // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // size_t test2_bdos(struct annotated *p) { - return __builtin_dynamic_object_size(p->array, 1); + return __bdos(p->array); } // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test3( @@ -201,7 +203,7 @@ size_t test2_bdos(struct annotated *p) { // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[INDEX]], [[TMP0]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB4:[0-9]+]], i64 [[INDEX]]) #[[ATTR8]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB4:[0-9]+]], i64 [[INDEX]]) #[[ATTR7]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont3: // SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 @@ -236,11 +238,11 @@ size_t test2_bdos(struct annotated *p) { void test3(struct annotated *p, size_t index) { // This test differs from 'test2' by checking bdos on the whole array and not // just the FAM. 
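// Editor's aside (illustration only, not part of this change): the contrast
// with 'test2' is between querying the flexible array member and querying the
// enclosing object. A sketch, assuming a hypothetical struct shaped like
// 'struct annotated':
//
//   struct sketch { long pad; int count; int fam[] __counted_by(count); };
//
//   __bdos(s->fam);  // foldable to s->count * sizeof(int), clamped at zero
//   __bdos(s);       // whole-object query; not derivable from the type, so
//                    // it lowers to the constant -1 checked below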
- p->array[index] = __builtin_dynamic_object_size(p, 0); + p->array[index] = __bdos(p); } // SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test3_bdos( -// SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] { +// SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { // SANITIZE-WITH-ATTR-NEXT: entry: // SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // @@ -260,107 +262,122 @@ void test3(struct annotated *p, size_t index) { // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // size_t test3_bdos(struct annotated *p) { - return __builtin_dynamic_object_size(p, 0); + return __bdos(p); } // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test4( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]], i32 noundef [[FAM_IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITH-ATTR-NEXT: entry: -// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 -// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 -// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 -// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[COUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT4:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4 +// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[DOTCOUNTED_BY_LOAD]], 2 +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT1:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB5:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB5:[0-9]+]], i64 3) #[[ATTR7]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: cont4: -// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[COUNTED_BY_LOAD]], 2 -// SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = shl i32 [[COUNTED_BY_LOAD]], 2 -// SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 244 -// SANITIZE-WITH-ATTR-NEXT: [[TMP5:%.*]] = and i32 [[TMP4]], 252 -// SANITIZE-WITH-ATTR-NEXT: [[CONV1:%.*]] = select i1 [[TMP2]], i32 [[TMP5]], i32 0 -// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM]] -// SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV1]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA4]] -// SANITIZE-WITH-ATTR-NEXT: [[ADD:%.*]] = add nsw i32 [[INDEX]], 1 -// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM12:%.*]] = sext i32 [[ADD]] to i64 -// SANITIZE-WITH-ATTR-NEXT: [[TMP6:%.*]] = icmp ult i64 [[IDXPROM12]], [[TMP0]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP6]], label [[CONT19:%.*]], label [[HANDLER_OUT_OF_BOUNDS15:%.*]], !prof [[PROF3]], 
!nosanitize [[META2]] -// SANITIZE-WITH-ATTR: handler.out_of_bounds15: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB6:[0-9]+]], i64 [[IDXPROM12]]) #[[ATTR8]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: cont1: +// SANITIZE-WITH-ATTR-NEXT: [[FLEXIBLE_ARRAY_MEMBER_SIZE:%.*]] = shl i32 [[DOTCOUNTED_BY_LOAD]], 2 +// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 +// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP2]], label [[CONT12:%.*]], label [[HANDLER_OUT_OF_BOUNDS8:%.*]], !prof [[PROF3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: handler.out_of_bounds8: +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB6:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR7]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: cont12: +// SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = icmp sgt i32 [[DOTCOUNTED_BY_LOAD]], 2 +// SANITIZE-WITH-ATTR-NEXT: [[RESULT:%.*]] = add i32 [[FLEXIBLE_ARRAY_MEMBER_SIZE]], 244 +// SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = and i32 [[RESULT]], 252 +// SANITIZE-WITH-ATTR-NEXT: [[CONV2:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 0 +// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM]] +// SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV2]], ptr [[ARRAYIDX10]], align 4, !tbaa [[TBAA4]] +// SANITIZE-WITH-ATTR-NEXT: [[DOTNOT81:%.*]] = icmp eq i32 [[DOTCOUNTED_BY_LOAD]], 3 +// SANITIZE-WITH-ATTR-NEXT: br i1 [[DOTNOT81]], label [[HANDLER_OUT_OF_BOUNDS18:%.*]], label [[CONT19:%.*]], !prof [[PROF8:![0-9]+]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: handler.out_of_bounds18: +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB7:[0-9]+]], i64 4) #[[ATTR7]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont19: -// SANITIZE-WITH-ATTR-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[COUNTED_BY_LOAD]], 3 -// SANITIZE-WITH-ATTR-NEXT: [[TMP8:%.*]] = add i32 [[TMP3]], 240 -// SANITIZE-WITH-ATTR-NEXT: [[TMP9:%.*]] = and i32 [[TMP8]], 252 -// SANITIZE-WITH-ATTR-NEXT: [[CONV8:%.*]] = select i1 [[TMP7]], i32 [[TMP9]], i32 0 -// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds nuw [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM12]] -// SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV8]], ptr [[ARRAYIDX17]], align 4, !tbaa [[TBAA4]] -// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD21:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 -// SANITIZE-WITH-ATTR-NEXT: [[ADD27:%.*]] = add nsw i32 [[INDEX]], 2 -// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM28:%.*]] = sext i32 [[ADD27]] to i64 -// SANITIZE-WITH-ATTR-NEXT: [[TMP10:%.*]] = zext i32 [[COUNTED_BY_LOAD21]] to i64, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: [[TMP11:%.*]] = icmp ult i64 [[IDXPROM28]], [[TMP10]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP11]], label [[CONT35:%.*]], label [[HANDLER_OUT_OF_BOUNDS31:%.*]], !prof [[PROF3]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: handler.out_of_bounds31: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB7:[0-9]+]], i64 [[IDXPROM28]]) #[[ATTR8]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: [[ADD:%.*]] = add nsw i32 [[INDEX]], 1 +// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM31:%.*]] = sext i32 [[ADD]] to i64 
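+// (Editor's aside, illustration only: the constant-index guards above come
+// from '&p->array[3]' and '&p->array[4]' in the source below. Taking the
+// address allows one-past-the-end, so the check is 'idx <= count': for idx 3
+// that folds to 'count > 2' (the icmp ugt against 2), and once that holds the
+// idx-4 check can only fail when 'count == 3', which is why the handlers are
+// reached with the literal indices 3 and 4 rather than a runtime value.)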
+// SANITIZE-WITH-ATTR-NEXT: [[TMP5:%.*]] = icmp ult i64 [[IDXPROM31]], [[TMP0]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP5]], label [[CONT38:%.*]], label [[HANDLER_OUT_OF_BOUNDS34:%.*]], !prof [[PROF3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: handler.out_of_bounds34: +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB8:[0-9]+]], i64 [[IDXPROM31]]) #[[ATTR7]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: cont38: +// SANITIZE-WITH-ATTR-NEXT: [[TMP6:%.*]] = icmp sgt i32 [[DOTCOUNTED_BY_LOAD]], 3 +// SANITIZE-WITH-ATTR-NEXT: [[RESULT25:%.*]] = add i32 [[FLEXIBLE_ARRAY_MEMBER_SIZE]], 240 +// SANITIZE-WITH-ATTR-NEXT: [[TMP7:%.*]] = and i32 [[RESULT25]], 252 +// SANITIZE-WITH-ATTR-NEXT: [[CONV27:%.*]] = select i1 [[TMP6]], i32 [[TMP7]], i32 0 +// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds nuw [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM31]] +// SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV27]], ptr [[ARRAYIDX36]], align 4, !tbaa [[TBAA4]] +// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM42:%.*]] = sext i32 [[FAM_IDX]] to i64 +// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD44:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4 +// SANITIZE-WITH-ATTR-NEXT: [[TMP8:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD44]] to i64, !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: [[DOTNOT:%.*]] = icmp ugt i64 [[IDXPROM42]], [[TMP8]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: br i1 [[DOTNOT]], label [[HANDLER_OUT_OF_BOUNDS45:%.*]], label [[CONT46:%.*]], !prof [[PROF8]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: handler.out_of_bounds45: +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB9:[0-9]+]], i64 [[IDXPROM42]]) #[[ATTR7]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: cont46: +// SANITIZE-WITH-ATTR-NEXT: [[ADD59:%.*]] = add nsw i32 [[INDEX]], 2 +// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM60:%.*]] = sext i32 [[ADD59]] to i64 +// SANITIZE-WITH-ATTR-NEXT: [[TMP9:%.*]] = icmp ult i64 [[IDXPROM60]], [[TMP8]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP9]], label [[CONT67:%.*]], label [[HANDLER_OUT_OF_BOUNDS63:%.*]], !prof [[PROF3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: handler.out_of_bounds63: +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB10:[0-9]+]], i64 [[IDXPROM60]]) #[[ATTR7]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: cont35: -// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds nuw [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM28]] -// SANITIZE-WITH-ATTR-NEXT: [[TMP12:%.*]] = icmp sgt i32 [[FAM_IDX]], -1 -// SANITIZE-WITH-ATTR-NEXT: [[TMP13:%.*]] = sext i32 [[COUNTED_BY_LOAD21]] to i64 -// SANITIZE-WITH-ATTR-NEXT: [[TMP14:%.*]] = sext i32 [[FAM_IDX]] to i64 -// SANITIZE-WITH-ATTR-NEXT: [[TMP15:%.*]] = sub nsw i64 [[TMP13]], [[TMP14]] -// SANITIZE-WITH-ATTR-NEXT: [[TMP16:%.*]] = icmp sgt i64 [[TMP15]], -1 -// SANITIZE-WITH-ATTR-NEXT: [[TMP17:%.*]] = and i1 [[TMP12]], [[TMP16]] -// SANITIZE-WITH-ATTR-NEXT: [[DOTTR:%.*]] = trunc i64 [[TMP15]] to i32 -// SANITIZE-WITH-ATTR-NEXT: [[TMP18:%.*]] = shl i32 [[DOTTR]], 2 -// SANITIZE-WITH-ATTR-NEXT: [[TMP19:%.*]] = and i32 [[TMP18]], 252 -// SANITIZE-WITH-ATTR-NEXT: [[CONV23:%.*]] = select i1 [[TMP17]], i32 [[TMP19]], i32 0 -// 
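+// (Editor's aside, illustration only: the 'add 240' / 'and 252' that follow
+// encode '(count - 4) * sizeof(int)' truncated by the (unsigned char) cast in
+// the source: count*4 - 16 == count*4 + 240 (mod 256), and the 252 mask keeps
+// the low eight bits while preserving the multiple-of-4 shape. The select
+// then clamps the size to 0 whenever count <= 3.)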
SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV23]], ptr [[ARRAYIDX33]], align 4, !tbaa [[TBAA4]] +// SANITIZE-WITH-ATTR: cont67: +// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX65:%.*]] = getelementptr inbounds nuw [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM60]] +// SANITIZE-WITH-ATTR-NEXT: [[COUNT50:%.*]] = sext i32 [[DOTCOUNTED_BY_LOAD44]] to i64 +// SANITIZE-WITH-ATTR-NEXT: [[TMP10:%.*]] = sub nsw i64 [[COUNT50]], [[IDXPROM42]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP11:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP10]], i64 0) +// SANITIZE-WITH-ATTR-NEXT: [[DOTTR:%.*]] = trunc nuw i64 [[TMP11]] to i32 +// SANITIZE-WITH-ATTR-NEXT: [[CONV54:%.*]] = shl i32 [[DOTTR]], 2 +// SANITIZE-WITH-ATTR-NEXT: [[CONV55:%.*]] = and i32 [[CONV54]], 252 +// SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV55]], ptr [[ARRAYIDX65]], align 4, !tbaa [[TBAA4]] // SANITIZE-WITH-ATTR-NEXT: ret void // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test4( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]], i32 noundef [[FAM_IDX:%.*]]) local_unnamed_addr #[[ATTR1]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = shl i32 [[COUNTED_BY_LOAD]], 2 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 244 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[COUNTED_BY_LOAD]], 2 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = and i32 [[TMP1]], 252 -// NO-SANITIZE-WITH-ATTR-NEXT: [[CONV1:%.*]] = select i1 [[TMP2]], i32 [[TMP3]], i32 0 -// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 +// NO-SANITIZE-WITH-ATTR-NEXT: [[FLEXIBLE_ARRAY_MEMBER_SIZE:%.*]] = shl i32 [[COUNTED_BY_LOAD]], 2 +// NO-SANITIZE-WITH-ATTR-NEXT: [[RESULT:%.*]] = add i32 [[FLEXIBLE_ARRAY_MEMBER_SIZE]], 244 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp sgt i32 [[COUNTED_BY_LOAD]], 2 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = and i32 [[RESULT]], 252 +// NO-SANITIZE-WITH-ATTR-NEXT: [[CONV1:%.*]] = select i1 [[TMP0]], i32 [[TMP1]], i32 0 // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 -// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM]] -// NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV1]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] -// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD3:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = shl i32 [[COUNTED_BY_LOAD3]], 2 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], 240 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP6:%.*]] = icmp sgt i32 [[COUNTED_BY_LOAD3]], 3 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP7:%.*]] = and i32 [[TMP5]], 252 -// NO-SANITIZE-WITH-ATTR-NEXT: [[CONV5:%.*]] = select i1 [[TMP6]], i32 [[TMP7]], i32 0 +// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM]] +// NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV1]], ptr [[ARRAYIDX3]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD7:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 +// NO-SANITIZE-WITH-ATTR-NEXT: [[FLEXIBLE_ARRAY_MEMBER_SIZE9:%.*]] = shl i32 [[COUNTED_BY_LOAD7]], 2 
+// NO-SANITIZE-WITH-ATTR-NEXT: [[RESULT10:%.*]] = add i32 [[FLEXIBLE_ARRAY_MEMBER_SIZE9]], 240 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[COUNTED_BY_LOAD7]], 3 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = and i32 [[RESULT10]], 252 +// NO-SANITIZE-WITH-ATTR-NEXT: [[CONV12:%.*]] = select i1 [[TMP2]], i32 [[TMP3]], i32 0 // NO-SANITIZE-WITH-ATTR-NEXT: [[ADD:%.*]] = add nsw i32 [[INDEX]], 1 -// NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM7:%.*]] = sext i32 [[ADD]] to i64 -// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM7]] -// NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV5]], ptr [[ARRAYIDX8]], align 4, !tbaa [[TBAA2]] -// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD10:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP8:%.*]] = sext i32 [[COUNTED_BY_LOAD10]] to i64 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP9:%.*]] = sext i32 [[FAM_IDX]] to i64 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP10:%.*]] = sub nsw i64 [[TMP8]], [[TMP9]] -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP11:%.*]] = icmp sgt i64 [[TMP10]], -1 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP12:%.*]] = icmp sgt i32 [[FAM_IDX]], -1 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP13:%.*]] = and i1 [[TMP12]], [[TMP11]] -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOTTR:%.*]] = trunc i64 [[TMP10]] to i32 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP14:%.*]] = shl i32 [[DOTTR]], 2 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP15:%.*]] = and i32 [[TMP14]], 252 -// NO-SANITIZE-WITH-ATTR-NEXT: [[CONV12:%.*]] = select i1 [[TMP13]], i32 [[TMP15]], i32 0 -// NO-SANITIZE-WITH-ATTR-NEXT: [[ADD14:%.*]] = add nsw i32 [[INDEX]], 2 -// NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM15:%.*]] = sext i32 [[ADD14]] to i64 -// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM15]] -// NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV12]], ptr [[ARRAYIDX16]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM14:%.*]] = sext i32 [[ADD]] to i64 +// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM14]] +// NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV12]], ptr [[ARRAYIDX15]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM17:%.*]] = sext i32 [[FAM_IDX]] to i64 +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD20:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNT21:%.*]] = sext i32 [[COUNTED_BY_LOAD20]] to i64 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = sub nsw i64 [[COUNT21]], [[IDXPROM17]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP5:%.*]] = icmp sgt i64 [[TMP4]], -1 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP6:%.*]] = icmp sgt i32 [[FAM_IDX]], -1 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP7:%.*]] = and i1 [[TMP6]], [[TMP5]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[DOTTR:%.*]] = trunc i64 [[TMP4]] to i32 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP8:%.*]] = shl i32 [[DOTTR]], 2 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP9:%.*]] = and i32 [[TMP8]], 252 +// NO-SANITIZE-WITH-ATTR-NEXT: [[CONV26:%.*]] = select i1 [[TMP7]], i32 [[TMP9]], i32 0 +// NO-SANITIZE-WITH-ATTR-NEXT: [[ADD28:%.*]] = add nsw i32 [[INDEX]], 2 +// NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM29:%.*]] = sext i32 [[ADD28]] to i64 +// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM29]] +// NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV26]], ptr [[ARRAYIDX30]], align 4, !tbaa [[TBAA2]] // NO-SANITIZE-WITH-ATTR-NEXT: ret 
void // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test4( @@ -399,40 +416,44 @@ size_t test3_bdos(struct annotated *p) { // void test4(struct annotated *p, int index, int fam_idx) { // This tests calculating the size from a pointer inside the FAM. - p->array[index] = (unsigned char)__builtin_dynamic_object_size(&p->array[3], 1); - p->array[index + 1] = (unsigned char)__builtin_dynamic_object_size(&(p->array[4]), 1); - p->array[index + 2] = (unsigned char)__builtin_dynamic_object_size(&(p->array[fam_idx]), 1); + p->array[index] = (unsigned char)__bdos(&p->array[3]); + p->array[index + 1] = (unsigned char)__bdos(&(p->array[4])); + p->array[index + 2] = (unsigned char)__bdos(&(p->array[fam_idx])); } -// SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -17179869180, 17179869181) i64 @test4_bdos( -// SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, 8589934589) i64 @test4_bdos( +// SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITH-ATTR-NEXT: entry: -// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 -// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 -// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64 -// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = sext i32 [[INDEX]] to i64 -// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = sub nsw i64 [[TMP0]], [[TMP1]] -// SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = shl nsw i64 [[TMP2]], 2 -// SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = icmp sgt i64 [[TMP2]], -1 -// SANITIZE-WITH-ATTR-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[INDEX]], -1 -// SANITIZE-WITH-ATTR-NEXT: [[TMP6:%.*]] = and i1 [[TMP5]], [[TMP4]] -// SANITIZE-WITH-ATTR-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i64 [[TMP3]], i64 0 -// SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP7]] +// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 +// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4 +// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: [[DOTNOT:%.*]] = icmp ugt i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: br i1 [[DOTNOT]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], label [[CONT1:%.*]], !prof [[PROF8]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: handler.out_of_bounds: +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB11:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR7]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: cont1: +// SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = sext i32 [[DOTCOUNTED_BY_LOAD]] to i64 +// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = sub nsw i64 [[COUNT]], [[IDXPROM]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP1]], i64 0) +// SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2 +// SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP3]] // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -17179869180, 17179869181) i64 @test4_bdos( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]]) 
local_unnamed_addr #[[ATTR2]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = sext i32 [[INDEX]] to i64 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = sub nsw i64 [[TMP0]], [[TMP1]] -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = shl nsw i64 [[TMP2]], 2 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = icmp sgt i64 [[TMP2]], -1 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[INDEX]], -1 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP6:%.*]] = and i1 [[TMP5]], [[TMP4]] -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i64 [[TMP3]], i64 0 -// NO-SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP7]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = sub nsw i64 [[COUNT]], [[IDXPROM]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[RESULT:%.*]] = shl nsw i64 [[TMP0]], 2 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp sgt i64 [[TMP0]], -1 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[INDEX]], -1 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = and i1 [[TMP2]], [[TMP1]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 [[RESULT]], i64 0 +// NO-SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP4]] // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test4_bdos( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { @@ -445,7 +466,7 @@ void test4(struct annotated *p, int index, int fam_idx) { // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // size_t test4_bdos(struct annotated *p, int index) { - return __builtin_dynamic_object_size(&p->array[index], 1); + return __bdos(&p->array[index]); } // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test5( @@ -457,7 +478,7 @@ size_t test4_bdos(struct annotated *p, int index) { // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp ugt i64 [[DOTCOUNTED_BY_LOAD]], [[IDXPROM]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB8:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB12:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR7]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont3: // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 @@ -493,11 +514,11 @@ size_t test4_bdos(struct annotated *p, int index) { // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void // void test5(struct anon_struct *p, int index) { - p->array[index] = __builtin_dynamic_object_size(p, 1); + p->array[index] = __bdos(p); } // SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test5_bdos( -// SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { +// SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { // SANITIZE-WITH-ATTR-NEXT: entry: // 
SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // @@ -517,7 +538,7 @@ void test5(struct anon_struct *p, int index) { // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // size_t test5_bdos(struct anon_struct *p) { - return __builtin_dynamic_object_size(p, 1); + return __bdos(p); } // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test6( @@ -527,30 +548,30 @@ size_t test5_bdos(struct anon_struct *p) { // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i64, ptr [[COUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp ugt i64 [[COUNTED_BY_LOAD]], [[IDXPROM]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label [[CONT6:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB9:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB13:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR7]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: cont3: +// SANITIZE-WITH-ATTR: cont6: // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[TMP1]], i64 0, i64 [[IDXPROM]] -// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.smax.i64(i64 [[COUNTED_BY_LOAD]], i64 0) -// SANITIZE-WITH-ATTR-NEXT: [[DOTTR:%.*]] = trunc i64 [[TMP2]] to i32 -// SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = shl i32 [[DOTTR]], 2 +// SANITIZE-WITH-ATTR-NEXT: [[FLEXIBLE_ARRAY_MEMBER_SIZE:%.*]] = shl nuw i64 [[COUNTED_BY_LOAD]], 2 +// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.smax.i64(i64 [[FLEXIBLE_ARRAY_MEMBER_SIZE]], i64 0) +// SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = trunc i64 [[TMP2]] to i32 // SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA4]] // SANITIZE-WITH-ATTR-NEXT: ret void // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test6( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i64, ptr [[COUNTED_BY_GEP]], align 4 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.smax.i64(i64 [[COUNTED_BY_LOAD]], i64 0) -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOTTR:%.*]] = trunc i64 [[TMP0]] to i32 -// NO-SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = shl i32 [[DOTTR]], 2 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 +// NO-SANITIZE-WITH-ATTR-NEXT: [[FLEXIBLE_ARRAY_MEMBER_SIZE:%.*]] = shl nuw i64 [[COUNTED_BY_LOAD]], 2 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.smax.i64(i64 [[FLEXIBLE_ARRAY_MEMBER_SIZE]], i64 0) +// NO-SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32 // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] 
to i64 -// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[TMP1]], i64 0, i64 [[IDXPROM]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[TMP0]], i64 0, i64 [[IDXPROM]] // NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // NO-SANITIZE-WITH-ATTR-NEXT: ret void // @@ -573,26 +594,26 @@ size_t test5_bdos(struct anon_struct *p) { // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void // void test6(struct anon_struct *p, int index) { - p->array[index] = __builtin_dynamic_object_size(p->array, 1); + p->array[index] = __bdos(p->array); } -// SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, -3) i64 @test6_bdos( -// SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, -9223372036854775808) i64 @test6_bdos( +// SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITH-ATTR-NEXT: entry: // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i64, ptr [[COUNTED_BY_GEP]], align 4 -// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.smax.i64(i64 [[COUNTED_BY_LOAD]], i64 0) -// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2 -// SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP1]] +// SANITIZE-WITH-ATTR-NEXT: [[FLEXIBLE_ARRAY_MEMBER_SIZE:%.*]] = shl nuw i64 [[COUNTED_BY_LOAD]], 2 +// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.smax.i64(i64 [[FLEXIBLE_ARRAY_MEMBER_SIZE]], i64 0) +// SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP0]] // -// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, -3) i64 @test6_bdos( +// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, -9223372036854775808) i64 @test6_bdos( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i64, ptr [[COUNTED_BY_GEP]], align 4 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.smax.i64(i64 [[COUNTED_BY_LOAD]], i64 0) -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2 -// NO-SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP1]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[FLEXIBLE_ARRAY_MEMBER_SIZE:%.*]] = shl nuw i64 [[COUNTED_BY_LOAD]], 2 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.smax.i64(i64 [[FLEXIBLE_ARRAY_MEMBER_SIZE]], i64 0) +// NO-SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP0]] // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test6_bdos( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { @@ -605,7 +626,7 @@ void test6(struct anon_struct *p, int index) { // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // size_t test6_bdos(struct anon_struct *p) { - return __builtin_dynamic_object_size(p->array, 1); + return __bdos(p->array); } // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test7( @@ -618,12 +639,12 @@ size_t test6_bdos(struct anon_struct *p) { // SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP1]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP2]], label [[CONT7:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: 
handler.out_of_bounds: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB11:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB15:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR7]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont7: // SANITIZE-WITH-ATTR-NEXT: [[INTS:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 9 // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [0 x i8], ptr [[INTS]], i64 0, i64 [[IDXPROM]] -// SANITIZE-WITH-ATTR-NEXT: store i8 -1, ptr [[ARRAYIDX]], align 1, !tbaa [[TBAA8:![0-9]+]] +// SANITIZE-WITH-ATTR-NEXT: store i8 -1, ptr [[ARRAYIDX]], align 1, !tbaa [[TBAA9:![0-9]+]] // SANITIZE-WITH-ATTR-NEXT: ret void // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test7( @@ -654,11 +675,11 @@ size_t test6_bdos(struct anon_struct *p) { // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void // void test7(struct union_of_fams *p, int index) { - p->ints[index] = __builtin_dynamic_object_size(p, 1); + p->ints[index] = __bdos(p); } // SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test7_bdos( -// SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { +// SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { // SANITIZE-WITH-ATTR-NEXT: entry: // SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // @@ -678,33 +699,33 @@ void test7(struct union_of_fams *p, int index) { // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // size_t test7_bdos(struct union_of_fams *p) { - return __builtin_dynamic_object_size(p, 1); + return __bdos(p); } // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test8( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITH-ATTR-NEXT: entry: -// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 -// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i8, ptr [[COUNTED_BY_GEP]], align 4 +// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i8, ptr [[TMP0]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 -// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i8 [[COUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT7:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = zext i8 [[COUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP1]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP2]], label [[CONT14:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB12:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB16:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR7]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: 
cont7: +// SANITIZE-WITH-ATTR: cont14: // SANITIZE-WITH-ATTR-NEXT: [[INTS:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 9 // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [0 x i8], ptr [[INTS]], i64 0, i64 [[IDXPROM]] -// SANITIZE-WITH-ATTR-NEXT: store i8 [[COUNTED_BY_LOAD]], ptr [[ARRAYIDX]], align 1, !tbaa [[TBAA8]] +// SANITIZE-WITH-ATTR-NEXT: store i8 [[COUNTED_BY_LOAD]], ptr [[ARRAYIDX]], align 1, !tbaa [[TBAA9]] // SANITIZE-WITH-ATTR-NEXT: ret void // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test8( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: -// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 -// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i8, ptr [[COUNTED_BY_GEP]], align 4 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // NO-SANITIZE-WITH-ATTR-NEXT: [[INTS:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 9 +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i8, ptr [[TMP0]], align 4 // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i8], ptr [[INTS]], i64 0, i64 [[IDXPROM]] // NO-SANITIZE-WITH-ATTR-NEXT: store i8 [[COUNTED_BY_LOAD]], ptr [[ARRAYIDX]], align 1, !tbaa [[TBAA6]] @@ -729,24 +750,24 @@ size_t test7_bdos(struct union_of_fams *p) { // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void // void test8(struct union_of_fams *p, int index) { - p->ints[index] = __builtin_dynamic_object_size(p->ints, 1); + p->ints[index] = __bdos(p->ints); } // SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, 256) i64 @test8_bdos( -// SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITH-ATTR-NEXT: entry: -// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 -// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i8, ptr [[COUNTED_BY_GEP]], align 4 -// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i8 [[COUNTED_BY_LOAD]] to i64 -// SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP0]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i8, ptr [[TMP0]], align 4 +// SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = zext i8 [[COUNTED_BY_LOAD]] to i64 +// SANITIZE-WITH-ATTR-NEXT: ret i64 [[COUNT]] // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, 256) i64 @test8_bdos( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: -// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 -// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i8, ptr [[COUNTED_BY_GEP]], align 4 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i8 [[COUNTED_BY_LOAD]] to i64 -// NO-SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP0]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i8, ptr [[TMP0]], align 4 +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = zext i8 [[COUNTED_BY_LOAD]] to i64 +// 
NO-SANITIZE-WITH-ATTR-NEXT: ret i64 [[COUNT]] // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test8_bdos( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { @@ -759,7 +780,7 @@ void test8(struct union_of_fams *p, int index) { // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // size_t test8_bdos(struct union_of_fams *p) { - return __builtin_dynamic_object_size(p->ints, 1); + return __bdos(p->ints); } // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test9( @@ -772,12 +793,12 @@ size_t test8_bdos(struct union_of_fams *p) { // SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP1]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP2]], label [[CONT7:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB14:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB18:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR7]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont7: // SANITIZE-WITH-ATTR-NEXT: [[BYTES:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [0 x i8], ptr [[BYTES]], i64 0, i64 [[IDXPROM]] -// SANITIZE-WITH-ATTR-NEXT: store i8 -1, ptr [[ARRAYIDX]], align 1, !tbaa [[TBAA8]] +// SANITIZE-WITH-ATTR-NEXT: store i8 -1, ptr [[ARRAYIDX]], align 1, !tbaa [[TBAA9]] // SANITIZE-WITH-ATTR-NEXT: ret void // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test9( @@ -808,11 +829,11 @@ size_t test8_bdos(struct union_of_fams *p) { // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void // void test9(struct union_of_fams *p, int index) { - p->bytes[index] = (unsigned char)__builtin_dynamic_object_size(p, 1); + p->bytes[index] = (unsigned char)__bdos(p); } // SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test9_bdos( -// SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { +// SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { // SANITIZE-WITH-ATTR-NEXT: entry: // SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // @@ -832,37 +853,37 @@ void test9(struct union_of_fams *p, int index) { // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // size_t test9_bdos(struct union_of_fams *p) { - return __builtin_dynamic_object_size(p, 1); + return __bdos(p); } // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test10( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITH-ATTR-NEXT: entry: -// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 -// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 +// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[TMP0]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 -// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[COUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT7:%.*]], label 
[[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = zext i32 [[COUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP1]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP2]], label [[CONT14:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB15:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB19:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR7]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR: cont7: +// SANITIZE-WITH-ATTR: cont14: // SANITIZE-WITH-ATTR-NEXT: [[BYTES:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [0 x i8], ptr [[BYTES]], i64 0, i64 [[IDXPROM]] // SANITIZE-WITH-ATTR-NEXT: [[NARROW:%.*]] = tail call i32 @llvm.smax.i32(i32 [[COUNTED_BY_LOAD]], i32 0) // SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = trunc i32 [[NARROW]] to i8 -// SANITIZE-WITH-ATTR-NEXT: store i8 [[CONV]], ptr [[ARRAYIDX]], align 1, !tbaa [[TBAA8]] +// SANITIZE-WITH-ATTR-NEXT: store i8 [[CONV]], ptr [[ARRAYIDX]], align 1, !tbaa [[TBAA9]] // SANITIZE-WITH-ATTR-NEXT: ret void // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test10( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: -// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 -// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +// NO-SANITIZE-WITH-ATTR-NEXT: [[BYTES:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[TMP0]], align 4 // NO-SANITIZE-WITH-ATTR-NEXT: [[NARROW:%.*]] = tail call i32 @llvm.smax.i32(i32 [[COUNTED_BY_LOAD]], i32 0) // NO-SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = trunc i32 [[NARROW]] to i8 -// NO-SANITIZE-WITH-ATTR-NEXT: [[BYTES:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i8], ptr [[BYTES]], i64 0, i64 [[IDXPROM]] // NO-SANITIZE-WITH-ATTR-NEXT: store i8 [[CONV]], ptr [[ARRAYIDX]], align 1, !tbaa [[TBAA6]] @@ -887,26 +908,26 @@ size_t test9_bdos(struct union_of_fams *p) { // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void // void test10(struct union_of_fams *p, int index) { - p->bytes[index] = (unsigned char)__builtin_dynamic_object_size(p->bytes, 1); + p->bytes[index] = (unsigned char)__bdos(p->bytes); } // SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, 2147483648) i64 @test10_bdos( -// SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITH-ATTR-NEXT: entry: -// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 -// 
SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4
+// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
+// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[TMP0]], align 4
 // SANITIZE-WITH-ATTR-NEXT: [[NARROW:%.*]] = tail call i32 @llvm.smax.i32(i32 [[COUNTED_BY_LOAD]], i32 0)
-// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext nneg i32 [[NARROW]] to i64
-// SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP0]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = zext nneg i32 [[NARROW]] to i64
+// SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP1]]
 //
 // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, 2147483648) i64 @test10_bdos(
 // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR2]] {
 // NO-SANITIZE-WITH-ATTR-NEXT: entry:
-// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
-// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4
+// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
+// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[TMP0]], align 4
 // NO-SANITIZE-WITH-ATTR-NEXT: [[NARROW:%.*]] = tail call i32 @llvm.smax.i32(i32 [[COUNTED_BY_LOAD]], i32 0)
-// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext nneg i32 [[NARROW]] to i64
-// NO-SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP0]]
+// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = zext nneg i32 [[NARROW]] to i64
+// NO-SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP1]]
 //
 // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test10_bdos(
 // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0]] {
@@ -919,34 +940,44 @@ void test10(struct union_of_fams *p, int index) {
 // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1
 //
 size_t test10_bdos(struct union_of_fams *p) {
-  return __builtin_dynamic_object_size(p->bytes, 1);
+  return __bdos(p->bytes);
 }
 
 // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test11(
 // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SANITIZE-WITH-ATTR-NEXT: entry:
+// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
+// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4
 // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64
-// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
-// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4
-// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[COUNTED_BY_LOAD]] to i64, !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT6:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR: handler.out_of_bounds:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB16:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB20:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR7]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR: cont3:
+// SANITIZE-WITH-ATTR: cont6:
 // SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12
 // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM]]
-// SANITIZE-WITH-ATTR-NEXT: store i32 4, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA4]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[COUNTED_BY_LOAD]], -3
+// SANITIZE-WITH-ATTR-NEXT: [[FLEXIBLE_ARRAY_MEMBER_SIZE:%.*]] = shl i32 [[COUNTED_BY_LOAD]], 2
+// SANITIZE-WITH-ATTR-NEXT: [[RESULT:%.*]] = add i32 [[FLEXIBLE_ARRAY_MEMBER_SIZE]], 8
+// SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = select i1 [[TMP2]], i32 [[RESULT]], i32 0
+// SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA4]]
 // SANITIZE-WITH-ATTR-NEXT: ret void
 //
 // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test11(
-// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef writeonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] {
 // NO-SANITIZE-WITH-ATTR-NEXT: entry:
+// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
+// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4
+// NO-SANITIZE-WITH-ATTR-NEXT: [[FLEXIBLE_ARRAY_MEMBER_SIZE:%.*]] = shl i32 [[COUNTED_BY_LOAD]], 2
+// NO-SANITIZE-WITH-ATTR-NEXT: [[RESULT:%.*]] = add i32 [[FLEXIBLE_ARRAY_MEMBER_SIZE]], 8
+// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp sgt i32 [[COUNTED_BY_LOAD]], -3
+// NO-SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = select i1 [[TMP0]], i32 [[RESULT]], i32 0
 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12
 // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64
 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM]]
-// NO-SANITIZE-WITH-ATTR-NEXT: store i32 4, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
+// NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
 // NO-SANITIZE-WITH-ATTR-NEXT: ret void
 //
 // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test11(
@@ -955,44 +986,58 @@ size_t test10_bdos(struct union_of_fams *p) {
 // SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12
 // SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64
 // SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM]]
-// SANITIZE-WITHOUT-ATTR-NEXT: store i32 4, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
+// SANITIZE-WITHOUT-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
 // SANITIZE-WITHOUT-ATTR-NEXT: ret void
 //
 // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test11(
-// NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef writeonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // NO-SANITIZE-WITHOUT-ATTR-NEXT: entry:
 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12
 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64
 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM]]
-// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 4, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
+// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
 // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void
 //
 void test11(struct annotated *p, int index) {
-  p->array[index] = __builtin_dynamic_object_size(&p->count, 1);
+  p->array[index] = __bdos(&p->count);
 }
 
-// SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i64 @test11_bdos(
-// SANITIZE-WITH-ATTR-SAME: ptr noundef readnone captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR3]] {
+// SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -8589934584, 8589934597) i64 @test11_bdos(
+// SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SANITIZE-WITH-ATTR-NEXT: entry:
-// SANITIZE-WITH-ATTR-NEXT: ret i64 4
+// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
+// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4
+// SANITIZE-WITH-ATTR-NEXT: [[COUNT1:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64
+// SANITIZE-WITH-ATTR-NEXT: [[FLEXIBLE_ARRAY_MEMBER_SIZE:%.*]] = shl nsw i64 [[COUNT1]], 2
+// SANITIZE-WITH-ATTR-NEXT: [[RESULT:%.*]] = add nsw i64 [[FLEXIBLE_ARRAY_MEMBER_SIZE]], 8
+// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp sgt i32 [[COUNTED_BY_LOAD]], -3
+// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = select i1 [[TMP0]], i64 [[RESULT]], i64 0
+// SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP1]]
 //
-// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i64 @test11_bdos(
-// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readnone captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR3]] {
+// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -8589934584, 8589934597) i64 @test11_bdos(
+// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR2]] {
 // NO-SANITIZE-WITH-ATTR-NEXT: entry:
-// NO-SANITIZE-WITH-ATTR-NEXT: ret i64 4
+// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
+// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4
+// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNT1:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64
+// NO-SANITIZE-WITH-ATTR-NEXT: [[FLEXIBLE_ARRAY_MEMBER_SIZE:%.*]] = shl nsw i64 [[COUNT1]], 2
+// NO-SANITIZE-WITH-ATTR-NEXT: [[RESULT:%.*]] = add nsw i64 [[FLEXIBLE_ARRAY_MEMBER_SIZE]], 8
+// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp sgt i32 [[COUNTED_BY_LOAD]], -3
+// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = select i1 [[TMP0]], i64 [[RESULT]], i64 0
+// NO-SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP1]]
 //
-// SANITIZE-WITHOUT-ATTR-LABEL: define dso_local noundef i64 @test11_bdos(
-// SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test11_bdos(
+// SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SANITIZE-WITHOUT-ATTR-NEXT: entry:
-// SANITIZE-WITHOUT-ATTR-NEXT: ret i64 4
+// SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1
 //
-// NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local noundef i64 @test11_bdos(
-// NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR1]] {
+// NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test11_bdos(
+// NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR1]] {
 // NO-SANITIZE-WITHOUT-ATTR-NEXT: entry:
-// NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 4
+// NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1
 //
 size_t test11_bdos(struct annotated *p) {
-  return __builtin_dynamic_object_size(&p->count, 1);
+  return __bdos(&p->count);
 }
 
 struct {
@@ -1011,16 +1056,16 @@ struct hang {
 int test12_a, test12_b;
 
 // SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i32 @test12(
-// SANITIZE-WITH-ATTR-SAME: i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR4:[0-9]+]] {
+// SANITIZE-WITH-ATTR-SAME: i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] {
 // SANITIZE-WITH-ATTR-NEXT: entry:
 // SANITIZE-WITH-ATTR-NEXT: [[BAZ:%.*]] = alloca [[STRUCT_HANG:%.*]], align 4
-// SANITIZE-WITH-ATTR-NEXT: call void @llvm.lifetime.start.p0(i64 24, ptr nonnull [[BAZ]]) #[[ATTR9:[0-9]+]]
-// SANITIZE-WITH-ATTR-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(24) [[BAZ]], ptr noundef nonnull align 4 dereferenceable(24) @test12_bar, i64 24, i1 false), !tbaa.struct [[TBAA_STRUCT9:![0-9]+]]
+// SANITIZE-WITH-ATTR-NEXT: call void @llvm.lifetime.start.p0(i64 24, ptr nonnull [[BAZ]]) #[[ATTR8:[0-9]+]]
+// SANITIZE-WITH-ATTR-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(24) [[BAZ]], ptr noundef nonnull align 4 dereferenceable(24) @test12_bar, i64 24, i1 false), !tbaa.struct [[TBAA_STRUCT10:![0-9]+]]
 // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp ult i32 [[INDEX]], 6
 // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = zext i32 [[INDEX]] to i64
 // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label [[CONT:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR: handler.out_of_bounds:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB18:[0-9]+]], i64 [[TMP1]]) #[[ATTR8]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB22:[0-9]+]], i64 [[TMP1]]) #[[ATTR7]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR: cont:
 // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [6 x i32], ptr [[BAZ]], i64 0, i64 [[TMP1]]
@@ -1028,12 +1073,12 @@ int test12_a, test12_b;
 // SANITIZE-WITH-ATTR-NEXT: store i32 [[TMP2]], ptr @test12_b, align 4, !tbaa [[TBAA4]]
 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr @test12_foo, align 4
 // SANITIZE-WITH-ATTR-NEXT: [[DOTNOT:%.*]] = icmp eq i32 [[DOTCOUNTED_BY_LOAD]], 0
-// SANITIZE-WITH-ATTR-NEXT: br i1 [[DOTNOT]], label [[HANDLER_OUT_OF_BOUNDS4:%.*]], label [[HANDLER_TYPE_MISMATCH6:%.*]], !prof [[PROF10:![0-9]+]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: br i1 [[DOTNOT]], label [[HANDLER_OUT_OF_BOUNDS4:%.*]], label [[HANDLER_TYPE_MISMATCH6:%.*]], !prof [[PROF8]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR: handler.out_of_bounds4:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB20:[0-9]+]], i64 0) #[[ATTR8]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB24:[0-9]+]], i64 0) #[[ATTR7]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
 //
 // SANITIZE-WITH-ATTR: handler.type_mismatch6:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_type_mismatch_v1_abort(ptr nonnull @[[GLOB21:[0-9]+]], i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr @test12_foo, i64 4) to i64)) #[[ATTR8]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_type_mismatch_v1_abort(ptr nonnull @[[GLOB25:[0-9]+]], i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr @test12_foo, i64 4) to i64)) #[[ATTR7]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
 //
 // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i32 @test12(
@@ -1123,7 +1168,7 @@ struct test13_bar {
 // SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp ult i64 [[INDEX]], [[TMP1]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP2]], label [[CONT5:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR: handler.out_of_bounds:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB24:[0-9]+]], i64 [[INDEX]]) #[[ATTR8]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB28:[0-9]+]], i64 [[INDEX]]) #[[ATTR7]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR: cont5:
 // SANITIZE-WITH-ATTR-NEXT: [[REVMAP:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
@@ -1184,7 +1229,7 @@ struct test14_foo {
 // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR: handler.out_of_bounds:
 // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[IDX]] to i64
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB25:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB29:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR7]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR: cont3:
 // SANITIZE-WITH-ATTR-NEXT: ret i32 undef
@@ -1238,7 +1283,7 @@ int test14(int idx) {
 // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label [[CONT1:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR: handler.out_of_bounds:
 // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[IDX]] to i64
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB27:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB31:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR7]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR: cont1:
 // SANITIZE-WITH-ATTR-NEXT: ret i32 undef
@@ -1280,13 +1325,21 @@ int test15(int idx) {
   return foo.blah[idx];
 }
 
-// SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i64 @test19(
-// SANITIZE-WITH-ATTR-SAME: ptr noundef readnone captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR3]] {
+// SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test19(
+// SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SANITIZE-WITH-ATTR-NEXT: entry:
+// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 680
+// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4
+// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp ugt i32 [[DOTCOUNTED_BY_LOAD]], 1
+// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label [[CONT1:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR: handler.out_of_bounds:
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB32:[0-9]+]], i64 2) #[[ATTR7]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR: cont1:
 // SANITIZE-WITH-ATTR-NEXT: ret i64 -1
 //
-// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i64 @test19(
-// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readnone captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR3]] {
+// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test19(
+// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3]] {
 // NO-SANITIZE-WITH-ATTR-NEXT: entry:
 // NO-SANITIZE-WITH-ATTR-NEXT: ret i64 -1
 //
@@ -1302,11 +1355,11 @@ int test15(int idx) {
 //
 size_t test19(struct annotated *p) {
   // Avoid pointer arithmetic. It could lead to security issues.
-  return __builtin_dynamic_object_size(&(p + 42)->array[2], 1);
+  return __bdos(&(p + 42)->array[2]);
 }
 
 // SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i64 @test20(
-// SANITIZE-WITH-ATTR-SAME: ptr noundef readnone captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR3]] {
+// SANITIZE-WITH-ATTR-SAME: ptr noundef readnone captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR2]] {
 // SANITIZE-WITH-ATTR-NEXT: entry:
 // SANITIZE-WITH-ATTR-NEXT: ret i64 -1
 //
@@ -1327,11 +1380,11 @@ size_t test19(struct annotated *p) {
 //
 size_t test20(struct annotated *p) {
   // Avoid side-effects.
-  return __builtin_dynamic_object_size(&(++p)->array[2], 1);
+  return __bdos(&(++p)->array[2]);
 }
 
 // SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i64 @test21(
-// SANITIZE-WITH-ATTR-SAME: ptr noundef readnone captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR3]] {
+// SANITIZE-WITH-ATTR-SAME: ptr noundef readnone captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR2]] {
 // SANITIZE-WITH-ATTR-NEXT: entry:
 // SANITIZE-WITH-ATTR-NEXT: ret i64 -1
 //
@@ -1352,11 +1405,11 @@ size_t test20(struct annotated *p) {
 //
 size_t test21(struct annotated *p) {
   // Avoid side-effects.
-  return __builtin_dynamic_object_size(&(p++)->array[2], 1);
+  return __bdos(&(p++)->array[2]);
 }
 
 // SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i64 @test22(
-// SANITIZE-WITH-ATTR-SAME: ptr noundef readnone captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR3]] {
+// SANITIZE-WITH-ATTR-SAME: ptr noundef readnone captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR2]] {
 // SANITIZE-WITH-ATTR-NEXT: entry:
 // SANITIZE-WITH-ATTR-NEXT: ret i64 -1
 //
@@ -1377,11 +1430,11 @@ size_t test21(struct annotated *p) {
 //
 size_t test22(struct annotated *p) {
   // Avoid side-effects.
-  return __builtin_dynamic_object_size(&(--p)->array[2], 1);
+  return __bdos(&(--p)->array[2]);
 }
 
 // SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i64 @test23(
-// SANITIZE-WITH-ATTR-SAME: ptr noundef readnone captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR3]] {
+// SANITIZE-WITH-ATTR-SAME: ptr noundef readnone captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR2]] {
 // SANITIZE-WITH-ATTR-NEXT: entry:
 // SANITIZE-WITH-ATTR-NEXT: ret i64 -1
 //
@@ -1402,7 +1455,7 @@ size_t test22(struct annotated *p) {
 //
 size_t test23(struct annotated *p) {
   // Avoid side-effects.
-  return __builtin_dynamic_object_size(&(p--)->array[2], 1);
+  return __bdos(&(p--)->array[2]);
 }
 
 struct tests_foo {
@@ -1418,7 +1471,7 @@ struct tests_foo {
 // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp ugt i32 [[DOTCOUNTED_BY_LOAD]], 10
 // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label [[CONT4:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR: handler.out_of_bounds:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB28:[0-9]+]], i64 10) #[[ATTR8]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB33:[0-9]+]], i64 10) #[[ATTR7]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR: cont4:
 // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[VAR]], i64 84
@@ -1459,7 +1512,7 @@ int test24(int c, struct tests_foo *var) {
 // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[DOTCOUNTED_BY_LOAD]], 10
 // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT5:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR: handler.out_of_bounds:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB29:[0-9]+]], i64 10) #[[ATTR8]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB34:[0-9]+]], i64 10) #[[ATTR7]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR: cont5:
 // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 44
@@ -1511,7 +1564,7 @@ struct test26_foo {
 // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT5:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR: handler.out_of_bounds:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB30:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB35:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR7]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR: cont5:
 // SANITIZE-WITH-ATTR-NEXT: [[ARR:%.*]] = getelementptr inbounds nuw i8, ptr [[FOO]], i64 8
@@ -1582,7 +1635,7 @@ struct test27_foo {
 // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR: handler.out_of_bounds:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB32:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB37:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR7]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR: cont3:
 // SANITIZE-WITH-ATTR-NEXT: [[ENTRIES:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 24
@@ -1648,7 +1701,7 @@ struct test28_foo {
 // SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP3]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP4]], label [[CONT17:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR: handler.out_of_bounds:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB34:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB39:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR7]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR: cont17:
 // SANITIZE-WITH-ATTR-NEXT: [[ARR:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 12
@@ -1710,26 +1763,26 @@ struct annotated_struct_array {
 // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = zext i32 [[IDX1]] to i64
 // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR: handler.out_of_bounds:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB36:[0-9]+]], i64 [[TMP1]]) #[[ATTR8]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB41:[0-9]+]], i64 [[TMP1]]) #[[ATTR7]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR: cont3:
 // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [10 x ptr], ptr [[ANN]], i64 0, i64 [[TMP1]]
 // SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA23:![0-9]+]]
 // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 8
 // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4
-// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM15:%.*]] = sext i32 [[IDX2]] to i64
+// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM27:%.*]] = sext i32 [[IDX2]] to i64
 // SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = zext i32 [[COUNTED_BY_LOAD]] to i64, !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = icmp ult i64 [[IDXPROM15]], [[TMP3]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP4]], label [[CONT20:%.*]], label [[HANDLER_OUT_OF_BOUNDS16:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR: handler.out_of_bounds16:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB37:[0-9]+]], i64 [[IDXPROM15]]) #[[ATTR8]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = icmp ult i64 [[IDXPROM27]], [[TMP3]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP4]], label [[CONT32:%.*]], label [[HANDLER_OUT_OF_BOUNDS28:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR: handler.out_of_bounds28:
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB42:[0-9]+]], i64 [[IDXPROM27]]) #[[ATTR7]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR: cont20:
+// SANITIZE-WITH-ATTR: cont32:
 // SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 12
-// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds nuw [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM15]]
+// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds nuw [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM27]]
 // SANITIZE-WITH-ATTR-NEXT: [[TMP5:%.*]] = tail call i32 @llvm.smax.i32(i32 [[COUNTED_BY_LOAD]], i32 0)
 // SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = shl i32 [[TMP5]], 2
-// SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX18]], align 4, !tbaa [[TBAA4]]
+// SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX30]], align 4, !tbaa [[TBAA4]]
 // SANITIZE-WITH-ATTR-NEXT: ret void
 //
 // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test29(
@@ -1738,14 +1791,14 @@ struct annotated_struct_array {
 // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[IDX1]] to i64
 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x ptr], ptr [[ANN]], i64 0, i64 [[IDXPROM]]
 // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA20:![0-9]+]]
+// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 12
 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 8
 // NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4
 // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.smax.i32(i32 [[COUNTED_BY_LOAD]], i32 0)
 // NO-SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = shl i32 [[TMP1]], 2
-// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 12
-// NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM4:%.*]] = sext i32 [[IDX2]] to i64
-// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM4]]
-// NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX5]], align 4, !tbaa [[TBAA2]]
+// NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM8:%.*]] = sext i32 [[IDX2]] to i64
+// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM8]]
+// NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX9]], align 4, !tbaa [[TBAA2]]
 // NO-SANITIZE-WITH-ATTR-NEXT: ret void
 //
 // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test29(
@@ -1779,7 +1832,7 @@ struct annotated_struct_array {
 // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void
 //
 void test29(struct annotated_struct_array *ann, int idx1, int idx2) {
-  ann->ann_array[idx1]->array[idx2] = __builtin_dynamic_object_size(ann->ann_array[idx1]->array, 1);
+  ann->ann_array[idx1]->array[idx2] = __bdos(ann->ann_array[idx1]->array);
 }
 
 typedef struct {
@@ -1794,10 +1847,10 @@ struct test30_struct {
 };
 
 // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test30(
-// SANITIZE-WITH-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR4]] {
+// SANITIZE-WITH-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR3]] {
 // SANITIZE-WITH-ATTR-NEXT: entry:
 // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[IDX]] to i64, !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB39:[0-9]+]], i64 [[TMP0]]) #[[ATTR8]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB44:[0-9]+]], i64 [[TMP0]]) #[[ATTR7]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
 //
 // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test30(
@@ -1826,7 +1879,7 @@ struct test30_struct {
 // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void
 //
 void test30(struct test30_struct *ptr, int idx) {
-  ptr->pcpu_refcnt.__padding[idx] = __builtin_dynamic_object_size(ptr, 1);
+  ptr->pcpu_refcnt.__padding[idx] = __bdos(ptr);
 }
 
 struct test31_empty {};
@@ -1838,7 +1891,7 @@ struct test31_struct {
 };
 
 // SANITIZE-WITH-ATTR-LABEL: define dso_local i32 @test31(
-// SANITIZE-WITH-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR3]] {
+// SANITIZE-WITH-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR2]] {
 // SANITIZE-WITH-ATTR-NEXT: entry:
 // SANITIZE-WITH-ATTR-NEXT: ret i32 -1
 //
@@ -1858,5 +1911,302 @@ struct test31_struct {
 // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i32 -1
 //
 int test31(struct test31_struct *ptr, int idx) {
-  return __builtin_dynamic_object_size(ptr, 0);
+  return __bdos(ptr);
+}
+
+struct annotated_with_array {
+  unsigned long flags[42];
+  int count;
+  size_t array[] __counted_by(count);
+};
+
+// SANITIZE-WITH-ATTR-LABEL: define dso_local void @test32(
+// SANITIZE-WITH-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[IDX1:%.*]], i32 noundef [[IDX2:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// SANITIZE-WITH-ATTR-NEXT: entry:
+// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp ult i32 [[IDX2]], 43
+// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label [[CONT1:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR: handler.out_of_bounds:
+// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = zext i32 [[IDX2]] to i64, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB46:[0-9]+]], i64 [[TMP1]]) #[[ATTR7]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR: cont1:
+// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 336
+// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4
+// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM4:%.*]] = sext i32 [[IDX1]] to i64
+// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = zext i32 [[COUNTED_BY_LOAD]] to i64, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = icmp ult i64 [[IDXPROM4]], [[TMP2]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP3]], label [[CONT9:%.*]], label [[HANDLER_OUT_OF_BOUNDS5:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR: handler.out_of_bounds5:
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB48:[0-9]+]], i64 [[IDXPROM4]]) #[[ATTR7]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR: cont9:
+// SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 344
+// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [0 x i64], ptr [[ARRAY]], i64 0, i64 [[IDXPROM4]]
+// SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64
+// SANITIZE-WITH-ATTR-NEXT: [[FLEXIBLE_ARRAY_MEMBER_SIZE:%.*]] = shl nsw i64 [[COUNT]], 3
+// SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = shl nuw nsw i32 [[IDX2]], 3
+// SANITIZE-WITH-ATTR-NEXT: [[FIELD_OFFSET:%.*]] = zext nneg i32 [[TMP4]] to i64
+// SANITIZE-WITH-ATTR-NEXT: [[REASS_SUB:%.*]] = sub nsw i64 [[FLEXIBLE_ARRAY_MEMBER_SIZE]], [[FIELD_OFFSET]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.smax.i64(i64 [[REASS_SUB]], i64 -344)
+// SANITIZE-WITH-ATTR-NEXT: [[TMP6:%.*]] = add nsw i64 [[TMP5]], 344
+// SANITIZE-WITH-ATTR-NEXT: store i64 [[TMP6]], ptr [[ARRAYIDX7]], align 8, !tbaa [[TBAA25:![0-9]+]]
+// SANITIZE-WITH-ATTR-NEXT: ret void
+//
+// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test32(
+// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef captures(none) [[PTR:%.*]], i32 noundef [[IDX1:%.*]], i32 noundef [[IDX2:%.*]]) local_unnamed_addr #[[ATTR1]] {
+// NO-SANITIZE-WITH-ATTR-NEXT: entry:
+// NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[IDX2]] to i64
+// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 336
+// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4
+// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64
+// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = sub nsw i64 [[COUNT]], [[IDXPROM]]
+// NO-SANITIZE-WITH-ATTR-NEXT: [[REASS_SUB:%.*]] = shl nsw i64 [[TMP0]], 3
+// NO-SANITIZE-WITH-ATTR-NEXT: [[RESULT:%.*]] = add nsw i64 [[REASS_SUB]], 344
+// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp sgt i64 [[TMP0]], -44
+// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[IDX2]], -1
+// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = and i1 [[TMP2]], [[TMP1]]
+// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 [[RESULT]], i64 0
+// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 344
+// NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM1:%.*]] = sext i32 [[IDX1]] to i64
+// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [0 x i64], ptr [[ARRAY]], i64 0, i64 [[IDXPROM1]]
+// NO-SANITIZE-WITH-ATTR-NEXT: store i64 [[TMP4]], ptr [[ARRAYIDX2]], align 8, !tbaa [[TBAA22:![0-9]+]]
+// NO-SANITIZE-WITH-ATTR-NEXT: ret void
+//
+// SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test32(
+// SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[IDX1:%.*]], i32 noundef [[IDX2:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// SANITIZE-WITHOUT-ATTR-NEXT: entry:
+// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = icmp ult i32 [[IDX2]], 43
+// SANITIZE-WITHOUT-ATTR-NEXT: br i1 [[TMP0]], label [[CONT7:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF8]], !nosanitize [[META9]]
+// SANITIZE-WITHOUT-ATTR: handler.out_of_bounds:
+// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP1:%.*]] = zext i32 [[IDX2]] to i64, !nosanitize [[META9]]
+// SANITIZE-WITHOUT-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB17:[0-9]+]], i64 [[TMP1]]) #[[ATTR7]], !nosanitize [[META9]]
+// SANITIZE-WITHOUT-ATTR-NEXT: unreachable, !nosanitize [[META9]]
+// SANITIZE-WITHOUT-ATTR: cont7:
+// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 344
+// SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM4:%.*]] = sext i32 [[IDX1]] to i64
+// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [0 x i64], ptr [[ARRAY]], i64 0, i64 [[IDXPROM4]]
+// SANITIZE-WITHOUT-ATTR-NEXT: store i64 -1, ptr [[ARRAYIDX5]], align 8, !tbaa [[TBAA25:![0-9]+]]
+// SANITIZE-WITHOUT-ATTR-NEXT: ret void
+//
+// NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test32(
+// NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[IDX1:%.*]], i32 noundef [[IDX2:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry:
+// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 344
+// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM1:%.*]] = sext i32 [[IDX1]] to i64
+// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [0 x i64], ptr [[ARRAY]], i64 0, i64 [[IDXPROM1]]
+// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i64 -1, ptr [[ARRAYIDX2]], align 8, !tbaa [[TBAA22:![0-9]+]]
+// NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void
+//
+void test32(struct annotated_with_array *ptr, int idx1, int idx2) {
+  ptr->array[idx1] = __bdos(&ptr->flags[idx2]);
+}
+
+// SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, 17179869521) i64 @test32_bdos(
+// SANITIZE-WITH-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// SANITIZE-WITH-ATTR-NEXT: entry:
+// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp ult i32 [[INDEX]], 43
+// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label [[CONT1:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR: handler.out_of_bounds:
+// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = zext i32 [[INDEX]] to i64, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB49:[0-9]+]], i64 [[TMP1]]) #[[ATTR7]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR: cont1:
+// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 336
+// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4
+// SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64
+// SANITIZE-WITH-ATTR-NEXT: [[FLEXIBLE_ARRAY_MEMBER_SIZE:%.*]] = shl nsw i64 [[COUNT]], 3
+// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = shl nuw nsw i32 [[INDEX]], 3
+// SANITIZE-WITH-ATTR-NEXT: [[FIELD_OFFSET:%.*]] = zext nneg i32 [[TMP2]] to i64
+// SANITIZE-WITH-ATTR-NEXT: [[REASS_SUB:%.*]] = sub nsw i64 [[FLEXIBLE_ARRAY_MEMBER_SIZE]], [[FIELD_OFFSET]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = tail call i64 @llvm.smax.i64(i64 [[REASS_SUB]], i64 -344)
+// SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = add nsw i64 [[TMP3]], 344
+// SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP4]]
+//
+// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -34359738016, 34359738705) i64 @test32_bdos(
+// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[PTR:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// NO-SANITIZE-WITH-ATTR-NEXT: entry:
+// NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64
+// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 336
+// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4
+// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64
+// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = sub nsw i64 [[COUNT]], [[IDXPROM]]
+// NO-SANITIZE-WITH-ATTR-NEXT: [[REASS_SUB:%.*]] = shl nsw i64 [[TMP0]], 3
+// NO-SANITIZE-WITH-ATTR-NEXT: [[RESULT:%.*]] = add nsw i64 [[REASS_SUB]], 344
+// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp sgt i64 [[TMP0]], -44
+// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[INDEX]], -1
+// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = and i1 [[TMP2]], [[TMP1]]
+// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 [[RESULT]], i64 0
+// NO-SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP4]]
+//
+// SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test32_bdos(
+// SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// SANITIZE-WITHOUT-ATTR-NEXT: entry:
+// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = icmp ult i32 [[INDEX]], 43
+// SANITIZE-WITHOUT-ATTR-NEXT: br i1 [[TMP0]], label [[CONT1:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF8]], !nosanitize [[META9]]
+// SANITIZE-WITHOUT-ATTR: handler.out_of_bounds:
+// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP1:%.*]] = zext i32 [[INDEX]] to i64
+// SANITIZE-WITHOUT-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB18:[0-9]+]], i64 [[TMP1]]) #[[ATTR7]], !nosanitize [[META9]]
+// SANITIZE-WITHOUT-ATTR-NEXT: unreachable, !nosanitize [[META9]]
+// SANITIZE-WITHOUT-ATTR: cont1:
+// SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1
+//
+// NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test32_bdos(
+// NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone [[PTR:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] {
+// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry:
+// NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1
+//
+size_t test32_bdos(struct annotated_with_array *ptr, int index) {
+  return __bdos(&ptr->flags[index]);
+}
+
+// SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test33(
+// SANITIZE-WITH-ATTR-SAME: ptr noundef [[PTR:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// SANITIZE-WITH-ATTR-NEXT: entry:
+// SANITIZE-WITH-ATTR-NEXT: ret i64 -1
+//
+// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test33(
+// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[PTR:%.*]]) local_unnamed_addr #[[ATTR3]] {
+// NO-SANITIZE-WITH-ATTR-NEXT: entry:
+// NO-SANITIZE-WITH-ATTR-NEXT: ret i64 -1
+//
+// SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test33(
+// SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[PTR:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// SANITIZE-WITHOUT-ATTR-NEXT: entry:
+// SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1
+//
+// NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test33(
+// NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone [[PTR:%.*]]) local_unnamed_addr #[[ATTR1]] {
+// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry:
+// NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1
+//
+size_t test33(struct annotated *ptr) {
+  // Don't handle '&ptr->array' like normal.
+  return __bdos(&*&*&*&ptr->array);
+}
+
+struct multi_subscripts {
+  unsigned long flags[42][42];
+  int count;
+  int array[] __counted_by(count);
+};
+
+// SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test34(
+// SANITIZE-WITH-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[IDX1:%.*]], i32 noundef [[IDX2:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// SANITIZE-WITH-ATTR-NEXT: entry:
+// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp ult i32 [[IDX1]], 42
+// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label [[CONT1:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR: handler.out_of_bounds:
+// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = zext i32 [[IDX1]] to i64, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB51:[0-9]+]], i64 [[TMP1]]) #[[ATTR7]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR: cont1:
+// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp ult i32 [[IDX2]], 43
+// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP2]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS2:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR: handler.out_of_bounds2:
+// SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = zext i32 [[IDX2]] to i64
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB52:[0-9]+]], i64 [[TMP3]]) #[[ATTR7]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR: cont3:
+// SANITIZE-WITH-ATTR-NEXT: ret i64 -1
+//
+// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test34(
+// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[PTR:%.*]], i32 noundef [[IDX1:%.*]], i32 noundef [[IDX2:%.*]]) local_unnamed_addr #[[ATTR3]] {
+// NO-SANITIZE-WITH-ATTR-NEXT: entry:
+// NO-SANITIZE-WITH-ATTR-NEXT: ret i64 -1
+//
+// SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test34(
+// SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[IDX1:%.*]], i32 noundef [[IDX2:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// SANITIZE-WITHOUT-ATTR-NEXT: entry:
+// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = icmp ult i32 [[IDX1]], 42
+// SANITIZE-WITHOUT-ATTR-NEXT: br i1 [[TMP0]], label [[CONT1:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF8]], !nosanitize [[META9]]
+// SANITIZE-WITHOUT-ATTR: handler.out_of_bounds:
+// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP1:%.*]] = zext i32 [[IDX1]] to i64, !nosanitize [[META9]]
+// SANITIZE-WITHOUT-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB20:[0-9]+]], i64 [[TMP1]]) #[[ATTR7]], !nosanitize [[META9]]
+// SANITIZE-WITHOUT-ATTR-NEXT: unreachable, !nosanitize [[META9]]
+// SANITIZE-WITHOUT-ATTR: cont1:
+// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP2:%.*]] = icmp ult i32 [[IDX2]], 43
+// SANITIZE-WITHOUT-ATTR-NEXT: br i1 [[TMP2]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS2:%.*]], !prof [[PROF8]], !nosanitize [[META9]]
+// SANITIZE-WITHOUT-ATTR: handler.out_of_bounds2:
+// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP3:%.*]] = zext i32 [[IDX2]] to i64
+// SANITIZE-WITHOUT-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB21:[0-9]+]], i64 [[TMP3]]) #[[ATTR7]], !nosanitize [[META9]]
+// SANITIZE-WITHOUT-ATTR-NEXT: unreachable, !nosanitize [[META9]]
+// SANITIZE-WITHOUT-ATTR: cont3:
+// SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1
+//
+// NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test34(
+// NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone [[PTR:%.*]], i32 noundef [[IDX1:%.*]], i32 noundef [[IDX2:%.*]]) local_unnamed_addr #[[ATTR1]] {
+// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry:
+// NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1
+//
+size_t test34(struct multi_subscripts *ptr, int idx1, int idx2) {
+  return __bdos(&ptr->flags[idx1][idx2]);
+}
+
+// SANITIZE-WITH-ATTR-LABEL: define dso_local void @test35(
+// SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// SANITIZE-WITH-ATTR-NEXT: entry:
+// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
+// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4
+// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[INDEX]], [[TMP0]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR: handler.out_of_bounds:
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB53:[0-9]+]], i64 [[INDEX]]) #[[ATTR7]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR: cont3:
+// SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12
+// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]]
+// SANITIZE-WITH-ATTR-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA4]]
+// SANITIZE-WITH-ATTR-NEXT: ret void
+//
+// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test35(
+// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef writeonly captures(none) [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// NO-SANITIZE-WITH-ATTR-NEXT: entry:
+// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12
+// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]]
+// NO-SANITIZE-WITH-ATTR-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
+// NO-SANITIZE-WITH-ATTR-NEXT: ret void
+//
+// SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test35(
+// SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// SANITIZE-WITHOUT-ATTR-NEXT: entry:
+// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12
+// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]]
+// SANITIZE-WITHOUT-ATTR-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
+// SANITIZE-WITHOUT-ATTR-NEXT: ret void
+//
+// NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test35(
+// NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef writeonly captures(none) [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry:
+// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12
+// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]]
+// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
+// NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void
+//
+void test35(struct annotated *p, size_t index) {
+  p->array[index] = __bdos(&p->array[-42]);
+}
+
+// SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i64 @test35_bdos(
+// SANITIZE-WITH-ATTR-SAME: ptr noundef readnone captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// SANITIZE-WITH-ATTR-NEXT: entry:
+// SANITIZE-WITH-ATTR-NEXT: ret i64 0
+//
+// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i64 @test35_bdos(
+// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readnone captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR3]] {
+// NO-SANITIZE-WITH-ATTR-NEXT: entry:
+// NO-SANITIZE-WITH-ATTR-NEXT: ret i64 0
+//
+// SANITIZE-WITHOUT-ATTR-LABEL: define dso_local noundef i64 @test35_bdos(
+// SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// SANITIZE-WITHOUT-ATTR-NEXT: entry:
+// SANITIZE-WITHOUT-ATTR-NEXT: ret i64 0
+//
+// NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local noundef i64 @test35_bdos(
+// NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR1]] {
+// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry:
+// NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 0
+//
+size_t test35_bdos(struct annotated *p) {
+  return __bdos(&p->array[-42]);
 }
diff --git clang/test/CodeGen/blocks-windows.c clang/test/CodeGen/blocks-windows.c
index 4379cd2e6b63..3da5f3d99bd2 100644
--- clang/test/CodeGen/blocks-windows.c
+++ clang/test/CodeGen/blocks-windows.c
@@ -2,43 +2,43 @@
 // RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DEFN -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DEFN
 // RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS
 // RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_EXTERN -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN
-// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLCOKS_NOT_IN_BLOCKS_EXTERN_DLLIMPORT -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN-DLLIMPORT
-// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLCOKS_NOT_IN_BLOCKS_DLLIMPORT -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-DLLIMPORT
+// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_EXTERN_DLLIMPORT -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN-DLLIMPORT
+// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_DLLIMPORT -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-DLLIMPORT
 // RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DECL -Os -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DECL
 // RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DEFN -Os -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DEFN
 // RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -Os -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS
 // RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_EXTERN -Os -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN
-// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLCOKS_NOT_IN_BLOCKS_EXTERN_DLLIMPORT -Os -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN-DLLIMPORT
-// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLCOKS_NOT_IN_BLOCKS_DLLIMPORT -Os -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-DLLIMPORT
+// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_EXTERN_DLLIMPORT -Os -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN-DLLIMPORT
+// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_DLLIMPORT -Os -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-DLLIMPORT
 // RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DECL -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DECL
 // RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DEFN -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DEFN
 // RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS
 // RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_EXTERN -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN
-// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLCOKS_NOT_IN_BLOCKS_EXTERN_DLLIMPORT -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN-DLLIMPORT
-// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLCOKS_NOT_IN_BLOCKS_DLLIMPORT -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-DLLIMPORT
+// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_EXTERN_DLLIMPORT -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN-DLLIMPORT
+// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_DLLIMPORT -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-DLLIMPORT
 // RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DECL -Os -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DECL
 // RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DEFN -Os -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DEFN
 // RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -Os -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS
 // RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_EXTERN -Os -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN
-// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLCOKS_NOT_IN_BLOCKS_EXTERN_DLLIMPORT -Os -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN-DLLIMPORT
-// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLCOKS_NOT_IN_BLOCKS_DLLIMPORT -Os -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-DLLIMPORT
+// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_EXTERN_DLLIMPORT -Os -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN-DLLIMPORT
+// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_DLLIMPORT -Os -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-DLLIMPORT
 // RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DECL -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DECL
 // RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DEFN -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DEFN
 // RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS
 // RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_EXTERN -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN
-// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLCOKS_NOT_IN_BLOCKS_EXTERN_DLLIMPORT -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN-DLLIMPORT
-// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLCOKS_NOT_IN_BLOCKS_DLLIMPORT -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-DLLIMPORT
+// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_EXTERN_DLLIMPORT -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN-DLLIMPORT
+// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_DLLIMPORT -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-DLLIMPORT
 // RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DECL -Os -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DECL
 // RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DEFN -Os -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DEFN
 // RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -Os -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS
 // RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_EXTERN -Os -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN
-// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLCOKS_NOT_IN_BLOCKS_EXTERN_DLLIMPORT -Os -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN-DLLIMPORT
-// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLCOKS_NOT_IN_BLOCKS_DLLIMPORT -Os -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-DLLIMPORT
+// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_EXTERN_DLLIMPORT -Os -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN-DLLIMPORT
+// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_DLLIMPORT -Os -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-DLLIMPORT
 
 void *_Block_copy(void *);
diff --git clang/test/CodeGen/import-call-optimization.c clang/test/CodeGen/import-call-optimization.c
new file mode 100644
index 000000000000..cc4e37fda7bb
--- /dev/null
+++ clang/test/CodeGen/import-call-optimization.c
@@ -0,0 +1,5 @@
+// RUN: %clang_cc1 -import-call-optimization -emit-llvm %s -o - | FileCheck %s
+
+void f(void) {}
+
+// CHECK: !"import-call-optimization", i32 1}
diff --git clang/test/CodeGen/tbaa-pointers.c clang/test/CodeGen/tbaa-pointers.c
index 4aae2552f107..48adac503357 100644
--- clang/test/CodeGen/tbaa-pointers.c
+++ clang/test/CodeGen/tbaa-pointers.c
@@ -208,12 +208,9 @@ int void_ptrs(void **ptr) {
 // COMMON-LABEL: define i32 @void_ptrs(
 // COMMON-SAME: ptr noundef [[PTRA:%.+]])
 // COMMON: [[PTR_ADDR:%.+]] = alloca ptr, align 8
-// DISABLE-NEXT: store ptr [[PTRA]], ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]]
-// DISABLE-NEXT: [[L0:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]]
-// DISABLE-NEXT: [[L1:%.+]] = load ptr, ptr [[L0]], align 8, !tbaa [[ANYPTR]]
-// DEFAULT-NEXT: store ptr [[PTRA]], ptr [[PTR_ADDR]], align 8, !tbaa [[P2VOID:!.+]]
-// DEFAULT-NEXT: [[L0:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[P2VOID]]
-// DEFAULT-NEXT: [[L1:%.+]] = load ptr, ptr [[L0]], align 8, !tbaa [[P1VOID:!.+]]
+// COMMON-NEXT: store ptr [[PTRA]], ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]]
+// COMMON-NEXT: [[L0:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]]
+// COMMON-NEXT: [[L1:%.+]] = load ptr, ptr [[L0]], align 8, !tbaa [[ANYPTR]]
 // COMMON-NEXT: [[BOOL:%.+]] = icmp ne ptr [[L1]], null
 // COMMON-NEXT: [[BOOL_EXT:%.+]] = zext i1 [[BOOL]] to i64
 // COMMON-NEXT: [[COND:%.+]] = select i1 [[BOOL]], i32 0, i32 1
@@ -254,7 +251,3 @@ int void_ptrs(void **ptr) {
 // COMMON: [[INT_TAG]] = !{[[INT_TY:!.+]], [[INT_TY]], i64 0}
 // COMMON: [[INT_TY]] = !{!"int", [[CHAR]], i64 0}
 // DEFAULT: [[ANYPTR]] = !{[[ANY_POINTER]], [[ANY_POINTER]], i64 0}
-// DEFAULT: [[P2VOID]] = !{[[P2VOID_TY:!.+]], [[P2VOID_TY]], i64 0}
-// DEFAULT: [[P2VOID_TY]] = !{!"p2 void", [[ANY_POINTER]], i64 0}
-// DEFAULT: [[P1VOID]] = !{[[P1VOID_TY:!.+]], [[P1VOID_TY]], i64 0}
-// DEFAULT: [[P1VOID_TY]] = !{!"p1 void", [[ANY_POINTER]], i64 0}
diff --git clang/test/CodeGen/xfail-alloc-align-fn-pointers.cpp clang/test/CodeGen/xfail-alloc-align-fn-pointers.cpp
new file mode 100644
index 000000000000..80067500284b
--- /dev/null
+++ clang/test/CodeGen/xfail-alloc-align-fn-pointers.cpp
@@ -0,0 +1,10 @@
+
+// RUN: %clang_cc1 %s
+
+// FIXME: These should not crash!
+// XFAIL: *
+
+void aa_fn_ptr(char* (*member)(char*) __attribute__((alloc_align(1))));
+
+struct Test;
+void aa_member_fn_ptr(char* (Test::*member)(char*) __attribute__((alloc_align(1))));
diff --git clang/test/CodeGenCXX/aarch64-ms-mangle-mfp8.cpp clang/test/CodeGenCXX/aarch64-ms-mangle-mfp8.cpp
new file mode 100644
index 000000000000..b5fd9171ad81
--- /dev/null
+++ clang/test/CodeGenCXX/aarch64-ms-mangle-mfp8.cpp
@@ -0,0 +1,14 @@
+// RUN: %clang_cc1 -triple aarch64-windows-msvc -emit-llvm -o - %s | FileCheck %s
+
+typedef __mfp8 mf8;
+typedef __attribute__((neon_vector_type(8))) __mfp8 mf8x8_t;
+typedef __attribute__((neon_vector_type(16))) __mfp8 mf8x16_t;
+
+// CHECK: "?f@@YAXU__mfp8@__clang@@@Z"
+void f(mf8 v) {}
+
+// CHECK: "?f@@YAXT?$__vector@U__mfp8@__clang@@$07@__clang@@@Z"
+void f(mf8x8_t v) {}
+
+// CHECK: "?f@@YAXT?$__vector@U__mfp8@__clang@@$0BA@@__clang@@@Z"
+void f(mf8x16_t v) {}
diff --git clang/test/CodeGenCXX/template-param-objects.cpp clang/test/CodeGenCXX/template-param-objects.cpp
index 11ebd21521e8..ff6acc438d13 100644
--- clang/test/CodeGenCXX/template-param-objects.cpp
+++ clang/test/CodeGenCXX/template-param-objects.cpp
@@ -5,6 +5,9 @@ struct S { char buf[32]; };
 template<S s> constexpr const char *begin() { return s.buf; }
 template<S s> constexpr const char *end() { return s.buf + __builtin_strlen(s.buf); }
 
+namespace { struct T { char buf[32]; }; }
+template<T t> constexpr const char* begin_anon() { return t.buf; }
+
 // ITANIUM: [[HELLO:@_ZTAXtl1StlA32_cLc104ELc101ELc108ELc108ELc111ELc32ELc119ELc111ELc114ELc108ELc100EEEE]]
 // MSABI: [[HELLO:@"[?][?]__N2US@@3D0GI@@0GF@@0GM@@0GM@@0GP@@0CA@@0HH@@0GP@@0HC@@0GM@@0GE@@0A@@0A@@0A@@0A@@0A@@0A@@0A@@0A@@0A@@0A@@0A@@0A@@0A@@0A@@0A@@0A@@0A@@0A@@0A@@0A@@0A@@@@@"]]
 // ITANIUM-SAME: = linkonce_odr constant { <{ [11 x i8], [21 x i8] }> } { <{ [11 x i8], [21 x i8] }> <{ [11 x i8] c"hello world", [21 x i8] zeroinitializer }> }, comdat
@@ -19,3 +22,10 @@ const char *p = begin<S{"hello world"}>();
 // MSABI: @"?q@@3PEBDEB"
 // CHECK-SAME: global ptr getelementptr (i8, ptr [[HELLO]], i64 11)
 const char *q = end<S{"hello world"}>();
+
+
+// CHECK: internal constant { <{ [10 x i8], [22 x i8] }> } { <{ [10 x i8], [22 x i8] }> <{ [10 x i8] c"hello anon", [22 x i8] zeroinitializer }> }
+// CHECK-NOT: comdat
+// ITANIUM: @r
+// MSABI: @"?r@@3PEBDEB"
+const char *r = begin_anon<T{"hello anon"}>();
diff --git clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
index 5599f4dd50f0..ace34dd0ca6d 100644
--- clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
+++ clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
@@ -651,7 +651,7 @@ kernel void test_target_features_kernel(global int *i) {
 //
 // GFX900: Function Attrs: convergent nounwind
 // GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_3_kernel
-// GFX900-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) #[[ATTR6]] !kernel_arg_addr_space [[META28:![0-9]+]] !kernel_arg_access_qual [[META29:![0-9]+]] !kernel_arg_type [[META30:![0-9]+]] !kernel_arg_base_type [[META30]] !kernel_arg_type_qual [[META31:![0-9]+]] {
+// GFX900-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) #[[ATTR6]] !kernel_arg_addr_space [[META27:![0-9]+]] !kernel_arg_access_qual [[META28:![0-9]+]] !kernel_arg_type [[META29:![0-9]+]] !kernel_arg_base_type [[META29]] !kernel_arg_type_qual [[META30:![0-9]+]] {
 // GFX900-NEXT: entry:
 // GFX900-NEXT: [[TMP2:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5)
 // GFX900-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0]], ptr addrspace(5) [[TMP2]], align 8
@@ -688,7 +688,7 @@ kernel void test_target_features_kernel(global int *i) {
 //
 // GFX900: Function Attrs: convergent norecurse nounwind
 // GFX900-LABEL: define {{[^@]+}}@test_target_features_kernel
-// GFX900-SAME: (ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META32:![0-9]+]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META33:![0-9]+]] !kernel_arg_base_type [[META33]] !kernel_arg_type_qual [[META25]] {
+// GFX900-SAME: (ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META31:![0-9]+]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META32:![0-9]+]] !kernel_arg_base_type [[META32]] !kernel_arg_type_qual [[META25]] {
 // GFX900-NEXT: entry:
 // GFX900-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // GFX900-NEXT: [[DEFAULT_QUEUE:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
@@ -700,7 +700,7 @@ kernel void test_target_features_kernel(global int *i) {
 // GFX900-NEXT: [[FLAGS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[FLAGS]] to ptr
 // GFX900-NEXT: [[NDRANGE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[NDRANGE]] to ptr
 // GFX900-NEXT: [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr
-// GFX900-NEXT: store ptr addrspace(1) [[I]], ptr [[I_ADDR_ASCAST]], align 8, !tbaa [[TBAA34:![0-9]+]]
+// GFX900-NEXT: store ptr addrspace(1) [[I]], ptr [[I_ADDR_ASCAST]], align 8, !tbaa [[TBAA33:![0-9]+]]
 // GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR8]]
 // GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR8]]
 // GFX900-NEXT: store i32 0, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA17]]
@@ -803,16 +803,15 @@ kernel void test_target_features_kernel(global int *i) {
 // GFX900: [[META23]] = !{!"none"}
 // GFX900: [[META24]] = !{!"__block_literal"}
 // GFX900: [[META25]] = !{!""}
-// GFX900: [[TBAA26]] = !{[[META27:![0-9]+]], [[META27]], i64 0}
-// GFX900: [[META27]] = !{!"p1 void", [[META9]], i64 0}
-// GFX900: [[META28]] = !{i32 0, i32 3}
-// GFX900: [[META29]] = !{!"none", !"none"}
-// GFX900: [[META30]] = !{!"__block_literal", !"void*"}
-// GFX900: [[META31]] = !{!"", !""}
-// GFX900: [[META32]] = !{i32 1}
-// GFX900: [[META33]] = !{!"int*"}
-// GFX900: [[TBAA34]] = !{[[META35:![0-9]+]], [[META35]], i64 0}
-// GFX900: [[META35]] = !{!"p1 int", [[META9]], i64 0}
+// GFX900: [[TBAA26]] = !{[[META9]], [[META9]], i64 0}
+// GFX900: [[META27]] = !{i32 0, i32 3}
+// GFX900: [[META28]] = !{!"none", !"none"}
+// GFX900: [[META29]] = !{!"__block_literal", !"void*"}
+// GFX900: [[META30]] = !{!"", !""}
+// GFX900: [[META31]] = !{i32 1}
+// GFX900: [[META32]] = !{!"int*"}
+// GFX900: [[TBAA33]] = !{[[META34:![0-9]+]], [[META34]], i64 0}
+// GFX900: [[META34]] = !{!"p1 int", [[META9]], i64 0}
 //.
 //// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 // CHECK: {{.*}}
diff --git clang/test/Driver/amdgpu-openmp-system-arch-fail.c clang/test/Driver/amdgpu-openmp-system-arch-fail.c
index b7e1d0b2c566..eb037183b4c3 100644
--- clang/test/Driver/amdgpu-openmp-system-arch-fail.c
+++ clang/test/Driver/amdgpu-openmp-system-arch-fail.c
@@ -12,9 +12,9 @@
 // case when amdgpu_arch returns nothing or fails
 // RUN: not %clang -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp -fopenmp-targets=amdgcn-amd-amdhsa -nogpulib --amdgpu-arch-tool=%t/amdgpu_arch_fail %s 2>&1 \
 // RUN: | FileCheck %s --check-prefix=NO-OUTPUT-ERROR
-// NO-OUTPUT-ERROR: error: cannot determine amdgcn architecture{{.*}}; consider passing it via '-march'
+// NO-OUTPUT-ERROR: error: cannot determine amdgcn architecture{{.*}}; consider passing it via '--offload-arch'
 
 // case when amdgpu_arch does not return anything with successful execution
 // RUN: not %clang -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp -fopenmp-targets=amdgcn-amd-amdhsa -nogpulib --amdgpu-arch-tool=%t/amdgpu_arch_empty %s 2>&1 \
 // RUN: | FileCheck %s --check-prefix=EMPTY-OUTPUT
-// EMPTY-OUTPUT: error: cannot determine amdgcn architecture: No AMD GPU detected in the system; consider passing it via '-march'
+// EMPTY-OUTPUT: error: cannot determine amdgcn architecture: No AMD GPU detected in the system; consider passing it via '--offload-arch'
diff --git clang/test/Driver/amdgpu-openmp-toolchain.c clang/test/Driver/amdgpu-openmp-toolchain.c
index 1c2ee2617313..1f4d724a269e 100644
--- clang/test/Driver/amdgpu-openmp-toolchain.c
+++ clang/test/Driver/amdgpu-openmp-toolchain.c
@@ -16,12 +16,12 @@
 // CHECK-PHASES: 0: input, "[[INPUT:.+]]", c, (host-openmp)
 // CHECK-PHASES: 1: preprocessor, {0}, cpp-output, (host-openmp)
 // CHECK-PHASES: 2: compiler, {1}, ir, (host-openmp)
-// CHECK-PHASES: 3: input, "[[INPUT]]", c, (device-openmp)
-// CHECK-PHASES: 4: preprocessor, {3}, cpp-output, (device-openmp)
-// CHECK-PHASES: 5: compiler, {4}, ir, (device-openmp)
-// CHECK-PHASES: 6: offload, "host-openmp (x86_64-unknown-linux-gnu)" {2}, "device-openmp (amdgcn-amd-amdhsa)" {5}, ir
-// CHECK-PHASES: 7: backend, {6}, ir, (device-openmp)
-// CHECK-PHASES: 8: offload, "device-openmp (amdgcn-amd-amdhsa)" {7}, ir
+// CHECK-PHASES: 3: input, "[[INPUT]]", c, (device-openmp, gfx906)
+// CHECK-PHASES: 4: preprocessor, {3},
cpp-output, (device-openmp, gfx906) +// CHECK-PHASES: 5: compiler, {4}, ir, (device-openmp, gfx906) +// CHECK-PHASES: 6: offload, "host-openmp (x86_64-unknown-linux-gnu)" {2}, "device-openmp (amdgcn-amd-amdhsa:gfx906)" {5}, ir +// CHECK-PHASES: 7: backend, {6}, ir, (device-openmp, gfx906) +// CHECK-PHASES: 8: offload, "device-openmp (amdgcn-amd-amdhsa:gfx906)" {7}, ir // CHECK-PHASES: 9: clang-offload-packager, {8}, image, (device-openmp) // CHECK-PHASES: 10: offload, "host-openmp (x86_64-unknown-linux-gnu)" {2}, "device-openmp (x86_64-unknown-linux-gnu)" {9}, ir // CHECK-PHASES: 11: backend, {10}, assembler, (host-openmp) diff --git clang/test/Driver/hexagon-cpu-default.c clang/test/Driver/hexagon-cpu-default.c new file mode 100644 index 000000000000..31fb839f2165 --- /dev/null +++ clang/test/Driver/hexagon-cpu-default.c @@ -0,0 +1,4 @@ +// CHECK: "-target-cpu" "hexagonv68" + +// RUN: %clang -c %s -### --target=hexagon-unknown-elf \ +// RUN: 2>&1 | FileCheck %s diff --git clang/test/Driver/hip-sanitize-options.hip clang/test/Driver/hip-sanitize-options.hip index d94cbdacdaeb..8de0ee9e1842 100644 --- clang/test/Driver/hip-sanitize-options.hip +++ clang/test/Driver/hip-sanitize-options.hip @@ -18,7 +18,7 @@ // RUN: -nogpuinc --rocm-path=%S/Inputs/rocm \ // RUN: %s 2>&1 | FileCheck -check-prefixes=RDC %s -// RUN: not %clang -### --target=x86_64-unknown-linux-gnu --offload-arch=gfx900:xnack+ \ +// RUN: not %clang -### --target=x86_64-unknown-linux-gnu -mcode-object-version=5 --offload-arch=gfx900:xnack+ \ // RUN: -fsanitize=address -fgpu-sanitize \ // RUN: -nogpuinc --rocm-path=%S/Inputs/rocm-invalid \ // RUN: %s 2>&1 | FileCheck -check-prefixes=FAIL %s @@ -52,15 +52,15 @@ // CHECK-NOT: {{"[^"]*lld(\.exe){0,1}".* ".*hip.bc"}} // CHECK: {{"[^"]*clang[^"]*".* "-triple" "x86_64-unknown-linux-gnu".* "-fsanitize=address"}} -// NORDC: {{"[^"]*clang[^"]*".* "-emit-obj".* "-fcuda-is-device".* "-mlink-bitcode-file" ".*asanrtl.bc".* "-mlink-builtin-bitcode" ".*hip.bc".* "-fsanitize=address".*}} "-o" "[[OUT:[^"]*.o]]" +// NORDC: {{"[^"]*clang[^"]*".* "-emit-obj".* "-fcuda-is-device".* "-mlink-builtin-bitcode" ".*hip.bc".* "-mlink-bitcode-file" ".*asanrtl.bc".* "-fsanitize=address".*}} "-o" "[[OUT:[^"]*.o]]" // NORDC-NOT: {{"[^"]*lld(\.exe){0,1}".*}} "[[OUT]]" {{".*asanrtl.bc" ".*hip.bc"}} // NORDC: {{"[^"]*clang[^"]*".* "-triple" "x86_64-unknown-linux-gnu".* "-fsanitize=address"}} // RDC: {{"[^"]*clang[^"]*".* "-triple" "x86_64-unknown-linux-gnu".* "-fsanitize=address"}} -// RDC: {{"[^"]*clang[^"]*".* "-emit-llvm-bc".* "-fcuda-is-device".* "-mlink-bitcode-file" ".*asanrtl.bc".* "-mlink-builtin-bitcode" ".*hip.bc".* "-fsanitize=address".*}} "-o" "[[OUT:[^"]*.bc]]" +// RDC: {{"[^"]*clang[^"]*".* "-emit-llvm-bc".* "-fcuda-is-device".* "-mlink-builtin-bitcode" ".*hip.bc".* "-mlink-bitcode-file" ".*asanrtl.bc".* "-fsanitize=address".*}} "-o" "[[OUT:[^"]*.bc]]" // RDC-NOT: {{"[^"]*lld(\.exe){0,1}".*}} "[[OUT]]" {{".*asanrtl.bc" ".*hip.bc"}} -// FAIL: AMDGPU address sanitizer runtime library (asanrtl) is not found. 
Please install ROCm device library which supports address sanitizer +// FAIL: error: cannot find ROCm device library for ABI version 5; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library // XNACK-DAG: warning: ignoring '-fsanitize=leak' option as it is not currently supported for target 'amdgcn-amd-amdhsa' // XNACK-DAG: warning: ignoring '-fsanitize=address' option for offload arch 'gfx900:xnack-' as it is not currently supported there. Use it with an offload arch containing 'xnack+' instead diff --git clang/test/Driver/openmp-offload-gpu.c clang/test/Driver/openmp-offload-gpu.c index 74bd2a6aeee4..1f7e2996068c 100644 --- clang/test/Driver/openmp-offload-gpu.c +++ clang/test/Driver/openmp-offload-gpu.c @@ -235,13 +235,13 @@ // CHECK-PHASES: 0: input, "[[INPUT:.+]]", c, (host-openmp) // CHECK-PHASES: 1: preprocessor, {0}, cpp-output, (host-openmp) // CHECK-PHASES: 2: compiler, {1}, ir, (host-openmp) -// CHECK-PHASES: 3: input, "[[INPUT]]", c, (device-openmp) -// CHECK-PHASES: 4: preprocessor, {3}, cpp-output, (device-openmp) -// CHECK-PHASES: 5: compiler, {4}, ir, (device-openmp) -// CHECK-PHASES: 6: offload, "host-openmp (x86_64-unknown-linux-gnu)" {2}, "device-openmp (nvptx64-nvidia-cuda)" {5}, ir -// CHECK-PHASES: 7: backend, {6}, assembler, (device-openmp) -// CHECK-PHASES: 8: assembler, {7}, object, (device-openmp) -// CHECK-PHASES: 9: offload, "device-openmp (nvptx64-nvidia-cuda)" {8}, object +// CHECK-PHASES: 3: input, "[[INPUT]]", c, (device-openmp, sm_52) +// CHECK-PHASES: 4: preprocessor, {3}, cpp-output, (device-openmp, sm_52) +// CHECK-PHASES: 5: compiler, {4}, ir, (device-openmp, sm_52) +// CHECK-PHASES: 6: offload, "host-openmp (x86_64-unknown-linux-gnu)" {2}, "device-openmp (nvptx64-nvidia-cuda:sm_52)" {5}, ir +// CHECK-PHASES: 7: backend, {6}, assembler, (device-openmp, sm_52) +// CHECK-PHASES: 8: assembler, {7}, object, (device-openmp, sm_52) +// CHECK-PHASES: 9: offload, "device-openmp (nvptx64-nvidia-cuda:sm_52)" {8}, object // CHECK-PHASES: 10: clang-offload-packager, {9}, image // CHECK-PHASES: 11: offload, "host-openmp (x86_64-unknown-linux-gnu)" {2}, "device-openmp (x86_64-unknown-linux-gnu)" {10}, ir // CHECK-PHASES: 12: backend, {11}, assembler, (host-openmp) @@ -315,7 +315,7 @@ // RUN: -Xopenmp-target=nvptx64-nvidia-cuda -march=sm_52 --offload-device-only -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-DEVICE-ONLY // CHECK-DEVICE-ONLY: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT:.*]]"], output: "[[HOST_BC:.*]]" // CHECK-DEVICE-ONLY: "nvptx64-nvidia-cuda" - "clang", inputs: ["[[INPUT]]", "[[HOST_BC]]"], output: "[[DEVICE_ASM:.*]]" -// CHECK-DEVICE-ONLY: "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[DEVICE_ASM]]"], output: "{{.*}}-openmp-nvptx64-nvidia-cuda.o" +// CHECK-DEVICE-ONLY: "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[DEVICE_ASM]]"], output: "{{.*}}-openmp-nvptx64-nvidia-cuda-sm_52.o" // RUN: %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \ // RUN: -Xopenmp-target=nvptx64-nvidia-cuda -march=sm_52 --offload-device-only -E -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-DEVICE-ONLY-PP diff --git clang/test/Driver/openmp-offload-jit.c clang/test/Driver/openmp-offload-jit.c index 57f265ac37ea..b3566f06bbee 100644 --- clang/test/Driver/openmp-offload-jit.c +++ clang/test/Driver/openmp-offload-jit.c @@ -1,29 +1,29 @@ // Check that we enable LTO-mode properly with 
'-fopenmp-target-jit' and that it // still enabled LTO-mode if `-fno-offload-lto` is on. // RUN: %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-phases -fopenmp=libomp \ -// RUN: -fopenmp-targets=nvptx64-nvidia-cuda -fopenmp-target-jit %s 2>&1 \ +// RUN: --offload-arch=sm_52 -fopenmp-target-jit %s 2>&1 \ // RUN: | FileCheck -check-prefix=PHASES-JIT %s // RUN: %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-phases -fopenmp=libomp \ -// RUN: -fopenmp-targets=nvptx64-nvidia-cuda -foffload-lto -fopenmp-target-jit %s 2>&1 \ +// RUN: --offload-arch=sm_52 -foffload-lto -fopenmp-target-jit %s 2>&1 \ // RUN: | FileCheck -check-prefix=PHASES-JIT %s // RUN: %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-phases -fopenmp=libomp \ -// RUN: -fopenmp-targets=amdgcn-amd-amdhsa -fopenmp-target-jit %s 2>&1 \ +// RUN: --offload-arch=gfx90a -fopenmp-target-jit %s 2>&1 \ // RUN: | FileCheck -check-prefix=PHASES-JIT %s // RUN: %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-phases -fopenmp=libomp \ -// RUN: -fopenmp-targets=amdgcn-amd-amdhsa -foffload-lto -fopenmp-target-jit %s 2>&1 \ +// RUN: --offload-arch=gfx90a -foffload-lto -fopenmp-target-jit %s 2>&1 \ // RUN: | FileCheck -check-prefix=PHASES-JIT %s // RUN: not %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-phases -fopenmp=libomp \ -// RUN: -fopenmp-targets=amdgcn-amd-amdhsa -fno-offload-lto -fopenmp-target-jit %s 2>&1 \ +// RUN: --offload-arch=gfx90a -fno-offload-lto -fopenmp-target-jit %s 2>&1 \ // RUN: | FileCheck -check-prefix=PHASES-JIT %s // // PHASES-JIT: 0: input, "[[INPUT:.+]]", c, (host-openmp) // PHASES-JIT-NEXT: 1: preprocessor, {0}, cpp-output, (host-openmp) // PHASES-JIT-NEXT: 2: compiler, {1}, ir, (host-openmp) -// PHASES-JIT-NEXT: 3: input, "[[INPUT]]", c, (device-openmp) -// PHASES-JIT-NEXT: 4: preprocessor, {3}, cpp-output, (device-openmp) -// PHASES-JIT-NEXT: 5: compiler, {4}, ir, (device-openmp) +// PHASES-JIT-NEXT: 3: input, "[[INPUT]]", c, (device-openmp, {{.*}}) +// PHASES-JIT-NEXT: 4: preprocessor, {3}, cpp-output, (device-openmp, {{.*}}) +// PHASES-JIT-NEXT: 5: compiler, {4}, ir, (device-openmp, {{.*}}) // PHASES-JIT-NEXT: 6: offload, "host-openmp (x86_64-unknown-linux-gnu)" {2}, "device-openmp ([[TARGET:.+]])" {5}, ir -// PHASES-JIT-NEXT: 7: backend, {6}, lto-bc, (device-openmp) +// PHASES-JIT-NEXT: 7: backend, {6}, lto-bc, (device-openmp, {{.*}}) // PHASES-JIT-NEXT: 8: offload, "device-openmp ([[TARGET]])" {7}, lto-bc // PHASES-JIT-NEXT: 9: clang-offload-packager, {8}, image, (device-openmp) // PHASES-JIT-NEXT: 10: offload, "host-openmp (x86_64-unknown-linux-gnu)" {2}, "device-openmp (x86_64-unknown-linux-gnu)" {9}, ir @@ -41,11 +41,11 @@ // Check for incompatible combinations // RUN: not %clang -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp -fno-offload-lto \ -// RUN: -fopenmp-targets=nvptx64-nvidia-cuda -fopenmp-target-jit %s 2>&1 \ +// RUN: --offload-arch=sm_52 -fopenmp-target-jit %s 2>&1 \ // RUN: | FileCheck -check-prefix=NO-LTO %s // NO-LTO: error: the combination of '-fno-offload-lto' and '-fopenmp-target-jit' is incompatible // RUN: not %clang -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp -foffload-lto=thin \ -// RUN: -fopenmp-targets=nvptx64-nvidia-cuda -fopenmp-target-jit %s 2>&1 \ +// RUN: --offload-arch=sm_52 -fopenmp-targets=nvptx64-nvidia-cuda -fopenmp-target-jit %s 2>&1 \ // RUN: | FileCheck -check-prefix=THIN-LTO %s // THIN-LTO: error: the combination of '-foffload-lto=' and '-fopenmp-target-jit' is incompatible diff --git 
clang/test/Driver/openmp-system-arch.c clang/test/Driver/openmp-system-arch.c index d097c6bc0654..51021e352f4d 100644 --- clang/test/Driver/openmp-system-arch.c +++ clang/test/Driver/openmp-system-arch.c @@ -68,13 +68,13 @@ // RUN: not %clang -### --target=x86_64-unknown-linux-gnu -nogpulib -fopenmp=libomp \ // RUN: -fopenmp-targets=nvptx64-nvidia-cuda --nvptx-arch-tool=%t/nvptx_arch_empty %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=NVPTX -// NVPTX: error: cannot determine nvptx64 architecture: No NVIDIA GPU detected in the system; consider passing it via '-march' +// NVPTX: error: cannot determine nvptx64 architecture: No NVIDIA GPU detected in the system; consider passing it via '--offload-arch' // case when 'amdgpu-arch' returns nothing using `-fopenmp-targets=`. // RUN: not %clang -### --target=x86_64-unknown-linux-gnu -nogpulib -fopenmp=libomp \ // RUN: -fopenmp-targets=amdgcn-amd-amdhsa --amdgpu-arch-tool=%t/amdgpu_arch_empty %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=AMDGPU -// AMDGPU: error: cannot determine amdgcn architecture: No AMD GPU detected in the system; consider passing it via '-march' +// AMDGPU: error: cannot determine amdgcn architecture: No AMD GPU detected in the system; consider passing it via '--offload-arch' // case when CLANG_TOOLCHAIN_PROGRAM_TIMEOUT is malformed for nvptx-arch. // RUN: env CLANG_TOOLCHAIN_PROGRAM_TIMEOUT=foo \ @@ -82,7 +82,7 @@ // RUN: -fopenmp-targets=nvptx64-nvidia-cuda -nogpulib \ // RUN: --nvptx-arch-tool=%t/nvptx_arch_sm_70 %s 2>&1 | \ // RUN: FileCheck %s --check-prefix=BAD-TIMEOUT-NVPTX -// BAD-TIMEOUT-NVPTX: clang: error: cannot determine nvptx64 architecture: CLANG_TOOLCHAIN_PROGRAM_TIMEOUT expected an integer, got 'foo'; consider passing it via '-march'; environment variable CLANG_TOOLCHAIN_PROGRAM_TIMEOUT specifies the tool timeout (integer secs, <=0 is infinite) +// BAD-TIMEOUT-NVPTX: clang: error: cannot determine nvptx64 architecture: CLANG_TOOLCHAIN_PROGRAM_TIMEOUT expected an integer, got 'foo'; consider passing it via '--offload-arch'; environment variable CLANG_TOOLCHAIN_PROGRAM_TIMEOUT specifies the tool timeout (integer secs, <=0 is infinite) // case when CLANG_TOOLCHAIN_PROGRAM_TIMEOUT is malformed for amdgpu-arch. 
// RUN: env CLANG_TOOLCHAIN_PROGRAM_TIMEOUT= \ @@ -90,4 +90,4 @@ // RUN: -fopenmp-targets=amdgcn-amd-amdhsa -nogpulib \ // RUN: --amdgpu-arch-tool=%t/amdgpu_arch_gfx906 %s 2>&1 | \ // RUN: FileCheck %s --check-prefix=BAD-TIMEOUT-AMDGPU -// BAD-TIMEOUT-AMDGPU: clang: error: cannot determine amdgcn architecture: CLANG_TOOLCHAIN_PROGRAM_TIMEOUT expected an integer, got ''; consider passing it via '-march'; environment variable CLANG_TOOLCHAIN_PROGRAM_TIMEOUT specifies the tool timeout (integer secs, <=0 is infinite) +// BAD-TIMEOUT-AMDGPU: clang: error: cannot determine amdgcn architecture: CLANG_TOOLCHAIN_PROGRAM_TIMEOUT expected an integer, got ''; consider passing it via '--offload-arch'; environment variable CLANG_TOOLCHAIN_PROGRAM_TIMEOUT specifies the tool timeout (integer secs, <=0 is infinite) diff --git clang/test/Driver/rocm-device-libs.cl clang/test/Driver/rocm-device-libs.cl index 6837e219dc35..f9766e6fa4d9 100644 --- clang/test/Driver/rocm-device-libs.cl +++ clang/test/Driver/rocm-device-libs.cl @@ -145,8 +145,8 @@ // RUN: 2>&1 | FileCheck --check-prefixes=NOASAN %s // COMMON: "-triple" "amdgcn-amd-amdhsa" -// ASAN-SAME: "-mlink-bitcode-file" "{{.*}}/amdgcn/bitcode/asanrtl.bc" // COMMON-SAME: "-mlink-builtin-bitcode" "{{.*}}/amdgcn/bitcode/opencl.bc" +// ASAN-SAME: "-mlink-bitcode-file" "{{.*}}/amdgcn/bitcode/asanrtl.bc" // COMMON-SAME: "-mlink-builtin-bitcode" "{{.*}}/amdgcn/bitcode/ocml.bc" // COMMON-SAME: "-mlink-builtin-bitcode" "{{.*}}/amdgcn/bitcode/ockl.bc" diff --git clang/test/Driver/tls-dialect.c clang/test/Driver/tls-dialect.c index 3471b55b0eba..9ab79e87353d 100644 --- clang/test/Driver/tls-dialect.c +++ clang/test/Driver/tls-dialect.c @@ -10,6 +10,11 @@ /// TLSDESC is not on by default in Linux, even on RISC-V, and is covered above // RUN: %clang -### --target=riscv64-android %s 2>&1 | FileCheck --check-prefix=DESC %s +/// Fuchsia supports TLSDESC by default for all architectures. 
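+/// For illustration only (annotation, not exercised by these RUN lines), the
+/// dialect merely selects how accesses to thread-local storage are lowered:
+///   __thread int counter;
+///   int next(void) { return ++counter; }
+/// With -mtls-dialect=desc such an access is resolved through a TLS
+/// descriptor; the traditional dialect emits a __tls_get_addr-style call.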
+// RUN: %clang -### --target=riscv64-unknown-fuchsia %s 2>&1 | FileCheck --check-prefix=DESC %s +// RUN: %clang -### --target=aarch64-unknown-fuchsia %s 2>&1 | FileCheck --check-prefix=DESC %s +// RUN: %clang -### --target=x86_64-unknown-fuchsia %s 2>&1 | FileCheck --check-prefix=DESC %s + /// LTO // RUN: %clang -### --target=loongarch64-linux -flto -mtls-dialect=desc %s 2>&1 | FileCheck --check-prefix=LTO-DESC %s // RUN: %clang -### --target=loongarch64-linux -flto %s 2>&1 | FileCheck --check-prefix=LTO-NODESC %s diff --git clang/test/Driver/x86-target-features.c clang/test/Driver/x86-target-features.c index 339f593dc760..18361251dceb 100644 --- clang/test/Driver/x86-target-features.c +++ clang/test/Driver/x86-target-features.c @@ -395,7 +395,8 @@ // EVEX512: "-target-feature" "+evex512" // NO-EVEX512: "-target-feature" "-evex512" -// RUN: %clang --target=i386 -mavx10.1 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX10_1_256 %s +// RUN: not %clang --target=i386 -march=i386 -mavx10.1 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=UNSUPPORT-AVX10 %s +// RUN: not %clang --target=i386 -march=i386 -mno-avx10.1 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=UNSUPPORT-AVX10 %s // RUN: %clang --target=i386 -mavx10.1-256 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX10_1_256 %s // RUN: %clang --target=i386 -mavx10.1-512 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX10_1_512 %s // RUN: %clang --target=i386 -mavx10.1-256 -mavx10.1-512 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX10_1_512 %s @@ -403,15 +404,18 @@ // RUN: not %clang --target=i386 -march=i386 -mavx10.1-128 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=BAD-AVX10 %s // RUN: not %clang --target=i386 -march=i386 -mavx10.a-256 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=BAD-AVX10 %s // RUN: not %clang --target=i386 -march=i386 -mavx10.1024-512 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=BAD-AVX10 %s -// RUN: %clang --target=i386 -march=i386 -mavx10.1 -mavx512f %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX10-AVX512 %s -// RUN: %clang --target=i386 -march=i386 -mavx10.1 -mno-avx512f %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX10-AVX512 %s -// RUN: %clang --target=i386 -march=i386 -mavx10.1 -mevex512 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX10-EVEX512 %s -// RUN: %clang --target=i386 -march=i386 -mavx10.1 -mno-evex512 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX10-EVEX512 %s -// RUN: %clang --target=i386 -mavx10.2 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX10_2_256 %s +// RUN: %clang --target=i386 -march=i386 -mavx10.1-256 -mavx512f %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX10-AVX512 %s +// RUN: %clang --target=i386 -march=i386 -mavx10.1-256 -mno-avx512f %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX10-AVX512 %s +// RUN: %clang --target=i386 -march=i386 -mavx10.1-256 -mevex512 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX10-EVEX512 %s +// RUN: %clang --target=i386 -march=i386 -mavx10.1-256 -mno-evex512 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX10-EVEX512 %s +// RUN: %clang --target=i386 -mavx10.2 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX10_2_512 %s +// RUN: %clang --target=i386 -mno-avx10.2 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-AVX10_2 %s // RUN: %clang --target=i386 -mavx10.2-256 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX10_2_256 %s // RUN: %clang --target=i386 -mavx10.2-512 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX10_2_512 %s // RUN: %clang --target=i386 -mavx10.2-256 -mavx10.1-512 %s -### -o %t.o 2>&1 | FileCheck 
-check-prefixes=AVX10_2_256,AVX10_1_512 %s // RUN: %clang --target=i386 -mavx10.2-512 -mavx10.1-256 %s -### -o %t.o 2>&1 | FileCheck -check-prefixes=AVX10_2_512,AVX10_1_256 %s +// UNSUPPORT-AVX10: error: unsupported option '-m{{.*}}avx10.1' for target 'i386' +// NO-AVX10_2: "-target-feature" "-avx10.2-256" // AVX10_2_256: "-target-feature" "+avx10.2-256" // AVX10_2_512: "-target-feature" "+avx10.2-512" // AVX10_1_256: "-target-feature" "+avx10.1-256" diff --git clang/test/Interpreter/simple-exception.cpp clang/test/Interpreter/simple-exception.cpp index 651e8d9402f8..6749acd6e6bd 100644 --- clang/test/Interpreter/simple-exception.cpp +++ clang/test/Interpreter/simple-exception.cpp @@ -1,7 +1,7 @@ // clang-format off // UNSUPPORTED: system-aix -// XFAIL for arm, or running on Windows. -// XFAIL: target=arm-{{.*}}, target=armv{{.*}}, system-windows +// XFAIL for arm and arm64, or running on Windows. +// XFAIL: target=arm{{.*}}, system-windows // RUN: cat %s | clang-repl | FileCheck %s // Incompatible with msan. It passes with -O3 but fail -Oz. Interpreter diff --git clang/test/Modules/pr121245.cpp clang/test/Modules/pr121245.cpp new file mode 100644 index 000000000000..0e276ad0e435 --- /dev/null +++ clang/test/Modules/pr121245.cpp @@ -0,0 +1,93 @@ +// If this test fails, it should be investigated under Debug builds. +// Before the PR, this test was encountering an `llvm_unreachable()`. + +// RUN: rm -rf %t +// RUN: mkdir -p %t +// RUN: split-file %s %t +// RUN: cd %t + +// RUN: %clang_cc1 -std=c++20 -emit-header-unit -xc++-user-header %t/hu-01.h \ +// RUN: -fcxx-exceptions -o %t/hu-01.pcm + +// RUN: %clang_cc1 -std=c++20 -emit-header-unit -xc++-user-header %t/hu-02.h \ +// RUN: -Wno-experimental-header-units -fcxx-exceptions \ +// RUN: -fmodule-file=%t/hu-01.pcm -o %t/hu-02.pcm + +// RUN: %clang_cc1 -std=c++20 -emit-header-unit -xc++-user-header %t/hu-03.h \ +// RUN: -Wno-experimental-header-units -fcxx-exceptions \ +// RUN: -fmodule-file=%t/hu-01.pcm -o %t/hu-03.pcm + +// RUN: %clang_cc1 -std=c++20 -emit-header-unit -xc++-user-header %t/hu-04.h \ +// RUN: -Wno-experimental-header-units -fcxx-exceptions \ +// RUN: -fmodule-file=%t/hu-01.pcm -o %t/hu-04.pcm + +// RUN: %clang_cc1 -std=c++20 -emit-header-unit -xc++-user-header %t/hu-05.h \ +// RUN: -Wno-experimental-header-units -fcxx-exceptions \ +// RUN: -fmodule-file=%t/hu-03.pcm -fmodule-file=%t/hu-04.pcm \ +// RUN: -fmodule-file=%t/hu-01.pcm -o %t/hu-05.pcm + +// RUN: %clang_cc1 -std=c++20 -emit-obj %t/main.cpp \ +// RUN: -Wno-experimental-header-units -fcxx-exceptions \ +// RUN: -fmodule-file=%t/hu-02.pcm -fmodule-file=%t/hu-05.pcm \ +// RUN: -fmodule-file=%t/hu-04.pcm -fmodule-file=%t/hu-03.pcm \ +// RUN: -fmodule-file=%t/hu-01.pcm + +//--- hu-01.h +template <typename T> +struct A { + A() {} + ~A() {} +}; + +template <typename T> +struct EBO : T { + EBO() = default; +}; + +template <typename T> +struct HT : EBO<A<T>> {}; + +//--- hu-02.h +import "hu-01.h"; + +inline void f() { + HT<int>(); +} + +//--- hu-03.h +import "hu-01.h"; + +struct C { + C(); + + HT<long> _; +}; + +//--- hu-04.h +import "hu-01.h"; + +void g(HT<long> = {}); + +//--- hu-05.h +import "hu-03.h"; +import "hu-04.h"; +import "hu-01.h"; + +struct B { + virtual ~B() = default; + + virtual void f() { + HT<long>(); + } +}; + +//--- main.cpp +import "hu-02.h"; +import "hu-05.h"; +import "hu-03.h"; + +int main() { + f(); + C(); + B(); +} diff --git clang/test/PCH/cuda-kernel-call.cu clang/test/PCH/cuda-kernel-call.cu index ffb0c1444fe6..da9d81c531c4 100644 --- 
clang/test/PCH/cuda-kernel-call.cu +++ clang/test/PCH/cuda-kernel-call.cu @@ -1,5 +1,7 @@ // RUN: %clang_cc1 -emit-pch -o %t %s // RUN: %clang_cc1 -include-pch %t -fsyntax-only %s +// RUN: %clang_cc1 -emit-pch -fcuda-is-device -o %t-device %s +// RUN: %clang_cc1 -fcuda-is-device -include-pch %t-device -fsyntax-only %s #ifndef HEADER #define HEADER @@ -14,12 +16,21 @@ void kcall(void (*kp)()) { __global__ void kern() { } +// Make sure that target overloaded functions remain +// available as overloads after PCH deserialization. +__host__ int overloaded_func(); +__device__ int overloaded_func(); + #else // Using the header. void test() { kcall(kern); kern<<<1, 1>>>(); + overloaded_func(); } +__device__ void test () { + overloaded_func(); +} #endif diff --git clang/test/ParserOpenACC/parse-constructs.c clang/test/ParserOpenACC/parse-constructs.c index 886a912713c5..f0698495a3cc 100644 --- clang/test/ParserOpenACC/parse-constructs.c +++ clang/test/ParserOpenACC/parse-constructs.c @@ -109,30 +109,23 @@ void func() { for(int i = 0; i < 6;++i){} int i = 0, j = 0, k = 0; - // expected-warning@+1{{OpenACC construct 'atomic' not yet implemented, pragma ignored}} #pragma acc atomic - i = j; - // expected-error@+2{{invalid OpenACC clause 'garbage'}} - // expected-warning@+1{{OpenACC construct 'atomic' not yet implemented, pragma ignored}} + i = i + 1; + // expected-error@+1{{invalid OpenACC clause 'garbage'}} #pragma acc atomic garbage - i = j; - // expected-error@+2{{invalid OpenACC clause 'garbage'}} - // expected-warning@+1{{OpenACC construct 'atomic' not yet implemented, pragma ignored}} + i = i + 1; + // expected-error@+1{{invalid OpenACC clause 'garbage'}} #pragma acc atomic garbage clause list - i = j; - // expected-warning@+1{{OpenACC construct 'atomic' not yet implemented, pragma ignored}} + i = i + 1; #pragma acc atomic read i = j; - // expected-error@+2{{invalid OpenACC clause 'clause'}} - // expected-warning@+1{{OpenACC construct 'atomic' not yet implemented, pragma ignored}} + // expected-error@+1{{invalid OpenACC clause 'clause'}} #pragma acc atomic write clause list i = i + j; - // expected-error@+2{{invalid OpenACC clause 'clause'}} - // expected-warning@+1{{OpenACC construct 'atomic' not yet implemented, pragma ignored}} + // expected-error@+1{{invalid OpenACC clause 'clause'}} #pragma acc atomic update clause list i++; - // expected-error@+2{{invalid OpenACC clause 'clause'}} - // expected-warning@+1{{OpenACC construct 'atomic' not yet implemented, pragma ignored}} + // expected-error@+1{{invalid OpenACC clause 'clause'}} #pragma acc atomic capture clause list i = j++; diff --git clang/test/Preprocessor/init-aarch64.c clang/test/Preprocessor/init-aarch64.c index 5f47de4b49b6..3036b496db25 100644 --- clang/test/Preprocessor/init-aarch64.c +++ clang/test/Preprocessor/init-aarch64.c @@ -125,8 +125,8 @@ // AARCH64-NEXT: #define __FP_FAST_FMAF 1 // AARCH64-NEXT: #define __FUNCTION_MULTI_VERSIONING_SUPPORT_LEVEL 202430 // AARCH64-NEXT: #define __GCC_ASM_FLAG_OUTPUTS__ 1 -// AARCH64-NEXT: #define __GCC_CONSTRUCTIVE_SIZE {{.+}} -// AARCH64-NEXT: #define __GCC_DESTRUCTIVE_SIZE {{.+}} +// AARCH64-NEXT: #define __GCC_CONSTRUCTIVE_SIZE 64 +// AARCH64-NEXT: #define __GCC_DESTRUCTIVE_SIZE 256 // AARCH64-NEXT: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1 1 // AARCH64-NEXT: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_16 1 // AARCH64-NEXT: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2 1 diff --git clang/test/Preprocessor/x86_target_features.c clang/test/Preprocessor/x86_target_features.c index 
fa3d0038f05a..63222a882ff5 100644 --- clang/test/Preprocessor/x86_target_features.c +++ clang/test/Preprocessor/x86_target_features.c @@ -742,10 +742,8 @@ // AVXVNNIINT16NOAVX2-NOT: #define __AVX2__ 1 // AVXVNNIINT16NOAVX2-NOT: #define __AVXVNNIINT16__ 1 -// RUN: %clang -target i686-unknown-linux-gnu -march=atom -mavx10.1 -x c -E -dM -o - %s | FileCheck -check-prefix=AVX10_1_256 %s // RUN: %clang -target i686-unknown-linux-gnu -march=atom -mavx10.1-256 -x c -E -dM -o - %s | FileCheck -check-prefix=AVX10_1_256 %s // RUN: %clang -target i686-unknown-linux-gnu -march=atom -mavx10.1-256 -mno-avx512f -x c -E -dM -o - %s | FileCheck -check-prefix=AVX10_1_256 %s -// RUN: %clang -target i686-unknown-linux-gnu -march=atom -mavx10.2 -x c -E -dM -o - %s | FileCheck -check-prefixes=AVX10_1_256,AVX10_2_256 %s // RUN: %clang -target i686-unknown-linux-gnu -march=atom -mavx10.2-256 -x c -E -dM -o - %s | FileCheck -check-prefixes=AVX10_1_256,AVX10_2_256 %s // AVX10_1_256-NOT: __AVX10_1_512__ // AVX10_1_256: #define __AVX10_1__ 1 @@ -758,6 +756,7 @@ // RUN: %clang -target i686-unknown-linux-gnu -march=atom -mavx10.1-512 -x c -E -dM -o - %s | FileCheck -check-prefix=AVX10_1_512 %s // RUN: %clang -target i686-unknown-linux-gnu -march=atom -mavx10.1-512 -mno-avx512f -x c -E -dM -o - %s | FileCheck -check-prefix=AVX10_1_512 %s // RUN: %clang -target i686-unknown-linux-gnu -march=atom -mavx10.1-512 -mno-evex512 -x c -E -dM -o - %s | FileCheck -check-prefix=AVX10_1_512 %s +// RUN: %clang -target i686-unknown-linux-gnu -march=atom -mavx10.2 -x c -E -dM -o - %s | FileCheck -check-prefixes=AVX10_1_512,AVX10_2_512 %s // RUN: %clang -target i686-unknown-linux-gnu -march=atom -mavx10.2-512 -x c -E -dM -o - %s | FileCheck -check-prefixes=AVX10_1_512,AVX10_2_512 %s // AVX10_1_512: #define __AVX10_1_512__ 1 // AVX10_1_512: #define __AVX10_1__ 1 diff --git clang/test/Sema/MicrosoftCompatibility.c clang/test/Sema/MicrosoftCompatibility.c index 9a1f050747f9..8d402d53e004 100644 --- clang/test/Sema/MicrosoftCompatibility.c +++ clang/test/Sema/MicrosoftCompatibility.c @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 %s -fsyntax-only -Wno-unused-value -Wmicrosoft -verify -fms-compatibility -DMSVCCOMPAT -triple i686-pc-win32 -// RUN: %clang_cc1 %s -fsyntax-only -Wno-unused-value -Wmicrosoft -verify -fms-extensions -triple i686-pc-win32 +// RUN: %clang_cc1 %s -fsyntax-only -Wno-unused-value -Wmicrosoft -verify=expected,compat -fms-compatibility -DMSVCCOMPAT -triple i686-pc-win32 +// RUN: %clang_cc1 %s -fsyntax-only -Wno-unused-value -Wmicrosoft -verify=expected,ext -fms-extensions -triple i686-pc-win32 #ifdef MSVCCOMPAT enum ENUM1; // expected-warning {{forward references to 'enum' types are a Microsoft extension}} @@ -35,3 +35,15 @@ size_t x; #else size_t x; // expected-error {{unknown type name 'size_t'}} #endif + +/* Microsoft allows inline, __inline, and __forceinline to appear on a typedef + of a function type; this is used in their system headers such as ufxclient.h + See GitHub #124869 for more details. 
+ */ +typedef int inline Foo1(int); // compat-warning {{'inline' can only appear on functions}} \ + ext-error {{'inline' can only appear on functions}} +typedef int __inline Foo2(int); // compat-warning {{'inline' can only appear on functions}} \ + ext-error {{'inline' can only appear on functions}} +typedef int __forceinline Foo(int); // compat-warning {{'inline' can only appear on functions}} \ + ext-error {{'inline' can only appear on functions}} \ + expected-warning {{'__forceinline' attribute only applies to functions and statements}} diff --git clang/test/Sema/MicrosoftCompatibility.cpp clang/test/Sema/MicrosoftCompatibility.cpp index 90a45dfaaf17..391977e2765c 100644 --- clang/test/Sema/MicrosoftCompatibility.cpp +++ clang/test/Sema/MicrosoftCompatibility.cpp @@ -8,3 +8,10 @@ struct cls { }; char * cls::* __uptr wrong2 = &cls::m; // expected-error {{'__uptr' attribute cannot be used with pointers to members}} + +// Microsoft allows inline, __inline, and __forceinline to appear on a typedef +// of a function type, but only in C. See GitHub #124869 for more details. +typedef int inline Foo1(int); // expected-error {{'inline' can only appear on functions}} +typedef int __inline Foo2(int); // expected-error {{'inline' can only appear on functions}} +typedef int __forceinline Foo(int); // expected-error {{'inline' can only appear on functions}} \ + expected-warning {{'__forceinline' attribute only applies to functions and statements}} diff --git clang/test/SemaCUDA/inherited-ctor.cu clang/test/SemaCUDA/inherited-ctor.cu index 8ac59e7b539f..ef3938555b98 100644 --- clang/test/SemaCUDA/inherited-ctor.cu +++ clang/test/SemaCUDA/inherited-ctor.cu @@ -81,7 +81,7 @@ namespace DefaultCtorInvalid { }; struct C { - struct B b; + struct B b; // expected-note{{default constructed field 'b' declared here}} C() {} // expected-error{{call to implicitly-deleted default constructor of 'struct B'}} // expected-note@-6{{default constructor of 'B' is implicitly deleted because field 's' has a deleted default constructor}} // expected-note@-15{{'S' has been explicitly marked deleted here}} diff --git clang/test/SemaCXX/remove_pointer.mm clang/test/SemaCXX/remove_pointer.mm deleted file mode 100644 index d1cf1fa9f4ef..000000000000 --- clang/test/SemaCXX/remove_pointer.mm +++ /dev/null @@ -1,8 +0,0 @@ -// RUN: %clang_cc1 -fsyntax-only -verify %s - -// expected-no-diagnostics - -@class X; - -static_assert(__is_same(__remove_pointer(X *), X), ""); -static_assert(__is_same(__remove_pointer(id), id), ""); diff --git clang/test/SemaCXX/uninitialized.cpp clang/test/SemaCXX/uninitialized.cpp index 52d9897cf9be..7578b288d7b3 100644 --- clang/test/SemaCXX/uninitialized.cpp +++ clang/test/SemaCXX/uninitialized.cpp @@ -2,6 +2,8 @@ // RUN: %clang_cc1 -fsyntax-only -Wall -Wc++20-compat -Wuninitialized -Wno-unused-value -Wno-unused-lambda-capture -Wno-uninitialized-const-reference -std=c++1z -verify %s -fexperimental-new-constant-interpreter // RUN: %clang_cc1 -fsyntax-only -Wall -Wc++20-compat -Wuninitialized -Wno-unused-value -Wno-unused-lambda-capture -Wno-uninitialized-const-reference -std=c++20 -verify %s +void* operator new(__SIZE_TYPE__, void*); + // definitions for std::move namespace std { inline namespace foo { @@ -1540,6 +1542,48 @@ void aggregate() { }; }; + struct Embed { + int embed1; // #FIELD_EMBED1 + int embed2 [[clang::require_explicit_initialization]]; // #FIELD_EMBED2 + }; + struct EmbedDerived : Embed {}; + struct F { + Embed f1; + // expected-warning@+1 {{field in 'Embed' requires explicit 
initialization but is not explicitly initialized}} expected-note@#FIELD_EMBED2 {{'embed2' declared here}} + explicit F(const char(&)[1]) : f1() { + // expected-warning@+1 {{field in 'Embed' requires explicit initialization but is not explicitly initialized}} expected-note@#FIELD_EMBED2 {{'embed2' declared here}} + ::new(static_cast<void*>(&f1)) decltype(f1); + // expected-warning@+1 {{field in 'Embed' requires explicit initialization but is not explicitly initialized}} expected-note@#FIELD_EMBED2 {{'embed2' declared here}} + ::new(static_cast<void*>(&f1)) decltype(f1)(); +#if __cplusplus >= 202002L + // expected-warning@+1 {{field 'embed2' requires explicit initialization but is not explicitly initialized}} expected-note@#FIELD_EMBED2 {{'embed2' declared here}} + ::new(static_cast<void*>(&f1)) decltype(f1)(1); +#endif + // expected-warning@+1 {{field 'embed2' requires explicit initialization but is not explicitly initialized}} expected-note@#FIELD_EMBED2 {{'embed2' declared here}} + ::new(static_cast<void*>(&f1)) decltype(f1){1}; + } +#if __cplusplus >= 202002L + // expected-warning@+1 {{field 'embed2' requires explicit initialization but is not explicitly initialized}} expected-note@#FIELD_EMBED2 {{'embed2' declared here}} + explicit F(const char(&)[2]) : f1(1) {} +#else + explicit F(const char(&)[2]) : f1{1, 2} { } +#endif + // expected-warning@+1 {{field 'embed2' requires explicit initialization but is not explicitly initialized}} expected-note@#FIELD_EMBED2 {{'embed2' declared here}} + explicit F(const char(&)[3]) : f1{} {} + // expected-warning@+1 {{field 'embed2' requires explicit initialization but is not explicitly initialized}} expected-note@#FIELD_EMBED2 {{'embed2' declared here}} + explicit F(const char(&)[4]) : f1{1} {} + // expected-warning@+1 {{field 'embed2' requires explicit initialization but is not explicitly initialized}} expected-note@#FIELD_EMBED2 {{'embed2' declared here}} + explicit F(const char(&)[5]) : f1{.embed1 = 1} {} + }; + F ctors[] = { + F(""), + F("_"), + F("__"), + F("___"), + F("____") + }; + (void)ctors; + S::foo(S{1, 2, 3, 4}); S::foo(S{.s1 = 100, .s4 = 100}); S::foo(S{.s1 = 100}); // expected-warning {{field 's4' requires explicit initialization but is not explicitly initialized}} expected-note@#FIELD_S4 {{'s4' declared here}} diff --git clang/test/SemaCXX/unique_object_duplication.cpp clang/test/SemaCXX/unique_object_duplication.cpp new file mode 100644 index 000000000000..8a19fb7b8118 --- /dev/null +++ clang/test/SemaCXX/unique_object_duplication.cpp @@ -0,0 +1,16 @@ +// RUN: %clang_cc1 -fsyntax-only -verify=hidden -Wunique-object-duplication -fvisibility=hidden -Wno-unused-value %s +// RUN: %clang_cc1 -fsyntax-only -verify -Wunique-object-duplication -Wno-unused-value %s +// The check is currently disabled on windows. The test should fail because we're not getting the expected warnings. 
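+// For context, an illustrative sketch of the failure mode (not part of the
+// checked input): when a header defines a mutable inline variable and is
+// built with hidden visibility into two shared libraries,
+//   inline int counter = 0;                  // one copy per DSO
+//   inline int bump() { return ++counter; }
+// liba.so and libb.so each carry their own 'counter', so updates made
+// through one library are never observed through the other.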
+// XFAIL: target={{.*}}-windows{{.*}}, {{.*}}-ps{{(4|5)(-.+)?}}
+
+#include "unique_object_duplication.h"
+
+// Everything in these namespaces is defined in the cpp file,
+// so it won't get duplicated
+
+namespace GlobalTest {
+  float Test::allowedStaticMember1 = 2.3;
+}
+
+bool disallowed4 = true;
+constexpr inline bool disallowed5 = true;
\ No newline at end of file
diff --git clang/test/SemaCXX/unique_object_duplication.h clang/test/SemaCXX/unique_object_duplication.h
new file mode 100644
index 000000000000..5b2002c31be7
--- /dev/null
+++ clang/test/SemaCXX/unique_object_duplication.h
@@ -0,0 +1,157 @@
+/**
+ * This file contains tests for the -Wunique-object-duplication warning.
+ * See the warning's documentation for more information.
+ */
+
+#define HIDDEN __attribute__((visibility("hidden")))
+#define DEFAULT __attribute__((visibility("default")))
+
+// Helper functions
+constexpr int init_constexpr(int x) { return x; }
+extern double init_dynamic(int);
+
+/******************************************************************************
+ * Case one: Static local variables in an externally-visible function
+ ******************************************************************************/
+namespace StaticLocalTest {
+
+inline void has_static_locals_external() {
+  // Mutable
+  static int disallowedStatic1 = 0; // hidden-warning {{'disallowedStatic1' may be duplicated when built into a shared library: it is mutable, has hidden visibility, and external linkage}}
+  // Initialization might run more than once
+  static const double disallowedStatic2 = disallowedStatic1++; // hidden-warning {{initialization of 'disallowedStatic2' may run twice when built into a shared library: it has hidden visibility and external linkage}}
+
+  // OK, because immutable and compile-time-initialized
+  static constexpr int allowedStatic1 = 0;
+  static const float allowedStatic2 = 1;
+  static constexpr int allowedStatic3 = init_constexpr(2);
+  static const int allowedStatic4 = init_constexpr(3);
+}
+
+// Don't warn for non-inline functions, since they can't (legally) appear
+// in more than one TU in the first place.
+void has_static_locals_non_inline() {
+  // Mutable
+  static int allowedStatic1 = 0;
+  // Initialization might run more than once
+  static const double allowedStatic2 = allowedStatic1++;
+}
+
+// Everything in this function is OK because the function is TU-local
+static void has_static_locals_internal() {
+  static int allowedStatic1 = 0;
+  static double allowedStatic2 = init_dynamic(2);
+  static char allowedStatic3 = []() { return allowedStatic1++; }();
+  static constexpr int allowedStatic4 = 0;
+}
+
+namespace {

+// Everything in this function is OK because the function is also TU-local
+void has_static_locals_anon() {
+  static int allowedStatic1 = 0;
+  static double allowedStatic2 = init_dynamic(2);
+  static char allowedStatic3 = []() { return allowedStatic1++; }();
+  static constexpr int allowedStatic4 = init_constexpr(3);
+}
+
+} // Anonymous namespace
+
+HIDDEN inline void static_local_always_hidden() {
+  static int disallowedStatic1 = 3; // hidden-warning {{'disallowedStatic1' may be duplicated when built into a shared library: it is mutable, has hidden visibility, and external linkage}}
+  // expected-warning@-1 {{'disallowedStatic1' may be duplicated when built into a shared library: it is mutable, has hidden visibility, and external linkage}}
+  {
+    static int disallowedStatic2 = 3; // hidden-warning {{'disallowedStatic2' may be duplicated when built into a shared library: it is mutable, has hidden visibility, and external linkage}}
+    // expected-warning@-1 {{'disallowedStatic2' may be duplicated when built into a shared library: it is mutable, has hidden visibility, and external linkage}}
+  }
+
+  auto lmb = []() {
+    static int disallowedStatic3 = 3; // hidden-warning {{'disallowedStatic3' may be duplicated when built into a shared library: it is mutable, has hidden visibility, and external linkage}}
+    // expected-warning@-1 {{'disallowedStatic3' may be duplicated when built into a shared library: it is mutable, has hidden visibility, and external linkage}}
+  };
+}
+
+DEFAULT void static_local_never_hidden() {
+  static int allowedStatic1 = 3;
+
+  {
+    static int allowedStatic2 = 3;
+  }
+
+  auto lmb = []() {
+    static int allowedStatic3 = 3;
+  };
+}
+
+// Don't warn on this because it's not in a function
+const int setByLambda = ([]() { static int x = 3; return x++; })();
+
+inline void has_extern_local() {
+  extern int allowedAddressExtern; // Not a definition
+}
+
+inline void has_regular_local() {
+  int allowedAddressLocal = 0;
+}
+
+inline void has_thread_local() {
+  // thread_local variables are static by default
+  thread_local int disallowedThreadLocal = 0; // hidden-warning {{'disallowedThreadLocal' may be duplicated when built into a shared library: it is mutable, has hidden visibility, and external linkage}}
+}
+
+} // namespace StaticLocalTest
+
+/******************************************************************************
+ * Case two: Globals with external linkage
+ ******************************************************************************/
+namespace GlobalTest {
+  // Mutable
+  inline float disallowedGlobal1 = 3.14; // hidden-warning {{'disallowedGlobal1' may be duplicated when built into a shared library: it is mutable, has hidden visibility, and external linkage}}
+
+  // Initialization might run more than once
+  inline const double disallowedGlobal5 = disallowedGlobal1++; // hidden-warning {{initialization of 'disallowedGlobal5' may run twice when built into a shared library: it has hidden visibility and external linkage}}
+
+  // OK because internal linkage, so duplication is expected
+  static float allowedGlobal1 = 3.14;
+  const double allowedGlobal2 = init_dynamic(2);
+  static const char allowedGlobal3 = []() { return disallowedGlobal1++; }();
+  static inline double allowedGlobal4 = init_dynamic(2);
+
+  // OK, because immutable and compile-time-initialized
+  constexpr int allowedGlobal5 = 0;
+  const float allowedGlobal6 = 1;
+  constexpr int allowedGlobal7 = init_constexpr(2);
+  const int allowedGlobal8 = init_constexpr(3);
+
+  // We don't warn on this because non-inline variables can't (legally) appear
+  // in more than one TU.
+  float allowedGlobal9 = 3.14;
+
+  // Pointers and references need to be const-qualified at every level
+  inline float& nonConstReference = disallowedGlobal1; // hidden-warning {{'nonConstReference' may be duplicated when built into a shared library: it is mutable, has hidden visibility, and external linkage}}
+  const inline int& constReference = allowedGlobal5;
+
+  inline int* nonConstPointerToNonConst = nullptr; // hidden-warning {{'nonConstPointerToNonConst' may be duplicated when built into a shared library: it is mutable, has hidden visibility, and external linkage}}
+  inline int const* nonConstPointerToConst = nullptr; // hidden-warning {{'nonConstPointerToConst' may be duplicated when built into a shared library: it is mutable, has hidden visibility, and external linkage}}
+  inline int* const constPointerToNonConst = nullptr; // hidden-warning {{'constPointerToNonConst' may be duplicated when built into a shared library: it is mutable, has hidden visibility, and external linkage}}
+  inline int const* const constPointerToConst = nullptr;
+  // Don't warn on new because it tends to generate false positives
+  inline int const* const constPointerToConstNew = new int(7);
+
+  inline int const * const * const * const nestedConstPointer = nullptr;
+  inline int const * const ** const * const nestedNonConstPointer = nullptr; // hidden-warning {{'nestedNonConstPointer' may be duplicated when built into a shared library: it is mutable, has hidden visibility, and external linkage}}
+
+  struct Test {
+    static inline float disallowedStaticMember1; // hidden-warning {{'disallowedStaticMember1' may be duplicated when built into a shared library: it is mutable, has hidden visibility, and external linkage}}
+    // Defined below, in the header file
+    static float disallowedStaticMember2;
+    // Defined in the cpp file, so won't get duplicated
+    static float allowedStaticMember1;
+
+    // Tests here are sparse because the AddrTest case below will define plenty
+    // more, which aren't problematic to define (because they're immutable), but
+    // may still cause problems if their address is taken.
+ }; + + inline float Test::disallowedStaticMember2 = 2.3; // hidden-warning {{'disallowedStaticMember2' may be duplicated when built into a shared library: it is mutable, has hidden visibility, and external linkage}} +} // namespace GlobalTest \ No newline at end of file diff --git clang/test/SemaObjCXX/type-traits.mm clang/test/SemaObjCXX/type-traits.mm new file mode 100644 index 000000000000..81b9573b5219 --- /dev/null +++ clang/test/SemaObjCXX/type-traits.mm @@ -0,0 +1,17 @@ +// RUN: %clang_cc1 -triple x86_64-apple-darwin10 -fsyntax-only -verify -std=c++17 %s + +// expected-no-diagnostics + +@interface I; +@end + +@class C; + +static_assert(__is_same(__add_pointer(id), id*)); +static_assert(__is_same(__add_pointer(I), I*)); + +static_assert(__is_same(__remove_pointer(C*), C)); +static_assert(!__is_same(__remove_pointer(id), id)); +static_assert(__is_same(__remove_pointer(id*), id)); +static_assert(__is_same(__remove_pointer(__add_pointer(id)), id)); +static_assert(__is_same(__add_pointer(__remove_pointer(id)), id)); diff --git clang/test/SemaOpenACC/atomic-construct-ast.cpp clang/test/SemaOpenACC/atomic-construct-ast.cpp new file mode 100644 index 000000000000..6579b87941e5 --- /dev/null +++ clang/test/SemaOpenACC/atomic-construct-ast.cpp @@ -0,0 +1,170 @@ +// RUN: %clang_cc1 %s -fopenacc -ast-dump | FileCheck %s + +// Test this with PCH. +// RUN: %clang_cc1 %s -fopenacc -emit-pch -o %t %s +// RUN: %clang_cc1 %s -fopenacc -include-pch %t -ast-dump-all | FileCheck %s + +#ifndef PCH_HELPER +#define PCH_HELPER + +void foo(int v, int x) { + // CHECK: FunctionDecl{{.*}} foo 'void (int, int)' + // CHECK-NEXT: ParmVarDecl + // CHECK-NEXT: ParmVarDecl + // CHECK-NEXT: CompoundStmt + +// CHECK-NEXT: OpenACCAtomicConstruct{{.*}} atomic read +// CHECK-NEXT: BinaryOperator{{.*}} 'int' lvalue '=' +// CHECK-NEXT: DeclRefExpr{{.*}}'v' 'int' +// CHECK-NEXT: ImplicitCastExpr{{.*}}'int' <LValueToRValue> +// CHECK-NEXT: DeclRefExpr{{.*}}'x' 'int' +#pragma acc atomic read + v = x; + +// CHECK-NEXT: OpenACCAtomicConstruct{{.*}} atomic write +// CHECK-NEXT: BinaryOperator{{.*}} 'int' lvalue '=' +// CHECK-NEXT: DeclRefExpr{{.*}}'v' 'int' +// CHECK-NEXT: BinaryOperator{{.*}}'int' '+' +// CHECK-NEXT: ImplicitCastExpr{{.*}}'int' <LValueToRValue> +// CHECK-NEXT: DeclRefExpr{{.*}}'x' 'int' +// CHECK-NEXT: IntegerLiteral{{.*}} 'int' 1 +#pragma acc atomic write + v = x + 1; + +// CHECK-NEXT: OpenACCAtomicConstruct{{.*}} atomic update +// CHECK-NEXT: UnaryOperator{{.*}} 'int' postfix '++' +// CHECK-NEXT: DeclRefExpr{{.*}}'x' 'int' +#pragma acc atomic update + x++; +// CHECK-NEXT: OpenACCAtomicConstruct{{.*}} atomic <none> +// CHECK-NEXT: UnaryOperator{{.*}} 'int' postfix '--' +// CHECK-NEXT: DeclRefExpr{{.*}}'x' 'int' +#pragma acc atomic + x--; +// CHECK-NEXT: OpenACCAtomicConstruct{{.*}} atomic capture +// CHECK-NEXT: BinaryOperator{{.*}} 'int' lvalue '=' +// CHECK-NEXT: DeclRefExpr{{.*}}'v' 'int' +// CHECK-NEXT: UnaryOperator{{.*}} 'int' postfix '++' +// CHECK-NEXT: DeclRefExpr{{.*}}'x' 'int' +#pragma acc atomic capture + v = x++; + +// CHECK-NEXT: OpenACCAtomicConstruct{{.*}} atomic capture +// CHECK-NEXT: CompoundStmt +// CHECK-NEXT: UnaryOperator{{.*}} 'int' postfix '--' +// CHECK-NEXT: DeclRefExpr{{.*}}'x' 'int' +// CHECK-NEXT: BinaryOperator{{.*}} 'int' lvalue '=' +// CHECK-NEXT: DeclRefExpr{{.*}}'v' 'int' +// CHECK-NEXT: ImplicitCastExpr{{.*}}'int' <LValueToRValue> +// CHECK-NEXT: DeclRefExpr{{.*}}'x' 'int' +#pragma acc atomic capture + { x--; v = x; } + +} + +template<typename T, int I> +void templ_foo(T v, 
T x) { + // CHECK-NEXT: FunctionTemplateDecl{{.*}}templ_foo + // CHECK-NEXT: TemplateTypeParmDecl{{.*}} T + // CHECK-NEXT: NonTypeTemplateParmDecl{{.*}} I + // CHECK-NEXT: FunctionDecl{{.*}} templ_foo 'void (T, T)' + // CHECK-NEXT: ParmVarDecl{{.*}} v 'T' + // CHECK-NEXT: ParmVarDecl{{.*}} x 'T' + // CHECK-NEXT: CompoundStmt + +// CHECK-NEXT: OpenACCAtomicConstruct{{.*}} atomic read +// CHECK-NEXT: BinaryOperator{{.*}} '<dependent type>' '=' +// CHECK-NEXT: DeclRefExpr{{.*}}'v' 'T' +// CHECK-NEXT: DeclRefExpr{{.*}}'x' 'T' +#pragma acc atomic read + v = x; + +// CHECK-NEXT: OpenACCAtomicConstruct{{.*}} atomic write +// CHECK-NEXT: BinaryOperator{{.*}} '<dependent type>' '=' +// CHECK-NEXT: DeclRefExpr{{.*}}'v' 'T' +// CHECK-NEXT: BinaryOperator{{.*}}'<dependent type>' '+' +// CHECK-NEXT: DeclRefExpr{{.*}}'x' 'T' +// CHECK-NEXT: DeclRefExpr{{.*}} 'I' 'int' +#pragma acc atomic write + v = x + I; + +// CHECK-NEXT: OpenACCAtomicConstruct{{.*}} atomic update +// CHECK-NEXT: UnaryOperator{{.*}} '<dependent type>' postfix '++' +// CHECK-NEXT: DeclRefExpr{{.*}}'x' 'T' +#pragma acc atomic update + x++; +// CHECK-NEXT: OpenACCAtomicConstruct{{.*}} atomic <none> +// CHECK-NEXT: UnaryOperator{{.*}} '<dependent type>' postfix '--' +// CHECK-NEXT: DeclRefExpr{{.*}}'x' 'T' +#pragma acc atomic + x--; +// CHECK-NEXT: OpenACCAtomicConstruct{{.*}} atomic capture +// CHECK-NEXT: BinaryOperator{{.*}} '<dependent type>' '=' +// CHECK-NEXT: DeclRefExpr{{.*}}'v' 'T' +// CHECK-NEXT: UnaryOperator{{.*}} '<dependent type>' postfix '++' +// CHECK-NEXT: DeclRefExpr{{.*}}'x' 'T' +#pragma acc atomic capture + v = x++; + +// CHECK-NEXT: OpenACCAtomicConstruct{{.*}} atomic capture +// CHECK-NEXT: CompoundStmt +// CHECK-NEXT: UnaryOperator{{.*}} '<dependent type>' postfix '--' +// CHECK-NEXT: DeclRefExpr{{.*}}'x' 'T' +// CHECK-NEXT: BinaryOperator{{.*}} '<dependent type>' '=' +// CHECK-NEXT: DeclRefExpr{{.*}}'v' 'T' +// CHECK-NEXT: DeclRefExpr{{.*}}'x' 'T' +#pragma acc atomic capture + { x--; v = x; } + + // CHECK-NEXT: FunctionDecl{{.*}} templ_foo 'void (int, int)' implicit_instantiation + // CHECK-NEXT: TemplateArgument type 'int' + // CHECK-NEXT: BuiltinType{{.*}} 'int' + // CHECK-NEXT: TemplateArgument integral '5' + // CHECK-NEXT: ParmVarDecl{{.*}} v 'int' + // CHECK-NEXT: ParmVarDecl{{.*}} x 'int' + // CHECK-NEXT: CompoundStmt + +// CHECK-NEXT: OpenACCAtomicConstruct{{.*}} atomic read +// CHECK-NEXT: BinaryOperator{{.*}} 'int' lvalue '=' +// CHECK-NEXT: DeclRefExpr{{.*}}'v' 'int' +// CHECK-NEXT: ImplicitCastExpr{{.*}}'int' <LValueToRValue> +// CHECK-NEXT: DeclRefExpr{{.*}}'x' 'int' + +// CHECK-NEXT: OpenACCAtomicConstruct{{.*}} atomic write +// CHECK-NEXT: BinaryOperator{{.*}} 'int' lvalue '=' +// CHECK-NEXT: DeclRefExpr{{.*}}'v' 'int' +// CHECK-NEXT: BinaryOperator{{.*}}'int' '+' +// CHECK-NEXT: ImplicitCastExpr{{.*}}'int' <LValueToRValue> +// CHECK-NEXT: DeclRefExpr{{.*}}'x' 'int' +// CHECK-NEXT: SubstNonTypeTemplateParmExpr{{.*}} 'int' +// CHECK-NEXT: NonTypeTemplateParmDecl{{.*}} 'int'{{.*}}I +// CHECK-NEXT: IntegerLiteral{{.*}} 'int' 5 + +// CHECK-NEXT: OpenACCAtomicConstruct{{.*}} atomic update +// CHECK-NEXT: UnaryOperator{{.*}} 'int' postfix '++' +// CHECK-NEXT: DeclRefExpr{{.*}}'x' 'int' + +// CHECK-NEXT: OpenACCAtomicConstruct{{.*}} atomic <none> +// CHECK-NEXT: UnaryOperator{{.*}} 'int' postfix '--' +// CHECK-NEXT: DeclRefExpr{{.*}}'x' 'int' + +// CHECK-NEXT: OpenACCAtomicConstruct{{.*}} atomic capture +// CHECK-NEXT: BinaryOperator{{.*}} 'int' lvalue '=' +// CHECK-NEXT: DeclRefExpr{{.*}}'v' 'int' +// 
CHECK-NEXT: UnaryOperator{{.*}} 'int' postfix '++' +// CHECK-NEXT: DeclRefExpr{{.*}}'x' 'int' + +// CHECK-NEXT: OpenACCAtomicConstruct{{.*}} atomic capture +// CHECK-NEXT: CompoundStmt +// CHECK-NEXT: UnaryOperator{{.*}} 'int' postfix '--' +// CHECK-NEXT: DeclRefExpr{{.*}}'x' 'int' +// CHECK-NEXT: BinaryOperator{{.*}} 'int' lvalue '=' +// CHECK-NEXT: DeclRefExpr{{.*}}'v' 'int' +// CHECK-NEXT: ImplicitCastExpr{{.*}}'int' <LValueToRValue> +// CHECK-NEXT: DeclRefExpr{{.*}}'x' 'int' +} + +void use() { + templ_foo<int, 5>(1, 2); +} +#endif diff --git clang/test/SemaOpenACC/atomic-construct.cpp clang/test/SemaOpenACC/atomic-construct.cpp new file mode 100644 index 000000000000..7357d91d704f --- /dev/null +++ clang/test/SemaOpenACC/atomic-construct.cpp @@ -0,0 +1,1846 @@ +// RUN: %clang_cc1 %s -fopenacc -Wno-unused-value -verify + +void NormalFunc(int I) { + // No clauses are valid, but we parse them anyway, just mark them as not valid + // on this construct. + + // expected-error@+1{{OpenACC 'copy' clause is not valid on 'atomic' directive}} +#pragma acc atomic copy(I) + I = I + 1; + // expected-error@+1{{OpenACC 'copy' clause is not valid on 'atomic' directive}} +#pragma acc atomic read copy(I) + I = I; +} + +struct Struct{ + Struct *getPtr(); + Struct &operator++(); + Struct &operator--(); + Struct &operator++(int); + Struct &operator--(int); + + Struct &operator+=(int); + Struct &operator*=(int); + Struct &operator-=(int); + Struct &operator/=(int); + Struct &operator&=(int); + Struct &operator|=(int); + Struct &operator<<=(int); + Struct &operator>>=(int); + Struct &operator^=(int); + Struct &operator%=(int); + Struct &operator!=(int); + Struct &operator+(); + Struct &operator-(); + + operator int(); + void operator()(); + Struct &operator*(); + Struct &operator=(int); +}; + +int operator+(Struct&, int); +int operator+(int, Struct&); +Struct &operator+(Struct&, Struct&); +Struct &operator*(Struct&, Struct&); +Struct &operator-(Struct&, Struct&); + +Struct S1, S2; + +template<typename T> +T &getRValue(); + +template<typename T> +void AtomicReadTemplate(T LHS, T RHS) { +#pragma acc atomic read + LHS = RHS; + + T *LHSPtr, *RHSPtr; + +#pragma acc atomic read + LHSPtr = RHSPtr; + + // expected-error@+2{{statement associated with OpenACC 'atomic read' directive is invalid}} + // expected-note@+2{{right operand to assignment expression must be an l-value}} +#pragma acc atomic read + LHS = RHS + 1; + +#pragma acc atomic read + *LHSPtr = RHS; + +#pragma acc atomic read + LHS = *RHSPtr; + + // expected-error@+2{{statement associated with OpenACC 'atomic read' directive is invalid}} + // expected-note@+2{{right operand to assignment expression must be an l-value}} +#pragma acc atomic read + LHS = getRValue<T>(); +} + +template<typename T> +void AtomicReadTemplate2(T LHS, T RHS) { + // expected-error@+2{{statement associated with OpenACC 'atomic read' directive is invalid}} + // expected-note@+2{{left operand to assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic read + LHS = RHS; + + T *LHSPtr, *RHSPtr; + // Fine, now a pointer. 
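Note on the 'read' form exercised in these tests: OpenACC 'atomic read' only accepts 'v = x;' where both operands are scalar l-values, which is why an RHS like 'RHS + 1' or 'getRValue<T>()' is diagnosed. As a rough mental model only (my analogy via C++20 std::atomic_ref, not how Clang actually lowers the construct; the function name is made up):

    #include <atomic>

    // Sketch: '#pragma acc atomic read; v = x;' behaves like an atomic load
    // of x followed by an ordinary store to v. Only the read of x is atomic,
    // so x must be an addressable scalar, not an arbitrary expression.
    void atomic_read_model(int &v, int &x) {
      std::atomic_ref<int> xref(x);
      v = xref.load();
    }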
+#pragma acc atomic read + LHSPtr = RHSPtr; + + // expected-error@+2{{statement associated with OpenACC 'atomic read' directive is invalid}} + // expected-note@+2{{right operand to assignment expression must be an l-value}} +#pragma acc atomic read + LHS = *RHS.getPtr(); + + // expected-error@+2{{statement associated with OpenACC 'atomic read' directive is invalid}} + // expected-note@+2{{left operand to assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic read + *LHSPtr = RHS; + + // expected-error@+2{{statement associated with OpenACC 'atomic read' directive is invalid}} + // expected-note@+2{{left operand to assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic read + LHS = *RHSPtr; + + // expected-error@+2{{statement associated with OpenACC 'atomic read' directive is invalid}} + // expected-note@+2{{left operand to assignment expression must be an l-value}} +#pragma acc atomic read + getRValue<T>() = getRValue<T>(); + + // expected-error@+2{{statement associated with OpenACC 'atomic read' directive is invalid}} + // expected-note@+2{{right operand to assignment expression must be an l-value}} +#pragma acc atomic read + LHS = getRValue<T>(); +} + +void AtomicRead(int LHS, int RHS) { + AtomicReadTemplate(LHS, RHS); + AtomicReadTemplate2(S1, S2); // expected-note{{in instantiation of function template specialization}} + +#pragma acc atomic read + LHS = RHS; + + int *LHSPtr, *RHSPtr; + +#pragma acc atomic read + LHSPtr = RHSPtr; + + // expected-error@+2{{statement associated with OpenACC 'atomic read' directive is invalid}} + // expected-note@+2{{left operand to assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic read + S1 = S2; + + // expected-error@+2{{statement associated with OpenACC 'atomic read' directive is invalid}} + // expected-note@+2{{right operand to assignment expression must be an l-value}} +#pragma acc atomic read + LHS = RHS + 1; + +#pragma acc atomic read + *LHSPtr = RHS; + +#pragma acc atomic read + LHS = *RHSPtr; + + // There is no way to test that = is an overloaded operator, since there + // really isn't a way to create an operator= without a class type on one side + // or the other. +} + +template<typename T> +void AtomicWriteTemplate(T LHS, T RHS) { +#pragma acc atomic write + LHS = RHS; + + T *LHSPtr, *RHSPtr; +#pragma acc atomic write + LHSPtr = RHSPtr; + +#pragma acc atomic write + *LHSPtr = *RHSPtr; + + // allowed, expr is ok. 
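For the 'write' form, only the store to the left-hand side is atomic; the right-hand side may be any expression and is evaluated non-atomically, which is why 'LHS = RHS * 2' is accepted below even though 'atomic read' rejects it. A minimal sketch under the same std::atomic_ref analogy (illustrative only, not Clang's lowering):

    #include <atomic>

    // Sketch: '#pragma acc atomic write; x = expr;' evaluates expr normally
    // and performs one atomic store into x.
    void atomic_write_model(int &x, int rhs) {
      std::atomic_ref<int> xref(x);
      xref.store(rhs * 2);
    }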
+#pragma acc atomic write + LHS = *RHSPtr; + +#pragma acc atomic write + LHS = RHS * 2; + + // expected-error@+2{{statement associated with OpenACC 'atomic write' directive is invalid}} + // expected-note@+2{{left operand to assignment expression must be an l-value}} +#pragma acc atomic write + getRValue<T>() = getRValue<T>(); + +#pragma acc atomic write + LHS = getRValue<T>(); +} + +template<typename T> +void AtomicWriteTemplate2(T LHS, T RHS) { + // expected-error@+2{{statement associated with OpenACC 'atomic write' directive is invalid}} + // expected-note@+2{{left operand to assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic write + LHS = RHS; + + T *LHSPtr, *RHSPtr; +#pragma acc atomic write + LHSPtr = RHSPtr; + + // expected-error@+2{{statement associated with OpenACC 'atomic write' directive is invalid}} + // expected-note@+2{{left operand to assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic write + LHS = *RHSPtr; + +#pragma acc atomic write + LHSPtr = RHS.getPtr(); + + // expected-error@+2{{statement associated with OpenACC 'atomic write' directive is invalid}} + // expected-note@+2{{left operand to assignment expression must be an l-value}} +#pragma acc atomic write + getRValue<T>() = getRValue<T>(); + + // expected-error@+2{{statement associated with OpenACC 'atomic write' directive is invalid}} + // expected-note@+2{{left operand to assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic write + LHS = getRValue<T>(); +} + +void AtomicWrite(int LHS, int RHS) { + AtomicWriteTemplate(LHS, RHS); + AtomicWriteTemplate2(S1, S2); // expected-note{{in instantiation of function template specialization}} + +#pragma acc atomic write + LHS = RHS; + + int *LHSPtr, *RHSPtr; +#pragma acc atomic write + LHSPtr = RHSPtr; + +#pragma acc atomic write + *LHSPtr = *RHSPtr; + + // allowed, expr is ok. 
+#pragma acc atomic write + LHS = *RHSPtr; + +#pragma acc atomic write + LHS = RHS * 2; +} + +template<typename T> +void AtomicUpdateTemplate(T LHS, T RHS) { +#pragma acc atomic + LHS++; + +#pragma acc atomic update + LHS--; + +#pragma acc atomic + ++LHS; + +#pragma acc atomic update + --LHS; + + // expected-error@+2{{statement associated with OpenACC 'atomic' directive is invalid}} + // expected-note@+2{{unary operator not supported, only increment and decrement operations permitted}} +#pragma acc atomic + +LHS; + + // expected-error@+2{{statement associated with OpenACC 'atomic update' directive is invalid}} + // expected-note@+2{{unary operator not supported, only increment and decrement operations permitted}} +#pragma acc atomic update + -LHS; + + // expected-error@+2{{statement associated with OpenACC 'atomic update' directive is invalid}} + // expected-note@+2{{expected binary operation on right hand side of assignment operator}} +#pragma acc atomic update + LHS = RHS; + + T *LHSPtr, *RHSPtr; + + // expected-error@+2{{statement associated with OpenACC 'atomic' directive is invalid}} + // expected-note@+2{{expected binary operation on right hand side of assignment operator}} +#pragma acc atomic + *LHSPtr = *RHSPtr; + + // x binop= expr; +#pragma acc atomic + LHS += 1 + RHS; +#pragma acc atomic update + LHS *= 1 + RHS; +#pragma acc atomic + LHS -= 1 + RHS; +#pragma acc atomic update + LHS /= 1 + RHS; +#pragma acc atomic + LHS &= 1 + RHS; +#pragma acc atomic update + LHS ^= 1 + RHS; +#pragma acc atomic + LHS |= 1 + RHS; +#pragma acc atomic update + LHS <<= 1 + RHS; +#pragma acc atomic + LHS >>= 1 + RHS; + + // expected-error@+2{{statement associated with OpenACC 'atomic update' directive is invalid}} + // expected-note@+2{{compound binary operator not supported, only +=, *=, -=, /=, &=, ^=, |=, <<=, or >>= are permitted}} +#pragma acc atomic update + LHS != 1 + RHS; + + // expected-error@+2{{statement associated with OpenACC 'atomic update' directive is invalid}} + // expected-note@+2{{compound binary operator not supported, only +=, *=, -=, /=, &=, ^=, |=, <<=, or >>= are permitted}} +#pragma acc atomic update + LHS <= 1 + RHS; + + // expected-error@+2{{statement associated with OpenACC 'atomic update' directive is invalid}} + // expected-note@+2{{compound binary operator not supported, only +=, *=, -=, /=, &=, ^=, |=, <<=, or >>= are permitted}} +#pragma acc atomic update + LHS >= 1 + RHS; + + // expected-error@+2{{statement associated with OpenACC 'atomic update' directive is invalid}} + // expected-note@+2{{compound binary operator not supported, only +=, *=, -=, /=, &=, ^=, |=, <<=, or >>= are permitted}} +#pragma acc atomic update + LHS %= 1 + RHS; + + // x = x binop expr. 
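The 'update' forms checked here ('x++', 'x binop= expr', and the 'x = x binop expr' group that follows) all describe a single read-modify-write of x, which is why only operators with an atomic RMW interpretation are permitted. A sketch for the '+' case, assuming the std::atomic_ref analogy used above (illustrative only):

    #include <atomic>

    // Sketch: '#pragma acc atomic update; x += expr;' (or 'x = x + expr;')
    // is one atomic read-modify-write of x.
    void atomic_update_model(int &x, int expr) {
      std::atomic_ref<int> xref(x);
      xref.fetch_add(expr);
    }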
+#pragma acc atomic + LHS = LHS + getRValue<T>(); +#pragma acc atomic update + LHS = LHS * getRValue<T>(); +#pragma acc atomic update + LHS = LHS - getRValue<T>(); +#pragma acc atomic update + LHS = LHS / getRValue<T>(); +#pragma acc atomic update + LHS = LHS & getRValue<T>(); +#pragma acc atomic update + LHS = LHS ^ getRValue<T>(); +#pragma acc atomic update + LHS = LHS | getRValue<T>(); +#pragma acc atomic update + LHS = LHS << getRValue<T>(); +#pragma acc atomic update + LHS = LHS >> getRValue<T>(); + // expected-error@+2{{statement associated with OpenACC 'atomic update' directive is invalid}} + // expected-note@+2{{binary operator not supported, only +, *, -, /, &, ^, |, <<, or >> are permitted}} +#pragma acc atomic update + LHS = LHS < getRValue<T>(); + // expected-error@+2{{statement associated with OpenACC 'atomic update' directive is invalid}} + // expected-note@+2{{binary operator not supported, only +, *, -, /, &, ^, |, <<, or >> are permitted}} +#pragma acc atomic update + LHS = LHS > getRValue<T>(); + // expected-error@+2{{statement associated with OpenACC 'atomic update' directive is invalid}} + // expected-note@+2{{binary operator not supported, only +, *, -, /, &, ^, |, <<, or >> are permitted}} +#pragma acc atomic update + LHS = LHS <= getRValue<T>(); + // expected-error@+2{{statement associated with OpenACC 'atomic update' directive is invalid}} + // expected-note@+2{{binary operator not supported, only +, *, -, /, &, ^, |, <<, or >> are permitted}} +#pragma acc atomic update + LHS = LHS >= getRValue<T>(); +#pragma acc atomic update + LHS = LHS ^ getRValue<T>(); + + + // x = expr binop x. +#pragma acc atomic + LHS = getRValue<T>() + LHS; +#pragma acc atomic update + LHS = getRValue<T>() * LHS; +#pragma acc atomic update + LHS = getRValue<T>() - LHS; +#pragma acc atomic update + LHS = getRValue<T>() / LHS; +#pragma acc atomic update + LHS = getRValue<T>() & LHS; +#pragma acc atomic update + LHS = getRValue<T>() ^ LHS; +#pragma acc atomic update + LHS = getRValue<T>() | LHS; +#pragma acc atomic update + LHS = getRValue<T>() << LHS; +#pragma acc atomic update + LHS = getRValue<T>() >> LHS; + // expected-error@+2{{statement associated with OpenACC 'atomic update' directive is invalid}} + // expected-note@+2{{binary operator not supported, only +, *, -, /, &, ^, |, <<, or >> are permitted}} +#pragma acc atomic update + LHS = getRValue<T>() < LHS; + // expected-error@+2{{statement associated with OpenACC 'atomic update' directive is invalid}} + // expected-note@+2{{binary operator not supported, only +, *, -, /, &, ^, |, <<, or >> are permitted}} +#pragma acc atomic update + LHS = getRValue<T>() > LHS; + // expected-error@+2{{statement associated with OpenACC 'atomic update' directive is invalid}} + // expected-note@+2{{binary operator not supported, only +, *, -, /, &, ^, |, <<, or >> are permitted}} +#pragma acc atomic update + LHS = getRValue<T>() <= LHS; + // expected-error@+2{{statement associated with OpenACC 'atomic update' directive is invalid}} + // expected-note@+2{{binary operator not supported, only +, *, -, /, &, ^, |, <<, or >> are permitted}} +#pragma acc atomic update + LHS = getRValue<T>() >= LHS; +#pragma acc atomic update + LHS = getRValue<T>() ^ LHS; + +#pragma acc atomic update + LHS = LHS + getRValue<T>(); + // expected-error@+2{{statement associated with OpenACC 'atomic update' directive is invalid}} + // expected-note@+2{{left hand side of assignment operation('LHS') must match one side of the sub-operation on the right hand side('RHS' and 
'getRValue<T>()')}} +#pragma acc atomic update + LHS = RHS + getRValue<T>(); + +#pragma acc atomic update + LHS = getRValue<T>() - LHS; + // expected-error@+2{{statement associated with OpenACC 'atomic update' directive is invalid}} + // expected-note@+2{{left hand side of assignment operation('LHS') must match one side of the sub-operation on the right hand side('getRValue<T>()' and 'RHS')}} +#pragma acc atomic update + LHS = getRValue<T>() + RHS; +} + +template<typename T> +void AtomicUpdateTemplate2(T LHS, T RHS) { + // expected-error@+2{{statement associated with OpenACC 'atomic' directive is invalid}} + // expected-note@+2{{operand to increment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic + LHS++; + + // expected-error@+2{{statement associated with OpenACC 'atomic update' directive is invalid}} + // expected-note@+2{{operand to decrement expression must be of scalar type (was 'Struct')}} +#pragma acc atomic update + LHS--; + + // expected-error@+2{{statement associated with OpenACC 'atomic' directive is invalid}} + // expected-note@+2{{operand to increment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic + ++LHS; + + // expected-error@+2{{statement associated with OpenACC 'atomic update' directive is invalid}} + // expected-note@+2{{operand to decrement expression must be of scalar type (was 'Struct')}} +#pragma acc atomic update + --LHS; + + // expected-error@+2{{statement associated with OpenACC 'atomic' directive is invalid}} + // expected-note@+2{{unary operator not supported, only increment and decrement operations permitted}} +#pragma acc atomic + +LHS; + + // expected-error@+2{{statement associated with OpenACC 'atomic update' directive is invalid}} + // expected-note@+2{{unary operator not supported, only increment and decrement operations permitted}} +#pragma acc atomic update + -LHS; + + // expected-error@+2{{statement associated with OpenACC 'atomic' directive is invalid}} + // expected-note@+2{{expected assignment, compound assignment, increment, or decrement expression}} +#pragma acc atomic + LHS(); + + // expected-error@+2{{statement associated with OpenACC 'atomic' directive is invalid}} + // expected-note@+2{{unary operator not supported, only increment and decrement operations permitted}} +#pragma acc atomic + *LHS; + + // expected-error@+2{{statement associated with OpenACC 'atomic update' directive is invalid}} + // expected-note@+2{{expected binary operation on right hand side of assignment operator}} +#pragma acc atomic update + LHS = RHS; + + T *LHSPtr, *RHSPtr; + + // expected-error@+2{{statement associated with OpenACC 'atomic' directive is invalid}} + // expected-note@+2{{expected binary operation on right hand side of assignment operator}} +#pragma acc atomic + *LHSPtr = *RHSPtr; + + // x binop= expr; + // expected-error@+2{{statement associated with OpenACC 'atomic' directive is invalid}} + // expected-note@+2{{left operand to compound assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic + LHS += 1 + RHS; + // expected-error@+2{{statement associated with OpenACC 'atomic update' directive is invalid}} + // expected-note@+2{{left operand to compound assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic update + LHS *= 1 + RHS; + // expected-error@+2{{statement associated with OpenACC 'atomic' directive is invalid}} + // expected-note@+2{{left operand to compound assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic + LHS -= 1 + 
RHS; + // expected-error@+2{{statement associated with OpenACC 'atomic update' directive is invalid}} + // expected-note@+2{{left operand to compound assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic update + LHS /= 1 + RHS; + // expected-error@+2{{statement associated with OpenACC 'atomic' directive is invalid}} + // expected-note@+2{{left operand to compound assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic + LHS &= 1 + RHS; + // expected-error@+2{{statement associated with OpenACC 'atomic update' directive is invalid}} + // expected-note@+2{{left operand to compound assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic update + LHS ^= 1 + RHS; + // expected-error@+2{{statement associated with OpenACC 'atomic' directive is invalid}} + // expected-note@+2{{left operand to compound assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic + LHS |= 1 + RHS; + // expected-error@+2{{statement associated with OpenACC 'atomic update' directive is invalid}} + // expected-note@+2{{left operand to compound assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic update + LHS <<= 1 + RHS; + // expected-error@+2{{statement associated with OpenACC 'atomic' directive is invalid}} + // expected-note@+2{{left operand to compound assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic + LHS >>= 1 + RHS; + + // expected-error@+2{{statement associated with OpenACC 'atomic update' directive is invalid}} + // expected-note@+2{{compound binary operator not supported, only +=, *=, -=, /=, &=, ^=, |=, <<=, or >>= are permitted}} +#pragma acc atomic update + LHS != 1 + RHS; + + // expected-error@+2{{statement associated with OpenACC 'atomic update' directive is invalid}} + // expected-note@+2{{compound binary operator not supported, only +=, *=, -=, /=, &=, ^=, |=, <<=, or >>= are permitted}} +#pragma acc atomic update + LHS %= 1 + RHS; + + // x = x binop expr. + // expected-error@+2{{statement associated with OpenACC 'atomic' directive is invalid}} + // expected-note@+2{{left operand to assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic + LHS = LHS + getRValue<T>(); + // expected-error@+2{{statement associated with OpenACC 'atomic update' directive is invalid}} + // expected-note@+2{{left operand to assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic update + LHS = LHS * getRValue<T>(); + // expected-error@+2{{statement associated with OpenACC 'atomic update' directive is invalid}} + // expected-note@+2{{left operand to assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic update + LHS = LHS - getRValue<T>(); + + // x = expr binop x. 
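The "must match one side" diagnostics exist because the updated variable has to appear on exactly one side of the RHS operation; that is what keeps the statement expressible as a single read-modify-write even for non-commutative operators. A hedged compare-exchange sketch for 'x = expr - x' (my illustration, not Clang's codegen):

    #include <atomic>

    // Sketch: 'x = expr binop x' with a non-commutative binop ('-') can be
    // modeled as a CAS loop; compare_exchange_weak refreshes 'old' on failure.
    void atomic_update_rsub_model(int &x, int expr) {
      std::atomic_ref<int> xref(x);
      int old = xref.load();
      while (!xref.compare_exchange_weak(old, expr - old)) {
        // retry with the refreshed value of 'old'
      }
    }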
+ // expected-error@+2{{statement associated with OpenACC 'atomic' directive is invalid}} + // expected-note@+2{{left operand to assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic + LHS = getRValue<T>() + LHS; + // expected-error@+2{{statement associated with OpenACC 'atomic update' directive is invalid}} + // expected-note@+2{{left operand to assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic update + LHS = getRValue<T>() * LHS; + // expected-error@+2{{statement associated with OpenACC 'atomic update' directive is invalid}} + // expected-note@+2{{left operand to assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic update + LHS = getRValue<T>() - LHS; + + // expected-error@+2{{statement associated with OpenACC 'atomic update' directive is invalid}} + // expected-note@+2{{left operand to assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic update + LHS = LHS + getRValue<T>(); + // expected-error@+2{{statement associated with OpenACC 'atomic update' directive is invalid}} + // expected-note@+2{{left hand side of assignment operation('LHS') must match one side of the sub-operation on the right hand side('RHS' and 'getRValue<T>()')}} +#pragma acc atomic update + LHS = RHS + getRValue<T>(); + + // expected-error@+2{{statement associated with OpenACC 'atomic update' directive is invalid}} + // expected-note@+2{{left operand to assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic update + LHS = getRValue<T>() - LHS; + // expected-error@+2{{statement associated with OpenACC 'atomic update' directive is invalid}} + // expected-note@+2{{left hand side of assignment operation('LHS') must match one side of the sub-operation on the right hand side('getRValue<T>()' and 'RHS')}} +#pragma acc atomic update + LHS = getRValue<T>() + RHS; +} + +void AtomicUpdate() { + AtomicUpdateTemplate(1, 2); + AtomicUpdateTemplate2(S1, S2); //expected-note{{in instantiation of function template specialization}} + + int I, J; + +#pragma acc atomic + I++; +#pragma acc atomic update + --I; + // expected-error@+2{{statement associated with OpenACC 'atomic' directive is invalid}} + // expected-note@+2{{operand to increment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic + S1++; + // expected-error@+2{{statement associated with OpenACC 'atomic update' directive is invalid}} + // expected-note@+2{{operand to decrement expression must be of scalar type (was 'Struct')}} +#pragma acc atomic update + --S2; + + // expected-error@+2{{statement associated with OpenACC 'atomic' directive is invalid}} + // expected-note@+2{{unary operator not supported, only increment and decrement operations permitted}} +#pragma acc atomic + +I; + // expected-error@+2{{statement associated with OpenACC 'atomic update' directive is invalid}} + // expected-note@+2{{unary operator not supported, only increment and decrement operations permitted}} +#pragma acc atomic update + -J; + +#pragma acc atomic update + I ^= 1 + J; + + // expected-error@+2{{statement associated with OpenACC 'atomic update' directive is invalid}} + // expected-note@+2{{compound binary operator not supported, only +=, *=, -=, /=, &=, ^=, |=, <<=, or >>= are permitted}} +#pragma acc atomic update + I%= 1 + J; + + // expected-error@+2{{statement associated with OpenACC 'atomic update' directive is invalid}} + // expected-note@+2{{left operand to compound assignment expression must be of scalar type (was 'Struct')}} 
+#pragma acc atomic update + S1 ^= 1 + J; + + // expected-error@+2{{statement associated with OpenACC 'atomic update' directive is invalid}} + // expected-note@+2{{compound binary operator not supported, only +=, *=, -=, /=, &=, ^=, |=, <<=, or >>= are permitted}} +#pragma acc atomic update + S2 %= 1 + J; + +#pragma acc atomic update + I = I + getRValue<int>(); + // expected-error@+2{{statement associated with OpenACC 'atomic update' directive is invalid}} + // expected-note@+2{{left hand side of assignment operation('I') must match one side of the sub-operation on the right hand side('J' and 'getRValue<int>()')}} +#pragma acc atomic update + I = J + getRValue<int>(); + +#pragma acc atomic update + I = getRValue<int>() - I; + // expected-error@+2{{statement associated with OpenACC 'atomic update' directive is invalid}} + // expected-note@+2{{left hand side of assignment operation('I') must match one side of the sub-operation on the right hand side('getRValue<int>()' and 'J')}} +#pragma acc atomic update + I = getRValue<int>() + J; +} + +template<typename T> +void AtomicCaptureTemplateSimple(T LHS, T RHS) { + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{expected assignment expression}} +#pragma acc atomic capture + LHS++; + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{expected assignment expression}} +#pragma acc atomic capture +--LHS; + + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{expected assignment expression}} +#pragma acc atomic capture + LHS += 1 + RHS; + + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{expected assignment, compound assignment, increment, or decrement expression}} +#pragma acc atomic capture + LHS = RHS; + +#pragma acc atomic capture + LHS = RHS++; + +#pragma acc atomic capture + LHS = RHS--; + +#pragma acc atomic capture + LHS = ++RHS; + +#pragma acc atomic capture + LHS = --RHS; + + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{unary operator not supported, only increment and decrement operations permitted}} +#pragma acc atomic capture + LHS = +RHS; + +#pragma acc atomic capture + LHS = RHS += 1 + RHS; +#pragma acc atomic capture + LHS = RHS *= 1 + RHS; +#pragma acc atomic capture + LHS = RHS -= 1 + RHS; +#pragma acc atomic capture + LHS = RHS /= 1 + RHS; +#pragma acc atomic capture + LHS = RHS &= 1 + RHS; +#pragma acc atomic capture + LHS = RHS ^= 1 + RHS; +#pragma acc atomic capture + LHS = RHS >>= 1 + RHS; +#pragma acc atomic capture + LHS = RHS |= 1 + RHS; +#pragma acc atomic capture + LHS = RHS <<= 1 + RHS; +#pragma acc atomic capture + LHS = RHS >>= 1 + RHS; + +#pragma acc atomic capture + LHS = RHS ^= 1 + RHS; + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{compound binary operator not supported, only +=, *=, -=, /=, &=, ^=, |=, <<=, or >>= are permitted}} +#pragma acc atomic capture + LHS = RHS <= 1 + RHS; + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{compound binary operator not supported, only +=, *=, -=, /=, &=, ^=, |=, <<=, or >>= are permitted}} +#pragma acc atomic capture + LHS = RHS >= 1 + RHS; + + // expected-error@+2{{statement associated 
with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{compound binary operator not supported, only +=, *=, -=, /=, &=, ^=, |=, <<=, or >>= are permitted}} +#pragma acc atomic capture + LHS = RHS + 1; + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{compound binary operator not supported, only +=, *=, -=, /=, &=, ^=, |=, <<=, or >>= are permitted}} +#pragma acc atomic capture + LHS = RHS < 1 + RHS; + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{compound binary operator not supported, only +=, *=, -=, /=, &=, ^=, |=, <<=, or >>= are permitted}} +#pragma acc atomic capture + LHS = RHS > 1 + RHS; + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{compound binary operator not supported, only +=, *=, -=, /=, &=, ^=, |=, <<=, or >>= are permitted}} +#pragma acc atomic capture + LHS = RHS ^ 1 + RHS; + +#pragma acc atomic capture + LHS = RHS = RHS + 1; +#pragma acc atomic capture + LHS = RHS = 1 + RHS; +#pragma acc atomic capture + LHS = RHS = RHS * 1; +#pragma acc atomic capture + LHS = RHS = 1 * RHS; +#pragma acc atomic capture + LHS = RHS = RHS / 1; +#pragma acc atomic capture + LHS = RHS = 1 / RHS; +#pragma acc atomic capture + LHS = RHS = RHS ^ 1; + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{binary operator not supported, only +, *, -, /, &, ^, |, <<, or >> are permitted}} +#pragma acc atomic capture + LHS = RHS = 1 % RHS; + + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{binary operator not supported, only +, *, -, /, &, ^, |, <<, or >> are permitted}} +#pragma acc atomic capture + LHS = RHS = RHS < 1; + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{binary operator not supported, only +, *, -, /, &, ^, |, <<, or >> are permitted}} +#pragma acc atomic capture + LHS = RHS = 1 > RHS; + + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{left hand side of assignment operation('LHS') must match one side of the sub-operation on the right hand side('RHS' and 'getRValue<T>()')}} +#pragma acc atomic capture + LHS = LHS = RHS + getRValue<T>(); + + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{left hand side of assignment operation('LHS') must match one side of the sub-operation on the right hand side('getRValue<T>()' and 'RHS')}} +#pragma acc atomic capture + LHS = LHS = getRValue<T>() + RHS; +} +template<typename T> +void AtomicCaptureTemplateSimple2(T LHS, T RHS) { + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{expected assignment expression}} +#pragma acc atomic capture + LHS++; + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{expected assignment expression}} +#pragma acc atomic capture +--LHS; + + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{expected assignment expression}} +#pragma acc atomic capture + LHS += 1 + RHS; + + // expected-error@+2{{statement associated with OpenACC 'atomic 
capture' directive is invalid}} + // expected-note@+2{{expected assignment, compound assignment, increment, or decrement expression}} +#pragma acc atomic capture + LHS = RHS; + + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{left operand to assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic capture + LHS = RHS++; + + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{left operand to assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic capture + LHS = RHS--; + + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{left operand to assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic capture + LHS = ++RHS; + + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{left operand to assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic capture + LHS = --RHS; + + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{unary operator not supported, only increment and decrement operations permitted}} +#pragma acc atomic capture + LHS = +RHS; + + + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{left operand to assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic capture + LHS = RHS += 1 + RHS; + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{left operand to assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic capture + LHS = RHS *= 1 + RHS; + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{left operand to assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic capture + LHS = RHS -= 1 + RHS; + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{left operand to assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic capture + LHS = RHS /= 1 + RHS; + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{left operand to assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic capture + LHS = RHS &= 1 + RHS; + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{left operand to assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic capture + LHS = RHS ^= 1 + RHS; + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{left operand to assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic capture + LHS = RHS >>= 1 + RHS; + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{left operand to assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic capture + LHS = RHS |= 1 + RHS; + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{left operand to 
assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic capture + LHS = RHS <<= 1 + RHS; + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{left operand to assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic capture + LHS = RHS >>= 1 + RHS; + + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{left operand to assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic capture + LHS = RHS ^= 1 + RHS; + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{compound binary operator not supported, only +=, *=, -=, /=, &=, ^=, |=, <<=, or >>= are permitted}} +#pragma acc atomic capture + LHS = RHS <= 1 + RHS; + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{compound binary operator not supported, only +=, *=, -=, /=, &=, ^=, |=, <<=, or >>= are permitted}} +#pragma acc atomic capture + LHS = RHS >= 1 + RHS; + + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{compound binary operator not supported, only +=, *=, -=, /=, &=, ^=, |=, <<=, or >>= are permitted}} +#pragma acc atomic capture + LHS = RHS + 1; + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{compound binary operator not supported, only +=, *=, -=, /=, &=, ^=, |=, <<=, or >>= are permitted}} +#pragma acc atomic capture + LHS = RHS < 1 + RHS; + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{compound binary operator not supported, only +=, *=, -=, /=, &=, ^=, |=, <<=, or >>= are permitted}} +#pragma acc atomic capture + LHS = RHS > 1 + RHS; + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{compound binary operator not supported, only +=, *=, -=, /=, &=, ^=, |=, <<=, or >>= are permitted}} +#pragma acc atomic capture + LHS = RHS ^ 1 + RHS; + + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{left operand to assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic capture + LHS = RHS = RHS + 1; + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{left operand to assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic capture + LHS = RHS = 1 + RHS; + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{left operand to assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic capture + LHS = RHS = RHS * 1; + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{left operand to assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic capture + LHS = RHS = 1 * RHS; + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{left operand to assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic capture + LHS = RHS = RHS / 1; + // expected-error@+2{{statement associated 
with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{left operand to assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic capture + LHS = RHS = 1 / RHS; + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{left operand to assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic capture + LHS = RHS = RHS ^ 1; + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{binary operator not supported, only +, *, -, /, &, ^, |, <<, or >> are permitted}} +#pragma acc atomic capture + LHS = RHS = 1 % RHS; + + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{binary operator not supported, only +, *, -, /, &, ^, |, <<, or >> are permitted}} +#pragma acc atomic capture + LHS = RHS = RHS < 1; + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{binary operator not supported, only +, *, -, /, &, ^, |, <<, or >> are permitted}} +#pragma acc atomic capture + LHS = RHS = 1 > RHS; + + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{left hand side of assignment operation('LHS') must match one side of the sub-operation on the right hand side('RHS' and 'getRValue<T>()')}} +#pragma acc atomic capture + LHS = LHS = RHS + getRValue<T>(); + + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{left hand side of assignment operation('LHS') must match one side of the sub-operation on the right hand side('getRValue<T>()' and 'RHS')}} +#pragma acc atomic capture + LHS = LHS = getRValue<T>() + RHS; +} + +void AtomicCaptureSimple(int LHS, int RHS) { + AtomicCaptureTemplateSimple(1, 2); + AtomicCaptureTemplateSimple2(S1, S2); //expected-note{{in instantiation of function template specialization}} + + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{expected assignment expression}} +#pragma acc atomic capture + LHS++; + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{expected assignment expression}} +#pragma acc atomic capture +--LHS; + + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{expected assignment expression}} +#pragma acc atomic capture + LHS += 1 + RHS; + + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{expected assignment, compound assignment, increment, or decrement expression}} +#pragma acc atomic capture + LHS = RHS; + +#pragma acc atomic capture + LHS = RHS++; + +#pragma acc atomic capture + LHS = RHS--; + + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{left operand to assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic capture + S1 = ++S2; + + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{left operand to assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic capture + S1 = --S2 ; + + // expected-error@+2{{statement associated with OpenACC 'atomic 
capture' directive is invalid}} + // expected-note@+2{{unary operator not supported, only increment and decrement operations permitted}} +#pragma acc atomic capture + LHS = +RHS; + +#pragma acc atomic capture + LHS = RHS += 1 + RHS; +#pragma acc atomic capture + LHS = RHS *= 1 + RHS; + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{left operand to assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic capture + S1 = RHS -= 1 + RHS; + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{expected assignment, compound assignment, increment, or decrement expression}} +#pragma acc atomic capture + LHS = S1 /= 1 + RHS; +#pragma acc atomic capture + LHS = RHS &= 1 + S2; + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{expected assignment, compound assignment, increment, or decrement expression}} +#pragma acc atomic capture + LHS = S1^= 1 + S2; + +#pragma acc atomic capture + LHS = RHS ^= 1 + RHS; + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{compound binary operator not supported, only +=, *=, -=, /=, &=, ^=, |=, <<=, or >>= are permitted}} +#pragma acc atomic capture + LHS = RHS <= 1 + RHS; + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{left operand to assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic capture + S1 = RHS ^= 1 + RHS; + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{compound binary operator not supported, only +=, *=, -=, /=, &=, ^=, |=, <<=, or >>= are permitted}} +#pragma acc atomic capture + LHS = S1 <= 1 + RHS; + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{compound binary operator not supported, only +=, *=, -=, /=, &=, ^=, |=, <<=, or >>= are permitted}} +#pragma acc atomic capture + LHS = RHS <= 1 + S2; + +#pragma acc atomic capture + LHS = RHS = RHS + 1; +#pragma acc atomic capture + LHS = RHS = 1 + RHS; + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{left operand to assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic capture + S1 = RHS = RHS * 1; + // A little weird, because this contains an 'operator int' call here rather + than a conversion, so the diagnostic could be better.
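The simple 'capture' form pairs an update of x with saving its old or new value into v, so 'v = x++;' is really one fetch-and-modify whose result is captured. Sketch under the same (illustrative) std::atomic_ref analogy:

    #include <atomic>

    // Sketch: '#pragma acc atomic capture; v = x++;' is one atomic RMW whose
    // pre-increment value is captured into v.
    void atomic_capture_model(int &v, int &x) {
      std::atomic_ref<int> xref(x);
      v = xref.fetch_add(1);
    }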
+ // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{expected assignment, compound assignment, increment, or decrement expression}} +#pragma acc atomic capture + LHS = S2 = 1 * S2; + + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{binary operator not supported, only +, *, -, /, &, ^, |, <<, or >> are permitted}} +#pragma acc atomic capture + LHS = RHS = RHS < 1; + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{binary operator not supported, only +, *, -, /, &, ^, |, <<, or >> are permitted}} +#pragma acc atomic capture + LHS = RHS = 1 > RHS; + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{left operand to assignment expression must be of scalar type (was 'Struct')}} +#pragma acc atomic capture + S1 = RHS = RHS < 1; + + // A little weird, because this contains an 'operator int' call here rather + than a conversion, so the diagnostic could be better. + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{expected assignment, compound assignment, increment, or decrement expression}} +#pragma acc atomic capture + LHS = S1 = 1 > S1; + + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{left hand side of assignment operation('LHS') must match one side of the sub-operation on the right hand side('RHS' and 'getRValue<int>()')}} +#pragma acc atomic capture + LHS = LHS = RHS + getRValue<int>(); + + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{left hand side of assignment operation('LHS') must match one side of the sub-operation on the right hand side('getRValue<int>()' and 'RHS')}} +#pragma acc atomic capture + LHS = LHS = getRValue<int>() + RHS; +} + +template<typename T> +void AtomicCaptureTemplateCompound(T LHS, T RHS) { + + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{expected assignment, compound assignment, increment, or decrement expression}} +#pragma acc atomic capture + { + } + + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+4{{expected assignment, compound assignment, increment, or decrement expression}} +#pragma acc atomic capture + { + LHS = RHS; + } + + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+3{{'atomic capture' with a compound statement only supports two statements}} +#pragma acc atomic capture + { + LHS = RHS; RHS += 1; LHS=RHS; + } + + +#pragma acc atomic capture + { + LHS++; + RHS = LHS; + } + +#pragma acc atomic capture + { + ++LHS; + RHS = LHS; + } + +#pragma acc atomic capture + { + --LHS; + RHS = LHS; + } + + +#pragma acc atomic capture + { + LHS--; + // expected-error@-3{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{variable on right hand side of assignment('RHS') must match variable used in unary expression('LHS') from the first statement}} + LHS = RHS; + } + +#pragma acc atomic capture + { + // expected-error@-2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{unary operator
not supported, only increment and decrement operations permitted}} + -LHS; + RHS = LHS; + } + +#pragma acc atomic capture + { + --LHS; + // expected-error@-3{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{expected assignment expression}} + RHS += LHS; + } + + // { x binop = expr; v = x; } +#pragma acc atomic capture + { + LHS += 1; + RHS = LHS; + } +#pragma acc atomic capture + { + LHS *= 1; + // expected-error@-3{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{variable on right hand side of assignment('RHS') must match variable used on left hand side of compound assignment('LHS') from the first statement}} + LHS = RHS; + } +#pragma acc atomic capture + { + LHS /= 1; + // expected-error@-3{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{expected assignment expression}} + RHS += LHS; + } + + // { x = x binop expr; v = x; } +#pragma acc atomic capture + { + LHS = LHS + 1; + RHS = LHS; + } + +#pragma acc atomic capture + { + // expected-error@-2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{left hand side of assignment operation('LHS') must match one side of the sub-operation on the right hand side('RHS' and '1')}} + LHS = RHS - 1; + RHS = LHS; + } +#pragma acc atomic capture + { + LHS = LHS * 1; + // expected-error@-3{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{variable on right hand side of assignment('RHS') must match variable used on left hand side of assignment('LHS') from the first statement}} + RHS = RHS; + } +#pragma acc atomic capture + { + LHS = LHS / 1; + // expected-error@-3{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{expected assignment expression}} + RHS += LHS; + } + + // { x = expr binop x; v = x; } +#pragma acc atomic capture + { + LHS = 1 ^ LHS; + RHS = LHS; + } + +#pragma acc atomic capture + { + // expected-error@-2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{left hand side of assignment operation('LHS') must match one side of the sub-operation on the right hand side('1' and 'RHS')}} + LHS = 1 & RHS; + RHS = LHS; + } +#pragma acc atomic capture + { + LHS = LHS | 1; + // expected-error@-3{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{variable on right hand side of assignment('RHS') must match variable used on left hand side of assignment('LHS') from the first statement}} + RHS = RHS; + } +#pragma acc atomic capture + { + LHS = LHS << 1; + // expected-error@-3{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{expected assignment expression}} + RHS += LHS; + } + + // { v = x; x binop = expr; } +#pragma acc atomic capture + { + LHS = RHS; + RHS += 1; + } + + // { v = x; x = x binop expr; } +#pragma acc atomic capture + { + LHS = RHS; + RHS = RHS / 1; + } +#pragma acc atomic capture + { + LHS = RHS; + // expected-error@-3{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{left hand side of assignment operation('RHS') must match one side of the sub-operation on the right hand side('LHS' and '1')}} + RHS = LHS ^ 1; + } +#pragma acc atomic capture + { + LHS = RHS; + // expected-error@-3{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // 
expected-note@+1{{left hand side of assignment operation('LHS') must match one side of the sub-operation on the right hand side('RHS' and '1')}} + LHS = RHS << 1; + } + // { v = x; x = expr binop x; } +#pragma acc atomic capture + { + LHS = RHS; + RHS = 1 / RHS; + } +#pragma acc atomic capture + { + LHS = RHS; + // expected-error@-3{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{left hand side of assignment operation('RHS') must match one side of the sub-operation on the right hand side('1' and 'LHS')}} + RHS = 1 ^ LHS; + } +#pragma acc atomic capture + { + LHS = RHS; + // expected-error@-3{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{left hand side of assignment operation('LHS') must match one side of the sub-operation on the right hand side('1' and 'RHS')}} + LHS = 1 << RHS; + } + + // { v = x; x = expr; } +#pragma acc atomic capture + { + LHS = RHS; + RHS = 1; + } +#pragma acc atomic capture + { + LHS = RHS; + // expected-error@-3{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{variable on left hand side of assignment('LHS') must match variable used on right hand side of assignment('RHS') from the first statement}} + LHS = 1; + } + + // { v = x; x++; } + // { v = x; ++x; } + // { v = x; x--; } + // { v = x; --x; } +#pragma acc atomic capture + { + LHS = RHS; + RHS++; + } +#pragma acc atomic capture + { + LHS = RHS; + RHS--; + } +#pragma acc atomic capture + { + LHS = RHS; + ++RHS; + } +#pragma acc atomic capture + { + LHS = RHS; + --RHS; + } +#pragma acc atomic capture + { + LHS = RHS; + // expected-error@-3{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{unary operator not supported, only increment and decrement operations permitted}} + -RHS; + } +#pragma acc atomic capture + { + LHS = RHS; + // expected-error@-3{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{variable in unary expression('LHS') must match variable used on right hand side of assignment('RHS') from the first statement}} + LHS++; + } +} + +template<typename T> +void AtomicCaptureTemplateCompound2(T LHS, T RHS) { + + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{expected assignment, compound assignment, increment, or decrement expression}} +#pragma acc atomic capture + { + } + + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+4{{expected assignment, compound assignment, increment, or decrement expression}} +#pragma acc atomic capture + { + LHS = RHS; + } + + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+3{{'atomic capture' with a compound statement only supports two statements}} +#pragma acc atomic capture + { + LHS = RHS; RHS += 1; LHS=RHS; + } + + +#pragma acc atomic capture + { + // expected-error@-2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{operand to increment expression must be of scalar type (was 'Struct')}} + LHS++; + RHS = LHS; + } + +#pragma acc atomic capture + { + // expected-error@-2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{operand to increment expression must be of scalar type (was 'Struct')}} + ++LHS; + RHS = LHS; + } + +#pragma acc atomic 
capture + { + // expected-error@-2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{operand to decrement expression must be of scalar type (was 'Struct')}} + --LHS; + RHS = LHS; + } + + +#pragma acc atomic capture + { + LHS--; + // expected-error@-3{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{variable on right hand side of assignment('RHS') must match variable used in unary expression('LHS') from the first statement}} + LHS = RHS; + } + +#pragma acc atomic capture + { + // expected-error@-2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{unary operator not supported, only increment and decrement operations permitted}} + -LHS; + RHS = LHS; + } + +#pragma acc atomic capture + { + --LHS; + // expected-error@-3{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{expected assignment expression}} + RHS += LHS; + } + + // { x binop = expr; v = x; } +#pragma acc atomic capture + { + // expected-error@-2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{left operand to compound assignment expression must be of scalar type (was 'Struct')}} + LHS += 1; + RHS = LHS; + } +#pragma acc atomic capture + { + LHS *= 1; + // expected-error@-3{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{variable on right hand side of assignment('RHS') must match variable used on left hand side of compound assignment('LHS') from the first statement}} + LHS = RHS; + } +#pragma acc atomic capture + { + LHS /= 1; + // expected-error@-3{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{expected assignment expression}} + RHS += LHS; + } + + // { x = x binop expr; v = x; } +#pragma acc atomic capture + { + // expected-error@-2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{left operand to assignment expression must be of scalar type (was 'Struct')}} + LHS = LHS + 1; + RHS = LHS; + } + +#pragma acc atomic capture + { + // expected-error@-2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{left hand side of assignment operation('LHS') must match one side of the sub-operation on the right hand side('RHS' and '1')}} + LHS = RHS - 1; + RHS = LHS; + } +#pragma acc atomic capture + { + LHS = LHS * 1; + // expected-error@-3{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{variable on right hand side of assignment('RHS') must match variable used on left hand side of assignment('LHS') from the first statement}} + RHS = RHS; + } +#pragma acc atomic capture + { + LHS = LHS / 1; + // expected-error@-3{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{expected assignment expression}} + RHS += LHS; + } + + // { x = expr binop x; v = x; } +#pragma acc atomic capture + { + // expected-error@-2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{left operand to assignment expression must be of scalar type (was 'Struct')}} + LHS = 1 ^ LHS; + RHS = LHS; + } + +#pragma acc atomic capture + { + // expected-error@-2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{left hand side of assignment operation('LHS') must match one 
side of the sub-operation on the right hand side('1' and 'RHS')}} + LHS = 1 & RHS; + RHS = LHS; + } +#pragma acc atomic capture + { + LHS = LHS | 1; + // expected-error@-3{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{variable on right hand side of assignment('RHS') must match variable used on left hand side of assignment('LHS') from the first statement}} + RHS = RHS; + } +#pragma acc atomic capture + { + LHS = LHS << 1; + // expected-error@-3{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{expected assignment expression}} + RHS += LHS; + } + + // { v = x; x binop = expr; } +#pragma acc atomic capture + { + // expected-error@-2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{left operand to assignment expression must be of scalar type (was 'Struct')}} + LHS = RHS; + RHS += 1; + } + + // { v = x; x = x binop expr; } +#pragma acc atomic capture + { + // expected-error@-2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{left operand to assignment expression must be of scalar type (was 'Struct')}} + LHS = RHS; + RHS = RHS / 1; + } +#pragma acc atomic capture + { + LHS = RHS; + // expected-error@-3{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{left hand side of assignment operation('RHS') must match one side of the sub-operation on the right hand side('LHS' and '1')}} + RHS = LHS ^ 1; + } +#pragma acc atomic capture + { + LHS = RHS; + // expected-error@-3{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{left hand side of assignment operation('LHS') must match one side of the sub-operation on the right hand side('RHS' and '1')}} + LHS = RHS << 1; + } + // { v = x; x = expr binop x; } +#pragma acc atomic capture + { + // expected-error@-2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{left operand to assignment expression must be of scalar type (was 'Struct')}} + LHS = RHS; + RHS = 1 / RHS; + } +#pragma acc atomic capture + { + LHS = RHS; + // expected-error@-3{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{left hand side of assignment operation('RHS') must match one side of the sub-operation on the right hand side('1' and 'LHS')}} + RHS = 1 ^ LHS; + } +#pragma acc atomic capture + { + LHS = RHS; + // expected-error@-3{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{left hand side of assignment operation('LHS') must match one side of the sub-operation on the right hand side('1' and 'RHS')}} + LHS = 1 << RHS; + } + + // { v = x; x = expr; } +#pragma acc atomic capture + { + // expected-error@-2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{left operand to assignment expression must be of scalar type (was 'Struct')}} + LHS = RHS; + RHS = 1; + } +#pragma acc atomic capture + { + LHS = RHS; + // expected-error@-3{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{variable on left hand side of assignment('LHS') must match variable used on right hand side of assignment('RHS') from the first statement}} + LHS = 1; + } + + // { v = x; x++; } + // { v = x; ++x; } + // { v = x; x--; } + // { v = x; --x; } +#pragma acc atomic capture + { + // 
expected-error@-2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{left operand to assignment expression must be of scalar type (was 'Struct')}} + LHS = RHS; + RHS++; + } +#pragma acc atomic capture + { + // expected-error@-2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{left operand to assignment expression must be of scalar type (was 'Struct')}} + LHS = RHS; + RHS--; + } +#pragma acc atomic capture + { + // expected-error@-2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{left operand to assignment expression must be of scalar type (was 'Struct')}} + LHS = RHS; + ++RHS; + } +#pragma acc atomic capture + { + // expected-error@-2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{left operand to assignment expression must be of scalar type (was 'Struct')}} + LHS = RHS; + --RHS; + } +#pragma acc atomic capture + { + LHS = RHS; + // expected-error@-3{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{unary operator not supported, only increment and decrement operations permitted}} + -RHS; + } +#pragma acc atomic capture + { + LHS = RHS; + // expected-error@-3{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{variable in unary expression('LHS') must match variable used on right hand side of assignment('RHS') from the first statement}} + LHS++; + } +} +void AtomicCaptureCompound(int LHS, int RHS) { + AtomicCaptureTemplateCompound(1, 2); + AtomicCaptureTemplateCompound2(S1, S2); //expected-note{{in instantiation of function template specialization}} + + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+2{{expected assignment, compound assignment, increment, or decrement expression}} +#pragma acc atomic capture + { + } + + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+4{{expected assignment, compound assignment, increment, or decrement expression}} +#pragma acc atomic capture + { + LHS = RHS; + } + + // expected-error@+2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+3{{'atomic capture' with a compound statement only supports two statements}} +#pragma acc atomic capture + { + LHS = RHS; RHS += 1; LHS=RHS; + } + + +#pragma acc atomic capture + { + LHS++; + RHS = LHS; + } +#pragma acc atomic capture + { + // expected-error@-2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{operand to increment expression must be of scalar type (was 'Struct')}} + S1++; + S2= S1; + } + +#pragma acc atomic capture + { + ++LHS; + RHS = LHS; + } + +#pragma acc atomic capture + { + --LHS; + RHS = LHS; + } + + +#pragma acc atomic capture + { + LHS--; + // expected-error@-3{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{variable on right hand side of assignment('RHS') must match variable used in unary expression('LHS') from the first statement}} + LHS = RHS; + } + +#pragma acc atomic capture + { + // expected-error@-2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{unary operator not supported, only increment and decrement operations permitted}} + -LHS; + RHS = LHS; + } + +#pragma acc atomic capture + { + 
--LHS; + // expected-error@-3{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{expected assignment expression}} + RHS += LHS; + } + + // { x binop = expr; v = x; } +#pragma acc atomic capture + { + LHS += 1; + RHS = LHS; + } +#pragma acc atomic capture + { + // expected-error@-2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{left operand to compound assignment expression must be of scalar type (was 'Struct')}} + S1 += 1; + S2= S1; + } +#pragma acc atomic capture + { + LHS *= 1; + // expected-error@-3{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{variable on right hand side of assignment('RHS') must match variable used on left hand side of compound assignment('LHS') from the first statement}} + LHS = RHS; + } +#pragma acc atomic capture + { + LHS /= 1; + // expected-error@-3{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{expected assignment expression}} + RHS += LHS; + } + + // { x = x binop expr; v = x; } +#pragma acc atomic capture + { + LHS = LHS + 1; + RHS = LHS; + } +#pragma acc atomic capture + { + // expected-error@-2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{left operand to assignment expression must be of scalar type (was 'Struct')}} + S1 = S1 + 1; + S2= S1; + } + +#pragma acc atomic capture + { + // expected-error@-2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{left hand side of assignment operation('LHS') must match one side of the sub-operation on the right hand side('RHS' and '1')}} + LHS = RHS - 1; + RHS = LHS; + } +#pragma acc atomic capture + { + LHS = LHS * 1; + // expected-error@-3{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{variable on right hand side of assignment('RHS') must match variable used on left hand side of assignment('LHS') from the first statement}} + RHS = RHS; + } +#pragma acc atomic capture + { + LHS = LHS / 1; + // expected-error@-3{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{expected assignment expression}} + RHS += LHS; + } + + // { x = expr binop x; v = x; } +#pragma acc atomic capture + { + LHS = 1 ^ LHS; + RHS = LHS; + } +#pragma acc atomic capture + { + // expected-error@-2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{left operand to assignment expression must be of scalar type (was 'Struct')}} + S1 = 1 ^ S1; + S2 = S1; + } + +#pragma acc atomic capture + { + // expected-error@-2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{left hand side of assignment operation('LHS') must match one side of the sub-operation on the right hand side('1' and 'RHS')}} + LHS = 1 & RHS; + RHS = LHS; + } +#pragma acc atomic capture + { + LHS = LHS | 1; + // expected-error@-3{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{variable on right hand side of assignment('RHS') must match variable used on left hand side of assignment('LHS') from the first statement}} + RHS = RHS; + } +#pragma acc atomic capture + { + LHS = LHS << 1; + // expected-error@-3{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{expected assignment expression}} + RHS += LHS; + } + + // { v = 
x; x binop = expr; } +#pragma acc atomic capture + { + LHS = RHS; + RHS += 1; + } + +#pragma acc atomic capture + { + // expected-error@-2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{left operand to assignment expression must be of scalar type (was 'Struct')}} + S1 = S2; + S2 += 1; + } + + // { v = x; x = x binop expr; } +#pragma acc atomic capture + { + LHS = RHS; + RHS = RHS / 1; + } +#pragma acc atomic capture + { + LHS = RHS; + // expected-error@-3{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{left hand side of assignment operation('RHS') must match one side of the sub-operation on the right hand side('LHS' and '1')}} + RHS = LHS ^ 1; + } +#pragma acc atomic capture + { + LHS = RHS; + // expected-error@-3{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{left hand side of assignment operation('LHS') must match one side of the sub-operation on the right hand side('RHS' and '1')}} + LHS = RHS << 1; + } + // { v = x; x = expr binop x; } +#pragma acc atomic capture + { + LHS = RHS; + RHS = 1 / RHS; + } +#pragma acc atomic capture + { + LHS = RHS; + // expected-error@-3{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{left hand side of assignment operation('RHS') must match one side of the sub-operation on the right hand side('1' and 'LHS')}} + RHS = 1 ^ LHS; + } +#pragma acc atomic capture + { + LHS = RHS; + // expected-error@-3{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{left hand side of assignment operation('LHS') must match one side of the sub-operation on the right hand side('1' and 'RHS')}} + LHS = 1 << RHS; + } + + // { v = x; x = expr; } +#pragma acc atomic capture + { + LHS = RHS; + RHS = 1; + } +#pragma acc atomic capture + { + LHS = RHS; + // expected-error@-3{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{variable on left hand side of assignment('LHS') must match variable used on right hand side of assignment('RHS') from the first statement}} + LHS = 1; + } + + // { v = x; x++; } + // { v = x; ++x; } + // { v = x; x--; } + // { v = x; --x; } +#pragma acc atomic capture + { + LHS = RHS; + RHS++; + } +#pragma acc atomic capture + { + LHS = RHS; + RHS--; + } +#pragma acc atomic capture + { + LHS = RHS; + ++RHS; + } +#pragma acc atomic capture + { + LHS = RHS; + --RHS; + } +#pragma acc atomic capture + { + // expected-error@-2{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{left operand to assignment expression must be of scalar type (was 'Struct')}} + S1= S2; + --S2; + } +#pragma acc atomic capture + { + LHS = RHS; + // expected-error@-3{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{unary operator not supported, only increment and decrement operations permitted}} + -RHS; + } +#pragma acc atomic capture + { + LHS = RHS; + // expected-error@-3{{statement associated with OpenACC 'atomic capture' directive is invalid}} + // expected-note@+1{{variable in unary expression('LHS') must match variable used on right hand side of assignment('RHS') from the first statement}} + LHS++; + } +} diff --git clang/test/SemaTemplate/GH55509.cpp clang/test/SemaTemplate/GH55509.cpp deleted file mode 100644 index f95833fbed7b..000000000000 --- clang/test/SemaTemplate/GH55509.cpp +++ /dev/null @@ 
-1,101 +0,0 @@ -// RUN: %clang_cc1 -fsyntax-only -verify -std=c++26 %s - -namespace t1 { - template<int N> struct A { - template<class C> friend auto cica(const A<N-1>&, C) { - return N; - } - }; - - template<> struct A<0> { - template<class C> friend auto cica(const A<0>&, C); - // expected-note@-1 {{declared here}} - }; - - void test() { - cica(A<0>{}, 0); - // expected-error@-1 {{function 'cica<int>' with deduced return type cannot be used before it is defined}} - - (void)A<1>{}; - cica(A<0>{}, 0); - } -} // namespace t1 -namespace t2 { - template<int N> struct A { - template<class C> friend auto cica(const A<N-1>&, C) { - return N; - } - }; - - template<> struct A<0> { - template<class C> friend auto cica(const A<0>&, C); - }; - - template <int N, class = decltype(cica(A<N>{}, nullptr))> - void MakeCica(); - // expected-note@-1 {{candidate function}} - - template <int N> void MakeCica(A<N+1> = {}); - // expected-note@-1 {{candidate function}} - - void test() { - MakeCica<0>(); - - MakeCica<0>(); - // expected-error@-1 {{call to 'MakeCica' is ambiguous}} - } -} // namespace t2 -namespace t3 { - template<int N> struct A { - template<class C> friend auto cica(const A<N-1>&, C) { - return N-1; - } - }; - - template<> struct A<0> { - template<class C> friend auto cica(const A<0>&, C); - }; - - template <int N, class AT, class = decltype(cica(AT{}, nullptr))> - static constexpr bool MakeCica(int); - - template <int N, class AT> - static constexpr bool MakeCica(short, A<N+1> = {}); - - template <int N, class AT = A<N>, class Val = decltype(MakeCica<N, AT>(0))> - static constexpr bool has_cica = Val{}; - - constexpr bool cica2 = has_cica<0> || has_cica<0>; -} // namespace t3 -namespace t4 { - template<int N> struct A { - template<class C> friend auto cica(const A<N-1>&, C); - }; - - template<> struct A<0> { - template<class C> friend auto cica(const A<0>&, C) { - C a; - } - }; - - template struct A<1>; - - void test() { - cica(A<0>{}, 0); - } -} // namespace t4 -namespace regression1 { - template <class> class A; - - template <class T> [[gnu::abi_tag("TAG")]] void foo(A<T>); - - template <class> struct A { - friend void foo <>(A); - }; - - template struct A<int>; - - template <class T> [[gnu::abi_tag("TAG")]] void foo(A<T>) {} - - template void foo<int>(A<int>); -} // namespace regression1 diff --git clang/tools/libclang/CIndex.cpp clang/tools/libclang/CIndex.cpp index 42f095fea2db..697cc4776839 100644 --- clang/tools/libclang/CIndex.cpp +++ clang/tools/libclang/CIndex.cpp @@ -2193,6 +2193,8 @@ public: void VisitOpenACCInitConstruct(const OpenACCInitConstruct *D); void VisitOpenACCShutdownConstruct(const OpenACCShutdownConstruct *D); void VisitOpenACCSetConstruct(const OpenACCSetConstruct *D); + void VisitOpenACCUpdateConstruct(const OpenACCUpdateConstruct *D); + void VisitOpenACCAtomicConstruct(const OpenACCAtomicConstruct *D); void VisitOMPExecutableDirective(const OMPExecutableDirective *D); void VisitOMPLoopBasedDirective(const OMPLoopBasedDirective *D); void VisitOMPLoopDirective(const OMPLoopDirective *D); @@ -3682,6 +3684,18 @@ void EnqueueVisitor::VisitOpenACCSetConstruct(const OpenACCSetConstruct *C) { EnqueueChildren(Clause); } +void EnqueueVisitor::VisitOpenACCUpdateConstruct( + const OpenACCUpdateConstruct *C) { + EnqueueChildren(C); + for (auto *Clause : C->clauses()) + EnqueueChildren(Clause); +} + +void EnqueueVisitor::VisitOpenACCAtomicConstruct( + const OpenACCAtomicConstruct *C) { + EnqueueChildren(C); +} + void EnqueueVisitor::VisitAnnotateAttr(const AnnotateAttr *A) { 
EnqueueChildren(A); } @@ -6454,6 +6468,8 @@ CXString clang_getCursorKindSpelling(enum CXCursorKind Kind) { return cxstring::createRef("OpenACCSetConstruct"); case CXCursor_OpenACCUpdateConstruct: return cxstring::createRef("OpenACCUpdateConstruct"); + case CXCursor_OpenACCAtomicConstruct: + return cxstring::createRef("OpenACCAtomicConstruct"); } llvm_unreachable("Unhandled CXCursorKind"); @@ -7415,11 +7431,11 @@ unsigned clang_getNumOverloadedDecls(CXCursor C) { return 0; OverloadedDeclRefStorage Storage = getCursorOverloadedDeclRef(C).first; - if (const OverloadExpr *E = Storage.dyn_cast<const OverloadExpr *>()) + if (const OverloadExpr *E = dyn_cast<const OverloadExpr *>(Storage)) return E->getNumDecls(); if (OverloadedTemplateStorage *S = - Storage.dyn_cast<OverloadedTemplateStorage *>()) + dyn_cast<OverloadedTemplateStorage *>(Storage)) return S->size(); const Decl *D = cast<const Decl *>(Storage); @@ -7438,11 +7454,11 @@ CXCursor clang_getOverloadedDecl(CXCursor cursor, unsigned index) { CXTranslationUnit TU = getCursorTU(cursor); OverloadedDeclRefStorage Storage = getCursorOverloadedDeclRef(cursor).first; - if (const OverloadExpr *E = Storage.dyn_cast<const OverloadExpr *>()) + if (const OverloadExpr *E = dyn_cast<const OverloadExpr *>(Storage)) return MakeCXCursor(E->decls_begin()[index], TU); if (OverloadedTemplateStorage *S = - Storage.dyn_cast<OverloadedTemplateStorage *>()) + dyn_cast<OverloadedTemplateStorage *>(Storage)) return MakeCXCursor(S->begin()[index], TU); const Decl *D = cast<const Decl *>(Storage); diff --git clang/tools/libclang/CXCursor.cpp clang/tools/libclang/CXCursor.cpp index 60c740311e94..127f22bc5bdc 100644 --- clang/tools/libclang/CXCursor.cpp +++ clang/tools/libclang/CXCursor.cpp @@ -920,6 +920,9 @@ CXCursor cxcursor::MakeCXCursor(const Stmt *S, const Decl *Parent, case Stmt::OpenACCUpdateConstructClass: K = CXCursor_OpenACCUpdateConstruct; break; + case Stmt::OpenACCAtomicConstructClass: + K = CXCursor_OpenACCAtomicConstruct; + break; case Stmt::OMPTargetParallelGenericLoopDirectiveClass: K = CXCursor_OMPTargetParallelGenericLoopDirective; break; diff --git clang/unittests/AST/ASTImporterTest.cpp clang/unittests/AST/ASTImporterTest.cpp index 114d0b461dae..0bac95eb40b2 100644 --- clang/unittests/AST/ASTImporterTest.cpp +++ clang/unittests/AST/ASTImporterTest.cpp @@ -8193,6 +8193,29 @@ TEST_P(ImportFunctions, CTADAliasTemplate) { EXPECT_TRUE(ToD->getSourceDeductionGuide()); } +TEST_P(ImportFunctions, CTADAliasTemplateWithExplicitSourceDeductionGuide) { + Decl *TU = getTuDecl( + R"( + template <typename T> struct A { + A(T); + }; + template<typename T> + using B = A<T>; + A(int) -> A<double>; // explicit + B b{(int)0}; + )", + Lang_CXX20, "input.cc"); + auto *FromD = FirstDeclMatcher<CXXDeductionGuideDecl>().match( + TU, cxxDeductionGuideDecl(hasParameter(0, hasType(asString("int"))), + hasName("<deduction guide for B>"), + hasReturnTypeLoc(loc(asString("A<double>"))))); + auto *ToD = Import(FromD, Lang_CXX20); + ASSERT_TRUE(ToD); + EXPECT_TRUE(ToD->getSourceDeductionGuideKind() == + CXXDeductionGuideDecl::SourceDeductionGuideKind::Alias); + EXPECT_TRUE(ToD->getSourceDeductionGuide()); +} + TEST_P(ImportFunctions, ParmVarDeclDeclContext) { constexpr auto FromTUCode = R"( void f(int P); diff --git clang/unittests/Analysis/FlowSensitive/SmartPointerAccessorCachingTest.cpp clang/unittests/Analysis/FlowSensitive/SmartPointerAccessorCachingTest.cpp index 3f75dff60ee5..18b9f80e32bb 100644 --- 
clang/unittests/Analysis/FlowSensitive/SmartPointerAccessorCachingTest.cpp +++ clang/unittests/Analysis/FlowSensitive/SmartPointerAccessorCachingTest.cpp @@ -190,5 +190,57 @@ TEST(SmartPointerAccessorCachingTest, MatchesWithValueAndNonConstOverloads) { isSmartPointerLikeValueMethodCall())); } +TEST(SmartPointerAccessorCachingTest, MatchesWithTypeAliases) { + llvm::StringRef Decls(R"cc( + template <class T> + struct HasGetAndValue { + using pointer_t = T*; + using reference_t = T&; + + const pointer_t operator->() const; + pointer_t operator->(); + const reference_t operator*() const; + reference_t operator*(); + const reference_t value() const; + reference_t value(); + const pointer_t get() const; + pointer_t get(); + }; + + struct S { int i; }; + )cc"); + + EXPECT_TRUE(matches( + Decls, + "int target(HasGetAndValue<S> &NonConst) { return (*NonConst).i; }", + isSmartPointerLikeOperatorStar())); + EXPECT_TRUE(matches( + Decls, + "int target(const HasGetAndValue<S> &Const) { return (*Const).i; }", + isSmartPointerLikeOperatorStar())); + EXPECT_TRUE(matches( + Decls, "int target(HasGetAndValue<S> &NonConst) { return NonConst->i; }", + isSmartPointerLikeOperatorArrow())); + EXPECT_TRUE(matches( + Decls, "int target(const HasGetAndValue<S> &Const) { return Const->i; }", + isSmartPointerLikeOperatorArrow())); + EXPECT_TRUE(matches( + Decls, + "int target(HasGetAndValue<S> &NonConst) { return NonConst.value().i; }", + isSmartPointerLikeValueMethodCall())); + EXPECT_TRUE(matches( + Decls, + "int target(const HasGetAndValue<S> &Const) { return Const.value().i; }", + isSmartPointerLikeValueMethodCall())); + EXPECT_TRUE(matches( + Decls, + "int target(HasGetAndValue<S> &NonConst) { return NonConst.get()->i; }", + isSmartPointerLikeGetMethodCall())); + EXPECT_TRUE(matches( + Decls, + "int target(const HasGetAndValue<S> &Const) { return Const.get()->i; }", + isSmartPointerLikeGetMethodCall())); +} + } // namespace } // namespace clang::dataflow diff --git clang/unittests/CodeGen/TBAAMetadataTest.cpp clang/unittests/CodeGen/TBAAMetadataTest.cpp index cad8783ea73f..f05c9787c63e 100644 --- clang/unittests/CodeGen/TBAAMetadataTest.cpp +++ clang/unittests/CodeGen/TBAAMetadataTest.cpp @@ -117,15 +117,9 @@ TEST(TBAAMetadataTest, BasicTypes) { ASSERT_TRUE(I); I = matchNext(I, - MInstruction(Instruction::Store, - MValType(PointerType::getUnqual(Compiler.Context)), - MMTuple( - MMTuple( - MMString("p1 void"), - AnyPtr, - MConstInt(0)), - MSameAs(0), - MConstInt(0)))); + MInstruction(Instruction::Store, + MValType(PointerType::getUnqual(Compiler.Context)), + MMTuple(AnyPtr, MSameAs(0), MConstInt(0)))); ASSERT_TRUE(I); I = matchNext(I, diff --git clang/unittests/Format/FormatTest.cpp clang/unittests/Format/FormatTest.cpp index 57f12221cdc7..253e50437c23 100644 --- clang/unittests/Format/FormatTest.cpp +++ clang/unittests/Format/FormatTest.cpp @@ -9614,6 +9614,10 @@ TEST_F(FormatTest, AlignsAfterOpenBracket) { " auto aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\n" " ) {};", Style); + verifyFormat("aaaaaaaaaaaaaaaaaaaaaaaa(\n" + " &bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb\n" + ");", + Style); } TEST_F(FormatTest, ParenthesesAndOperandAlignment) { @@ -26478,6 +26482,9 @@ TEST_F(FormatTest, RequiresClauses) { "foo();\n" "#endif\n" "bar(requires);"); + + verifyNoCrash("template <class T>\n" + " requires(requires { std::declval<T>()"); } TEST_F(FormatTest, RequiresExpressionIndentation) { diff --git clang/unittests/Format/TokenAnnotatorTest.cpp clang/unittests/Format/TokenAnnotatorTest.cpp index fc77e277947c..1b09c4570345 
100644 --- clang/unittests/Format/TokenAnnotatorTest.cpp +++ clang/unittests/Format/TokenAnnotatorTest.cpp @@ -479,11 +479,13 @@ TEST_F(TokenAnnotatorTest, UnderstandsUsesOfPlusAndMinus) { TEST_F(TokenAnnotatorTest, UnderstandsClasses) { auto Tokens = annotate("class C {};"); ASSERT_EQ(Tokens.size(), 6u) << Tokens; + EXPECT_TOKEN(Tokens[1], tok::identifier, TT_ClassHeadName); EXPECT_TOKEN(Tokens[2], tok::l_brace, TT_ClassLBrace); EXPECT_TOKEN(Tokens[3], tok::r_brace, TT_ClassRBrace); Tokens = annotate("const class C {} c;"); ASSERT_EQ(Tokens.size(), 8u) << Tokens; + EXPECT_TOKEN(Tokens[2], tok::identifier, TT_ClassHeadName); EXPECT_TOKEN(Tokens[3], tok::l_brace, TT_ClassLBrace); EXPECT_TOKEN(Tokens[4], tok::r_brace, TT_ClassRBrace); @@ -494,6 +496,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsClasses) { Tokens = annotate("class [[deprecated(\"\")]] C { int i; };"); ASSERT_EQ(Tokens.size(), 17u) << Tokens; + EXPECT_TOKEN(Tokens[9], tok::identifier, TT_ClassHeadName); EXPECT_TOKEN(Tokens[10], tok::l_brace, TT_ClassLBrace); EXPECT_TOKEN(Tokens[14], tok::r_brace, TT_ClassRBrace); } @@ -501,21 +504,25 @@ TEST_F(TokenAnnotatorTest, UnderstandsClasses) { TEST_F(TokenAnnotatorTest, UnderstandsStructs) { auto Tokens = annotate("struct S {};"); ASSERT_EQ(Tokens.size(), 6u) << Tokens; + EXPECT_TOKEN(Tokens[1], tok::identifier, TT_ClassHeadName); EXPECT_TOKEN(Tokens[2], tok::l_brace, TT_StructLBrace); EXPECT_TOKEN(Tokens[3], tok::r_brace, TT_StructRBrace); Tokens = annotate("struct macro(a) S {};"); ASSERT_EQ(Tokens.size(), 10u) << Tokens; + EXPECT_TOKEN(Tokens[5], tok::identifier, TT_ClassHeadName); EXPECT_TOKEN(Tokens[6], tok::l_brace, TT_StructLBrace); EXPECT_TOKEN(Tokens[7], tok::r_brace, TT_StructRBrace); Tokens = annotate("struct EXPORT_MACRO [[nodiscard]] C { int i; };"); ASSERT_EQ(Tokens.size(), 15u) << Tokens; + EXPECT_TOKEN(Tokens[7], tok::identifier, TT_ClassHeadName); EXPECT_TOKEN(Tokens[8], tok::l_brace, TT_StructLBrace); EXPECT_TOKEN(Tokens[12], tok::r_brace, TT_StructRBrace); Tokens = annotate("struct [[deprecated]] [[nodiscard]] C { int i; };"); ASSERT_EQ(Tokens.size(), 19u) << Tokens; + EXPECT_TOKEN(Tokens[11], tok::identifier, TT_ClassHeadName); EXPECT_TOKEN(Tokens[12], tok::l_brace, TT_StructLBrace); EXPECT_TOKEN(Tokens[16], tok::r_brace, TT_StructRBrace); @@ -523,12 +530,14 @@ TEST_F(TokenAnnotatorTest, UnderstandsStructs) { " void f(T &t);\n" "};"); ASSERT_EQ(Tokens.size(), 18u) << Tokens; + EXPECT_TOKEN(Tokens[5], tok::identifier, TT_ClassHeadName); EXPECT_TOKEN(Tokens[6], tok::l_brace, TT_StructLBrace); EXPECT_TOKEN(Tokens[11], tok::amp, TT_PointerOrReference); EXPECT_TOKEN(Tokens[15], tok::r_brace, TT_StructRBrace); Tokens = annotate("template <typename T> struct S<const T[N]> {};"); ASSERT_EQ(Tokens.size(), 18u) << Tokens; + EXPECT_TOKEN(Tokens[6], tok::identifier, TT_ClassHeadName); EXPECT_TOKEN(Tokens[7], tok::less, TT_TemplateOpener); EXPECT_TOKEN(Tokens[10], tok::l_square, TT_ArraySubscriptLSquare); EXPECT_TOKEN(Tokens[13], tok::greater, TT_TemplateCloser); @@ -537,6 +546,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsStructs) { Tokens = annotate("template <typename T> struct S<T const[N]> {};"); ASSERT_EQ(Tokens.size(), 18u) << Tokens; + EXPECT_TOKEN(Tokens[6], tok::identifier, TT_ClassHeadName); EXPECT_TOKEN(Tokens[7], tok::less, TT_TemplateOpener); EXPECT_TOKEN(Tokens[10], tok::l_square, TT_ArraySubscriptLSquare); EXPECT_TOKEN(Tokens[13], tok::greater, TT_TemplateCloser); @@ -547,6 +557,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsStructs) { " void f(T const (&a)[n]);\n" "};"); 
ASSERT_EQ(Tokens.size(), 35u) << Tokens; + EXPECT_TOKEN(Tokens[9], tok::identifier, TT_ClassHeadName); EXPECT_TOKEN(Tokens[10], tok::less, TT_TemplateOpener); EXPECT_TOKEN(Tokens[13], tok::l_square, TT_ArraySubscriptLSquare); EXPECT_TOKEN(Tokens[16], tok::greater, TT_TemplateCloser); @@ -558,12 +569,24 @@ TEST_F(TokenAnnotatorTest, UnderstandsStructs) { Tokens = annotate("template <typename T, enum E e> struct S {};"); ASSERT_EQ(Tokens.size(), 15u) << Tokens; + EXPECT_TOKEN(Tokens[10], tok::identifier, TT_ClassHeadName); EXPECT_TOKEN(Tokens[11], tok::l_brace, TT_StructLBrace); + Tokens = annotate( + "template <> struct __declspec(foo) Op<Bar *> : OpImpl<Bar *> {};"); + ASSERT_EQ(Tokens.size(), 23u) << Tokens; + EXPECT_TOKEN(Tokens[5], tok::l_paren, TT_AttributeLParen); + EXPECT_TOKEN(Tokens[7], tok::r_paren, TT_AttributeRParen); + EXPECT_TOKEN(Tokens[8], tok::identifier, TT_ClassHeadName); + EXPECT_TOKEN(Tokens[13], tok::colon, TT_InheritanceColon); + EXPECT_TOKEN(Tokens[19], tok::l_brace, TT_StructLBrace); + EXPECT_TOKEN(Tokens[20], tok::r_brace, TT_StructRBrace); + constexpr StringRef Code{"struct EXPORT StructName {};"}; Tokens = annotate(Code); ASSERT_EQ(Tokens.size(), 7u) << Tokens; + EXPECT_TOKEN(Tokens[2], tok::identifier, TT_ClassHeadName); EXPECT_TOKEN(Tokens[3], tok::l_brace, TT_StructLBrace); EXPECT_TOKEN(Tokens[4], tok::r_brace, TT_StructRBrace); @@ -572,6 +595,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsStructs) { Tokens = annotate(Code, Style); ASSERT_EQ(Tokens.size(), 7u) << Tokens; EXPECT_TOKEN(Tokens[1], tok::identifier, TT_AttributeMacro); + EXPECT_TOKEN(Tokens[2], tok::identifier, TT_ClassHeadName); EXPECT_TOKEN(Tokens[3], tok::l_brace, TT_StructLBrace); EXPECT_TOKEN(Tokens[4], tok::r_brace, TT_StructRBrace); } @@ -579,11 +603,13 @@ TEST_F(TokenAnnotatorTest, UnderstandsStructs) { TEST_F(TokenAnnotatorTest, UnderstandsUnions) { auto Tokens = annotate("union U {};"); ASSERT_EQ(Tokens.size(), 6u) << Tokens; + EXPECT_TOKEN(Tokens[1], tok::identifier, TT_ClassHeadName); EXPECT_TOKEN(Tokens[2], tok::l_brace, TT_UnionLBrace); EXPECT_TOKEN(Tokens[3], tok::r_brace, TT_UnionRBrace); Tokens = annotate("union U { void f() { return; } };"); ASSERT_EQ(Tokens.size(), 14u) << Tokens; + EXPECT_TOKEN(Tokens[1], tok::identifier, TT_ClassHeadName); EXPECT_TOKEN(Tokens[2], tok::l_brace, TT_UnionLBrace); EXPECT_TOKEN(Tokens[7], tok::l_brace, TT_FunctionLBrace); EXPECT_TOKEN(Tokens[11], tok::r_brace, TT_UnionRBrace); @@ -716,8 +742,10 @@ TEST_F(TokenAnnotatorTest, UnderstandsTemplateTemplateParameters) { EXPECT_TOKEN(Tokens[11], tok::less, TT_TemplateOpener); EXPECT_TOKEN(Tokens[14], tok::greater, TT_TemplateCloser); EXPECT_FALSE(Tokens[14]->ClosesTemplateDeclaration); + EXPECT_TOKEN(Tokens[16], tok::identifier, TT_Unknown); EXPECT_TOKEN(Tokens[21], tok::greater, TT_TemplateCloser); EXPECT_TRUE(Tokens[21]->ClosesTemplateDeclaration); + EXPECT_TOKEN(Tokens[23], tok::identifier, TT_ClassHeadName); } TEST_F(TokenAnnotatorTest, UnderstandsWhitespaceSensitiveMacros) { @@ -800,6 +828,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsCasts) { Tokens = annotate("return (struct foo){};"); ASSERT_EQ(Tokens.size(), 9u) << Tokens; + EXPECT_TOKEN(Tokens[3], tok::identifier, TT_Unknown); EXPECT_TOKEN(Tokens[4], tok::r_paren, TT_CastRParen); Tokens = annotate("#define FOO(bar) foo((uint64_t)&bar)"); @@ -1143,9 +1172,13 @@ TEST_F(TokenAnnotatorTest, UnderstandsRequiresClausesAndConcepts) { EXPECT_TOKEN(Tokens[16], tok::pipepipe, TT_BinaryOperator); EXPECT_TOKEN(Tokens[21], tok::ampamp, TT_BinaryOperator); 
EXPECT_TOKEN(Tokens[27], tok::ampamp, TT_BinaryOperator); + // Not TT_TrailingAnnotation. + EXPECT_TOKEN(Tokens[28], tok::identifier, TT_Unknown); EXPECT_TOKEN(Tokens[31], tok::greater, TT_TemplateCloser); EXPECT_EQ(Tokens[31]->FakeRParens, 1u); EXPECT_TRUE(Tokens[31]->ClosesRequiresClause); + // Not TT_TrailingAnnotation. + EXPECT_TOKEN(Tokens[33], tok::identifier, TT_Unknown); Tokens = annotate("template<typename T>\n" @@ -1163,6 +1196,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsRequiresClausesAndConcepts) { EXPECT_EQ(Tokens[32]->FakeRParens, 1u); EXPECT_TOKEN(Tokens[33], tok::r_paren, TT_Unknown); EXPECT_TRUE(Tokens[33]->ClosesRequiresClause); + EXPECT_TOKEN(Tokens[35], tok::identifier, TT_Unknown); Tokens = annotate("template <typename T>\n" "void foo(T) noexcept requires Bar<T>;"); @@ -1244,6 +1278,8 @@ TEST_F(TokenAnnotatorTest, UnderstandsRequiresClausesAndConcepts) { "return number_zero_v<T>; }\n" "};"); ASSERT_EQ(Tokens.size(), 44u) << Tokens; + EXPECT_TOKEN(Tokens[6], tok::identifier, TT_ClassHeadName); + EXPECT_TOKEN(Tokens[11], tok::identifier, TT_Unknown); EXPECT_TOKEN(Tokens[13], tok::kw_requires, TT_RequiresClause); EXPECT_TOKEN(Tokens[14], tok::kw_requires, TT_RequiresExpression); EXPECT_TOKEN(Tokens[15], tok::l_brace, TT_RequiresExpressionLBrace); @@ -1255,6 +1291,8 @@ TEST_F(TokenAnnotatorTest, UnderstandsRequiresClausesAndConcepts) { annotate("template <class A, class B> concept C =" "std::same_as<std::iter_value_t<A>, std::iter_value_t<B>>;"); ASSERT_EQ(Tokens.size(), 31u) << Tokens; + EXPECT_TOKEN(Tokens[3], tok::identifier, TT_Unknown); + EXPECT_TOKEN(Tokens[6], tok::identifier, TT_Unknown); EXPECT_TOKEN(Tokens[8], tok::kw_concept, TT_Unknown); EXPECT_TOKEN(Tokens[14], tok::less, TT_TemplateOpener); EXPECT_TOKEN(Tokens[18], tok::less, TT_TemplateOpener); @@ -1871,6 +1909,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsLambdas) { ASSERT_EQ(Tokens.size(), 12u) << Tokens; EXPECT_TOKEN(Tokens[0], tok::l_square, TT_LambdaLSquare); EXPECT_TOKEN(Tokens[2], tok::arrow, TT_LambdaArrow); + EXPECT_TOKEN(Tokens[4], tok::identifier, TT_Unknown); EXPECT_TOKEN(Tokens[5], tok::l_brace, TT_LambdaLBrace); Tokens = annotate("foo([&](u32 bar) __attribute__((attr)) -> void {});"); @@ -2751,6 +2790,7 @@ TEST_F(TokenAnnotatorTest, UnderstandTableGenTokens) { // Structured statements. 
Tokens = Annotate("class Foo {}"); + EXPECT_TOKEN(Tokens[1], tok::identifier, TT_StartOfName); EXPECT_TOKEN(Tokens[2], tok::l_brace, TT_FunctionLBrace); Tokens = Annotate("def Def: Foo {}"); EXPECT_TOKEN(Tokens[2], tok::colon, TT_InheritanceColon); @@ -3216,6 +3256,7 @@ TEST_F(TokenAnnotatorTest, BraceKind) { Tokens = annotate("class Foo<int> f() {}"); ASSERT_EQ(Tokens.size(), 11u) << Tokens; + EXPECT_TOKEN(Tokens[1], tok::identifier, TT_Unknown); EXPECT_TOKEN(Tokens[5], tok::identifier, TT_FunctionDeclarationName); EXPECT_TOKEN(Tokens[6], tok::l_paren, TT_FunctionDeclarationLParen); EXPECT_TOKEN(Tokens[8], tok::l_brace, TT_FunctionLBrace); @@ -3224,6 +3265,7 @@ TEST_F(TokenAnnotatorTest, BraceKind) { Tokens = annotate("template <typename T> class Foo<T> f() {}"); ASSERT_EQ(Tokens.size(), 16u) << Tokens; + EXPECT_TOKEN(Tokens[6], tok::identifier, TT_Unknown); EXPECT_TOKEN(Tokens[10], tok::identifier, TT_FunctionDeclarationName); EXPECT_TOKEN(Tokens[11], tok::l_paren, TT_FunctionDeclarationLParen); EXPECT_TOKEN(Tokens[13], tok::l_brace, TT_FunctionLBrace); @@ -3334,36 +3376,52 @@ TEST_F(TokenAnnotatorTest, BraceKind) { Tokens = annotate("struct ::Foo {};"); ASSERT_EQ(Tokens.size(), 7u) << Tokens; + EXPECT_TOKEN(Tokens[2], tok::identifier, TT_ClassHeadName); EXPECT_BRACE_KIND(Tokens[3], BK_Block); EXPECT_BRACE_KIND(Tokens[4], BK_Block); Tokens = annotate("struct NS::Foo {};"); ASSERT_EQ(Tokens.size(), 8u) << Tokens; + EXPECT_TOKEN(Tokens[1], tok::identifier, TT_Unknown); + EXPECT_TOKEN(Tokens[3], tok::identifier, TT_ClassHeadName); EXPECT_BRACE_KIND(Tokens[4], BK_Block); EXPECT_BRACE_KIND(Tokens[5], BK_Block); Tokens = annotate("struct Foo<int> {};"); ASSERT_EQ(Tokens.size(), 9u) << Tokens; + EXPECT_TOKEN(Tokens[1], tok::identifier, TT_ClassHeadName); EXPECT_BRACE_KIND(Tokens[5], BK_Block); EXPECT_BRACE_KIND(Tokens[6], BK_Block); Tokens = annotate("struct Foo<int>::Bar {};"); ASSERT_EQ(Tokens.size(), 11u) << Tokens; + EXPECT_TOKEN(Tokens[1], tok::identifier, TT_Unknown); + EXPECT_TOKEN(Tokens[6], tok::identifier, TT_ClassHeadName); EXPECT_BRACE_KIND(Tokens[7], BK_Block); EXPECT_BRACE_KIND(Tokens[8], BK_Block); Tokens = annotate("struct Foo<int> : Base {};"); ASSERT_EQ(Tokens.size(), 11u) << Tokens; + EXPECT_TOKEN(Tokens[1], tok::identifier, TT_ClassHeadName); EXPECT_BRACE_KIND(Tokens[7], BK_Block); EXPECT_BRACE_KIND(Tokens[8], BK_Block); + Tokens = annotate("struct Foo<int> : Base::Bar {};"); + ASSERT_EQ(Tokens.size(), 13u) << Tokens; + EXPECT_TOKEN(Tokens[1], tok::identifier, TT_ClassHeadName); + EXPECT_TOKEN(Tokens[8], tok::identifier, TT_Unknown); // Not TT_ClassHeadName. 
+ EXPECT_BRACE_KIND(Tokens[9], BK_Block); + EXPECT_BRACE_KIND(Tokens[10], BK_Block); + Tokens = annotate("struct Foo final {};"); ASSERT_EQ(Tokens.size(), 7u) << Tokens; + EXPECT_TOKEN(Tokens[1], tok::identifier, TT_ClassHeadName); EXPECT_BRACE_KIND(Tokens[3], BK_Block); EXPECT_BRACE_KIND(Tokens[4], BK_Block); Tokens = annotate("struct [[foo]] [[bar]] Foo final : Base1, Base2 {};"); ASSERT_EQ(Tokens.size(), 21u) << Tokens; + EXPECT_TOKEN(Tokens[11], tok::identifier, TT_ClassHeadName); EXPECT_BRACE_KIND(Tokens[17], BK_Block); EXPECT_BRACE_KIND(Tokens[18], BK_Block); @@ -3399,6 +3457,7 @@ TEST_F(TokenAnnotatorTest, BraceKind) { "#endif\n" "};"); ASSERT_EQ(Tokens.size(), 29u) << Tokens; + EXPECT_TOKEN(Tokens[8], tok::identifier, TT_ClassHeadName); EXPECT_BRACE_KIND(Tokens[11], BK_Block); EXPECT_BRACE_KIND(Tokens[17], BK_Block); EXPECT_BRACE_KIND(Tokens[22], BK_Block); @@ -3450,6 +3509,7 @@ TEST_F(TokenAnnotatorTest, BraceKind) { Tokens = annotate("a = class extends goog.a {};", getGoogleStyle(FormatStyle::LK_JavaScript)); ASSERT_EQ(Tokens.size(), 11u) << Tokens; + EXPECT_TOKEN(Tokens[3], tok::identifier, TT_Unknown); EXPECT_TOKEN(Tokens[7], tok::l_brace, TT_ClassLBrace); EXPECT_BRACE_KIND(Tokens[7], BK_Block); EXPECT_TOKEN(Tokens[8], tok::r_brace, TT_ClassRBrace); @@ -3458,6 +3518,8 @@ TEST_F(TokenAnnotatorTest, BraceKind) { Tokens = annotate("a = class Foo extends goog.a {};", getGoogleStyle(FormatStyle::LK_JavaScript)); ASSERT_EQ(Tokens.size(), 12u) << Tokens; + EXPECT_TOKEN(Tokens[3], tok::identifier, TT_ClassHeadName); + EXPECT_TOKEN(Tokens[4], tok::identifier, TT_Unknown); // Not TT_StartOfName EXPECT_TOKEN(Tokens[8], tok::l_brace, TT_ClassLBrace); EXPECT_BRACE_KIND(Tokens[8], BK_Block); EXPECT_TOKEN(Tokens[9], tok::r_brace, TT_ClassRBrace); @@ -3466,6 +3528,8 @@ TEST_F(TokenAnnotatorTest, BraceKind) { Tokens = annotate("#define FOO(X) \\\n" " struct X##_tag_ {};"); ASSERT_EQ(Tokens.size(), 14u) << Tokens; + EXPECT_TOKEN(Tokens[7], tok::identifier, TT_Unknown); + EXPECT_TOKEN(Tokens[9], tok::identifier, TT_ClassHeadName); EXPECT_TOKEN(Tokens[10], tok::l_brace, TT_StructLBrace); EXPECT_BRACE_KIND(Tokens[10], BK_Block); EXPECT_TOKEN(Tokens[11], tok::r_brace, TT_StructRBrace); @@ -3476,6 +3540,7 @@ TEST_F(TokenAnnotatorTest, BraceKind) { " void f() { return; } \\\n" " };"); ASSERT_EQ(Tokens.size(), 20u) << Tokens; + EXPECT_TOKEN(Tokens[4], tok::identifier, TT_ClassHeadName); EXPECT_TOKEN(Tokens[8], tok::l_brace, TT_StructLBrace); EXPECT_BRACE_KIND(Tokens[8], BK_Block); EXPECT_TOKEN(Tokens[10], tok::identifier, TT_FunctionDeclarationName); diff --git clang/unittests/Sema/HeuristicResolverTest.cpp clang/unittests/Sema/HeuristicResolverTest.cpp index a0deb2d93675..5c3459dbeb10 100644 --- clang/unittests/Sema/HeuristicResolverTest.cpp +++ clang/unittests/Sema/HeuristicResolverTest.cpp @@ -155,6 +155,26 @@ TEST(HeuristicResolver, MemberExpr_SmartPointer_Qualified) { cxxMethodDecl(hasName("find"), isConst()).bind("output")); } +TEST(HeuristicResolver, MemberExpr_Static_Qualified) { + std::string Code = R"cpp( + template <typename T> + struct Waldo { + static void find(); + }; + template <typename T> + void foo(const Waldo<T>& t) { + t.find(); + } + )cpp"; + // Test resolution of "find" in "t.find()". + // The object being `const` should have no bearing on a call to a static + // method. 
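+  // (A static member function has no implicit object parameter, so the
+  // constness of `t` cannot disqualify the candidate.)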
+ expectResolution( + Code, &HeuristicResolver::resolveMemberExpr, + cxxDependentScopeMemberExpr(hasMemberName("find")).bind("input"), + cxxMethodDecl(hasName("find")).bind("output")); +} + TEST(HeuristicResolver, MemberExpr_AutoTypeDeduction1) { std::string Code = R"cpp( template <typename T> diff --git clang/unittests/StaticAnalyzer/CMakeLists.txt clang/unittests/StaticAnalyzer/CMakeLists.txt index f5da86e54560..3b01a4e9e532 100644 --- clang/unittests/StaticAnalyzer/CMakeLists.txt +++ clang/unittests/StaticAnalyzer/CMakeLists.txt @@ -15,6 +15,7 @@ add_clang_unittest(StaticAnalysisTests IsCLibraryFunctionTest.cpp MemRegionDescriptiveNameTest.cpp NoStateChangeFuncVisitorTest.cpp + ObjcBug-124477.cpp ParamRegionTest.cpp RangeSetTest.cpp RegisterCustomCheckersTest.cpp diff --git clang/unittests/StaticAnalyzer/ObjcBug-124477.cpp clang/unittests/StaticAnalyzer/ObjcBug-124477.cpp new file mode 100644 index 000000000000..51bd33210032 --- /dev/null +++ clang/unittests/StaticAnalyzer/ObjcBug-124477.cpp @@ -0,0 +1,63 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "CheckerRegistration.h" +#include "clang/StaticAnalyzer/Core/Checker.h" +#include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h" +#include "clang/StaticAnalyzer/Core/PathSensitive/ProgramStateTrait.h" +#include "clang/StaticAnalyzer/Frontend/AnalysisConsumer.h" +#include "clang/StaticAnalyzer/Frontend/CheckerRegistry.h" +#include "gtest/gtest.h" + +using namespace clang; +using namespace ento; + +// Some dummy trait that we can mutate back and forth to force a new State. +REGISTER_TRAIT_WITH_PROGRAMSTATE(Flag, bool) + +namespace { +class FlipFlagOnCheckLocation : public Checker<check::Location> { +public: + // We make sure we alter the State every time we model a checkLocation event. + void checkLocation(SVal l, bool isLoad, const Stmt *S, + CheckerContext &C) const { + ProgramStateRef State = C.getState(); + State = State->set<Flag>(!State->get<Flag>()); + C.addTransition(State); + } +}; + +void addFlagFlipperChecker(AnalysisASTConsumer &AnalysisConsumer, + AnalyzerOptions &AnOpts) { + AnOpts.CheckersAndPackages = {{"test.FlipFlagOnCheckLocation", true}}; + AnalysisConsumer.AddCheckerRegistrationFn([](CheckerRegistry &Registry) { + Registry.addChecker<FlipFlagOnCheckLocation>("test.FlipFlagOnCheckLocation", + "Description", ""); + }); +} + +TEST(ObjCTest, CheckLocationEventsShouldMaterializeInObjCForCollectionStmts) { + // Previously, `ExprEngine::hasMoreIteration` could fire an assertion + // because the resulting nodes of the check::Location callback for + // ObjCForCollectionStmts were not handled correctly. This left the graph + // in an inconsistent state, which triggered the assertion. + // See #124477 for more details. + std::string Diags; + EXPECT_TRUE(runCheckerOnCodeWithArgs<addFlagFlipperChecker>( + R"( + @class NSArray, NSDictionary, NSString; + extern void NSLog(NSString *format, ...) 
__attribute__((format(__NSString__, 1, 2))); + void entrypoint(NSArray *bowl) { + for (NSString *fruit in bowl) { // no-crash + NSLog(@"Fruit: %@", fruit); + } + })", + {"-x", "objective-c"}, Diags)); +} + +} // namespace diff --git clang/utils/TableGen/ClangDiagnosticsEmitter.cpp clang/utils/TableGen/ClangDiagnosticsEmitter.cpp index 50dbe4d5a8ca..8f846a4744bb 100644 --- clang/utils/TableGen/ClangDiagnosticsEmitter.cpp +++ clang/utils/TableGen/ClangDiagnosticsEmitter.cpp @@ -359,7 +359,7 @@ void InferPedantic::compute(VecOrSet DiagsInPedantic, // The diagnostic is not included in a group that is (transitively) in // -Wpedantic. Include it in -Wpedantic directly. - if (auto *V = DiagsInPedantic.dyn_cast<RecordVec *>()) + if (auto *V = dyn_cast<RecordVec *>(DiagsInPedantic)) V->push_back(R); else cast<RecordSet *>(DiagsInPedantic)->insert(R); @@ -386,7 +386,7 @@ void InferPedantic::compute(VecOrSet DiagsInPedantic, if (Parents.size() > 0 && AllParentsInPedantic) continue; - if (auto *V = GroupsInPedantic.dyn_cast<RecordVec *>()) + if (auto *V = dyn_cast<RecordVec *>(GroupsInPedantic)) V->push_back(Group); else cast<RecordSet *>(GroupsInPedantic)->insert(Group); diff --git clang/utils/TableGen/SveEmitter.cpp clang/utils/TableGen/SveEmitter.cpp index 687d344163e2..39dcbc678dc4 100644 --- clang/utils/TableGen/SveEmitter.cpp +++ clang/utils/TableGen/SveEmitter.cpp @@ -181,6 +181,8 @@ class Intrinsic { SmallVector<ImmCheck, 2> ImmChecks; + bool SetsFPMR; + public: Intrinsic(StringRef Name, StringRef Proto, uint64_t MergeTy, StringRef MergeSuffix, uint64_t MemoryElementTy, StringRef LLVMName, @@ -278,6 +280,7 @@ public: private: std::string getMergeSuffix() const { return MergeSuffix; } + StringRef getFPMSuffix() const { return SetsFPMR ? "_fpm" : ""; } std::string mangleName(ClassKind LocalCK) const; std::string mangleLLVMName() const; std::string replaceTemplatedArgs(std::string Name, TypeSpec TS, @@ -983,6 +986,7 @@ Intrinsic::Intrinsic(StringRef Name, StringRef Proto, uint64_t MergeTy, std::tie(Mod, NumVectors) = getProtoModifier(Proto, I); SVEType T(BaseTypeSpec, Mod, NumVectors); Types.push_back(T); + SetsFPMR = T.isFpm(); // Add range checks for immediates if (I > 0) { @@ -1001,6 +1005,8 @@ Intrinsic::Intrinsic(StringRef Name, StringRef Proto, uint64_t MergeTy, this->Flags |= Emitter.encodeMergeType(MergeTy); if (hasSplat()) this->Flags |= Emitter.encodeSplatOperand(getSplatIdx()); + if (SetsFPMR) + this->Flags |= Emitter.getEnumValueForFlag("SetsFPMR"); } std::string Intrinsic::getBuiltinTypeStr() { @@ -1089,8 +1095,9 @@ std::string Intrinsic::mangleName(ClassKind LocalCK) const { } // Replace all {d} like expressions with e.g. 
'u32' - return replaceTemplatedArgs(S, getBaseTypeSpec(), getProto()) + - getMergeSuffix(); + return replaceTemplatedArgs(S, getBaseTypeSpec(), getProto()) + .append(getMergeSuffix()) + .append(getFPMSuffix()); } void Intrinsic::emitIntrinsic(raw_ostream &OS, SVEEmitter &Emitter, diff --git compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake index ee5be276f3df..2683259e93e3 100644 --- compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake +++ compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake @@ -103,7 +103,7 @@ if(APPLE) set(ALL_XRAY_SUPPORTED_ARCH ${X86_64} ${ARM64}) else() set(ALL_XRAY_SUPPORTED_ARCH ${X86_64} ${ARM32} ${ARM64} ${MIPS32} ${MIPS64} - powerpc64le ${HEXAGON} ${LOONGARCH64} ${RISCV32} ${RISCV64}) + powerpc64le ${HEXAGON} ${LOONGARCH64} ${RISCV32} ${RISCV64} ${S390X}) endif() set(ALL_XRAY_DSO_SUPPORTED_ARCH ${X86_64} ${ARM64}) set(ALL_SHADOWCALLSTACK_SUPPORTED_ARCH ${ARM64}) diff --git compiler-rt/lib/orc/macho_platform.cpp compiler-rt/lib/orc/macho_platform.cpp index 4b603fd95e31..8ca68587aeb3 100644 --- compiler-rt/lib/orc/macho_platform.cpp +++ compiler-rt/lib/orc/macho_platform.cpp @@ -557,12 +557,6 @@ Error MachOPlatformRuntimeState::registerObjectPlatformSections( return make_error<StringError>(ErrStream.str()); } - ORC_RT_DEBUG({ - printdbg(" UnwindInfo: %s, UseCallbackStyleUnwindInfo: %s\n", - UnwindInfo ? "true" : "false", - UseCallbackStyleUnwindInfo ? "true" : "false"); - }); - if (UnwindInfo && UseCallbackStyleUnwindInfo) { ORC_RT_DEBUG({ printdbg(" Registering new-style unwind info for:\n" diff --git compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp index d15f30c61b58..0b8a75391136 100644 --- compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp +++ compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp @@ -1203,13 +1203,14 @@ uptr MapDynamicShadow(uptr shadow_size_bytes, uptr shadow_scale, const uptr left_padding = Max<uptr>(granularity, 1ULL << min_shadow_base_alignment); - uptr space_size = shadow_size_bytes + left_padding; + uptr space_size = shadow_size_bytes; uptr largest_gap_found = 0; uptr max_occupied_addr = 0; + VReport(2, "FindDynamicShadowStart, space_size = %p\n", (void *)space_size); uptr shadow_start = - FindAvailableMemoryRange(space_size, alignment, granularity, + FindAvailableMemoryRange(space_size, alignment, left_padding, &largest_gap_found, &max_occupied_addr); // If the shadow doesn't fit, restrict the address space to make it fit. 
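+  // (Lowering the maximum VM address also shrinks the required shadow,
+  // recomputed below as high_mem_end >> shadow_scale, so the search can be
+  // retried with a smaller space_size.)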
if (shadow_start == 0) { @@ -1229,9 +1230,9 @@ uptr MapDynamicShadow(uptr shadow_size_bytes, uptr shadow_scale, } RestrictMemoryToMaxAddress(new_max_vm); high_mem_end = new_max_vm - 1; - space_size = (high_mem_end >> shadow_scale) + left_padding; + space_size = (high_mem_end >> shadow_scale); VReport(2, "FindDynamicShadowStart, space_size = %p\n", (void *)space_size); - shadow_start = FindAvailableMemoryRange(space_size, alignment, granularity, + shadow_start = FindAvailableMemoryRange(space_size, alignment, left_padding, nullptr, nullptr); if (shadow_start == 0) { Report("Unable to find a memory range after restricting VM.\n"); @@ -1272,10 +1273,15 @@ uptr FindAvailableMemoryRange(uptr size, uptr alignment, uptr left_padding, mach_msg_type_number_t count = kRegionInfoSize; kr = mach_vm_region_recurse(mach_task_self(), &address, &vmsize, &depth, (vm_region_info_t)&vminfo, &count); - if (kr == KERN_INVALID_ADDRESS) { + + // There are cases where going beyond the process's max vm does + // not return KERN_INVALID_ADDRESS, so we check for going beyond that + // max address as well. + if (kr == KERN_INVALID_ADDRESS || address > max_vm_address) { // No more regions beyond "address", consider the gap at the end of VM. address = max_vm_address; vmsize = 0; + kr = -1; // break after this iteration. } else { if (max_occupied_addr) *max_occupied_addr = address + vmsize; } diff --git compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp index 3638f1c36ddd..e1471dfdf680 100644 --- compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp +++ compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp @@ -32,7 +32,7 @@ template <typename Config> static scudo::Options getOptionsForConfig() { return AO.load(); } -template <typename Config> static void testSecondaryBasic(void) { +template <typename Config> static void testBasic(void) { using SecondaryT = scudo::MapAllocator<scudo::SecondaryConfig<Config>>; scudo::Options Options = getOptionsForConfig<scudo::SecondaryConfig<Config>>(); @@ -85,7 +85,7 @@ template <typename Config> static void testSecondaryBasic(void) { L->unmapTestOnly(); } -struct NoCacheConfig { +struct TestNoCacheConfig { static const bool MaySupportMemoryTagging = false; template <typename> using TSDRegistryT = void; template <typename> using PrimaryT = void; @@ -97,7 +97,7 @@ }; }; -struct TestConfig { +struct TestCacheConfig { static const bool MaySupportMemoryTagging = false; template <typename> using TSDRegistryT = void; template <typename> using PrimaryT = void; @@ -117,15 +117,15 @@ }; }; -TEST(ScudoSecondaryTest, SecondaryBasic) { - testSecondaryBasic<NoCacheConfig>(); - testSecondaryBasic<scudo::DefaultConfig>(); - testSecondaryBasic<TestConfig>(); +TEST(ScudoSecondaryTest, Basic) { + testBasic<TestNoCacheConfig>(); + testBasic<TestCacheConfig>(); + testBasic<scudo::DefaultConfig>(); } -struct MapAllocatorTest : public Test { - using Config = scudo::DefaultConfig; - using LargeAllocator = scudo::MapAllocator<scudo::SecondaryConfig<Config>>; +struct ScudoSecondaryAllocatorTest : public Test { + using LargeAllocator = + scudo::MapAllocator<scudo::SecondaryConfig<TestNoCacheConfig>>; void SetUp() override { Allocator->init(nullptr); } @@ -134,13 +134,13 @@ std::unique_ptr<LargeAllocator> Allocator = std::make_unique<LargeAllocator>(); scudo::Options Options = - 
getOptionsForConfig<scudo::SecondaryConfig<Config>>(); + getOptionsForConfig<scudo::SecondaryConfig<TestNoCacheConfig>>(); }; // This exercises a variety of combinations of size and alignment for the // MapAllocator. The size computation done here mimics the ones done by the // combined allocator. -TEST_F(MapAllocatorTest, SecondaryCombinations) { +TEST_F(ScudoSecondaryAllocatorTest, Combinations) { constexpr scudo::uptr MinAlign = FIRST_32_SECOND_64(8, 16); constexpr scudo::uptr HeaderSize = scudo::roundUp(8, MinAlign); for (scudo::uptr SizeLog = 0; SizeLog <= 20; SizeLog++) { @@ -168,7 +168,7 @@ Str.output(); } -TEST_F(MapAllocatorTest, SecondaryIterate) { +TEST_F(ScudoSecondaryAllocatorTest, Iterate) { std::vector<void *> V; const scudo::uptr PageSize = scudo::getPageSizeCached(); for (scudo::uptr I = 0; I < 32U; I++) @@ -190,34 +190,8 @@ Str.output(); } -TEST_F(MapAllocatorTest, SecondaryCacheOptions) { - if (!Allocator->canCache(0U)) - TEST_SKIP("Secondary Cache disabled"); - - // Attempt to set a maximum number of entries higher than the array size. - EXPECT_TRUE(Allocator->setOption(scudo::Option::MaxCacheEntriesCount, 4096U)); - - // Attempt to set an invalid (negative) number of entries - EXPECT_FALSE(Allocator->setOption(scudo::Option::MaxCacheEntriesCount, -1)); - - // Various valid combinations. - EXPECT_TRUE(Allocator->setOption(scudo::Option::MaxCacheEntriesCount, 4U)); - EXPECT_TRUE( - Allocator->setOption(scudo::Option::MaxCacheEntrySize, 1UL << 20)); - EXPECT_TRUE(Allocator->canCache(1UL << 18)); - EXPECT_TRUE( - Allocator->setOption(scudo::Option::MaxCacheEntrySize, 1UL << 17)); - EXPECT_FALSE(Allocator->canCache(1UL << 18)); - EXPECT_TRUE(Allocator->canCache(1UL << 16)); - EXPECT_TRUE(Allocator->setOption(scudo::Option::MaxCacheEntriesCount, 0U)); - EXPECT_FALSE(Allocator->canCache(1UL << 16)); - EXPECT_TRUE(Allocator->setOption(scudo::Option::MaxCacheEntriesCount, 4U)); - EXPECT_TRUE( - Allocator->setOption(scudo::Option::MaxCacheEntrySize, 1UL << 20)); - EXPECT_TRUE(Allocator->canCache(1UL << 16)); -} - -struct MapAllocatorWithReleaseTest : public MapAllocatorTest { +struct ScudoSecondaryAllocatorWithReleaseTest + : public ScudoSecondaryAllocatorTest { void SetUp() override { Allocator->init(nullptr, /*ReleaseToOsInterval=*/0); } void performAllocations() { @@ -249,11 +223,11 @@ bool Ready = false; }; -TEST_F(MapAllocatorWithReleaseTest, SecondaryThreadsRace) { +TEST_F(ScudoSecondaryAllocatorWithReleaseTest, ThreadsRace) { std::thread Threads[16]; for (scudo::uptr I = 0; I < ARRAY_SIZE(Threads); I++) - Threads[I] = - std::thread(&MapAllocatorWithReleaseTest::performAllocations, this); + Threads[I] = std::thread( + &ScudoSecondaryAllocatorWithReleaseTest::performAllocations, this); { std::unique_lock<std::mutex> Lock(Mutex); Ready = true; @@ -266,7 +240,7 @@ Str.output(); } -struct MapAllocatorCacheTest : public Test { +struct ScudoSecondaryAllocatorCacheTest : public Test { static constexpr scudo::u32 UnmappedMarker = 0xDEADBEEF; static void testUnmapCallback(scudo::MemMapT &MemMap) { @@ -274,7 +248,7 @@ *Ptr = UnmappedMarker; } - using SecondaryConfig = scudo::SecondaryConfig<TestConfig>; + using SecondaryConfig = scudo::SecondaryConfig<TestCacheConfig>; using CacheConfig = SecondaryConfig::CacheConfig; using CacheT = scudo::MapAllocatorCache<CacheConfig, 
testUnmapCallback>; @@ -315,7 +289,7 @@ struct MapAllocatorCacheTest : public Test { } }; -TEST_F(MapAllocatorCacheTest, CacheOrder) { +TEST_F(ScudoSecondaryAllocatorCacheTest, EntryOrder) { std::vector<scudo::MemMapT> MemMaps; Cache->setOption(scudo::Option::MaxCacheEntriesCount, CacheConfig::getEntriesArraySize()); @@ -336,7 +310,7 @@ TEST_F(MapAllocatorCacheTest, CacheOrder) { MemMap.unmap(); } -TEST_F(MapAllocatorCacheTest, PartialChunkHeuristicRetrievalTest) { +TEST_F(ScudoSecondaryAllocatorCacheTest, PartialChunkHeuristicRetrievalTest) { const scudo::uptr FragmentedPages = 1 + scudo::CachedBlock::MaxReleasedCachePages; scudo::uptr EntryHeaderPos; @@ -360,7 +334,7 @@ TEST_F(MapAllocatorCacheTest, PartialChunkHeuristicRetrievalTest) { MemMap.unmap(); } -TEST_F(MapAllocatorCacheTest, MemoryLeakTest) { +TEST_F(ScudoSecondaryAllocatorCacheTest, MemoryLeakTest) { std::vector<scudo::MemMapT> MemMaps; // Fill the cache above MaxEntriesCount to force an eviction // The first cache entry should be evicted (because it is the oldest) @@ -387,3 +361,24 @@ TEST_F(MapAllocatorCacheTest, MemoryLeakTest) { for (auto &MemMap : MemMaps) MemMap.unmap(); } + +TEST_F(ScudoSecondaryAllocatorCacheTest, Options) { + // Attempt to set a maximum number of entries higher than the array size. + EXPECT_TRUE(Cache->setOption(scudo::Option::MaxCacheEntriesCount, 4096U)); + + // Attempt to set an invalid (negative) number of entries + EXPECT_FALSE(Cache->setOption(scudo::Option::MaxCacheEntriesCount, -1)); + + // Various valid combinations. + EXPECT_TRUE(Cache->setOption(scudo::Option::MaxCacheEntriesCount, 4U)); + EXPECT_TRUE(Cache->setOption(scudo::Option::MaxCacheEntrySize, 1UL << 20)); + EXPECT_TRUE(Cache->canCache(1UL << 18)); + EXPECT_TRUE(Cache->setOption(scudo::Option::MaxCacheEntrySize, 1UL << 17)); + EXPECT_FALSE(Cache->canCache(1UL << 18)); + EXPECT_TRUE(Cache->canCache(1UL << 16)); + EXPECT_TRUE(Cache->setOption(scudo::Option::MaxCacheEntriesCount, 0U)); + EXPECT_FALSE(Cache->canCache(1UL << 16)); + EXPECT_TRUE(Cache->setOption(scudo::Option::MaxCacheEntriesCount, 4U)); + EXPECT_TRUE(Cache->setOption(scudo::Option::MaxCacheEntrySize, 1UL << 20)); + EXPECT_TRUE(Cache->canCache(1UL << 16)); +} diff --git compiler-rt/lib/xray/CMakeLists.txt compiler-rt/lib/xray/CMakeLists.txt index e7f01a2f4f16..673091807e34 100644 --- compiler-rt/lib/xray/CMakeLists.txt +++ compiler-rt/lib/xray/CMakeLists.txt @@ -106,6 +106,13 @@ set(riscv64_SOURCES xray_trampoline_riscv64.S ) + set(s390x_SOURCES + xray_s390x.cpp + xray_trampoline_s390x.S + ) +# Enable vector instructions in the assembly file. 
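+# xray_trampoline_s390x.S provides vector and non-vector variants of the
+# trampolines; -mvx is needed so the assembler accepts the vector
+# instructions used by the vector variants.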
+set_source_files_properties(xray_trampoline_s390x.S PROPERTIES COMPILE_FLAGS -mvx) + set(XRAY_SOURCE_ARCHS arm armhf @@ -116,6 +123,7 @@ set(XRAY_SOURCE_ARCHS mips64 mips64el powerpc64le + s390x x86_64 ) @@ -168,6 +176,7 @@ set(XRAY_ALL_SOURCE_FILES ${powerpc64le_SOURCES} ${riscv32_SOURCES} ${riscv64_SOURCES} + ${s390x_SOURCES} ${XRAY_IMPL_HEADERS} ) list(REMOVE_DUPLICATES XRAY_ALL_SOURCE_FILES) diff --git compiler-rt/lib/xray/xray_interface.cpp compiler-rt/lib/xray/xray_interface.cpp index 4ec492c266d8..3f97827874a7 100644 --- compiler-rt/lib/xray/xray_interface.cpp +++ compiler-rt/lib/xray/xray_interface.cpp @@ -61,6 +61,8 @@ static const int16_t cSledLength = 20; static const int16_t cSledLength = 68; #elif defined(__riscv) && (__riscv_xlen == 32) static const int16_t cSledLength = 52; +#elif defined(__s390x__) +static const int16_t cSledLength = 18; #else #error "Unsupported CPU Architecture" #endif /* CPU architecture */ diff --git compiler-rt/lib/xray/xray_interface_internal.h compiler-rt/lib/xray/xray_interface_internal.h index a8cfe0fde84d..5dcccfe825cf 100644 --- compiler-rt/lib/xray/xray_interface_internal.h +++ compiler-rt/lib/xray/xray_interface_internal.h @@ -29,6 +29,10 @@ extern void __xray_FunctionTailExit(); extern void __xray_ArgLoggerEntry(); extern void __xray_CustomEvent(); extern void __xray_TypedEvent(); +#if defined(__s390x__) +extern void __xray_FunctionEntryVec(); +extern void __xray_FunctionExitVec(); +#endif } extern "C" { diff --git compiler-rt/lib/xray/xray_s390x.cpp compiler-rt/lib/xray/xray_s390x.cpp new file mode 100644 index 000000000000..599485435671 --- /dev/null +++ compiler-rt/lib/xray/xray_s390x.cpp @@ -0,0 +1,104 @@ +//===-- xray_s390x.cpp ------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// Implementation of s390x routines. +// +//===----------------------------------------------------------------------===// +#include "sanitizer_common/sanitizer_common.h" +#include "xray_defs.h" +#include "xray_interface_internal.h" +#include <cassert> +#include <cstring> + +bool __xray::patchFunctionEntry(const bool Enable, const uint32_t FuncId, + const XRaySledEntry &Sled, + const XRayTrampolines &Trampolines, + bool LogArgs) XRAY_NEVER_INSTRUMENT { + uint32_t *Address = reinterpret_cast<uint32_t *>(Sled.address()); + // TODO: Trampoline addresses are currently inserted at compile-time, using + // __xray_FunctionEntry and __xray_FunctionExit only. + // To support DSO instrumentation, trampolines have to be written during + // patching (see implementation on X86_64, e.g.). + if (Enable) { + // The resulting code is: + // stmg %r2, %r15, 16(%r15) + // llilf %2, FuncID + // brasl %r14, __xray_FunctionEntry@GOT + // The FuncId and the stmg instruction must be written. + + // Write FuncId into llilf. + Address[2] = FuncId; + // Write last part of stmg. + reinterpret_cast<uint16_t *>(Address)[2] = 0x24; + // Write first part of stmg. + Address[0] = 0xeb2ff010; + } else { + // j +16 instructions. 
+ Address[0] = 0xa7f4000b; + } + return true; +} + +bool __xray::patchFunctionExit( + const bool Enable, const uint32_t FuncId, const XRaySledEntry &Sled, + const XRayTrampolines &Trampolines) XRAY_NEVER_INSTRUMENT { + uint32_t *Address = reinterpret_cast<uint32_t *>(Sled.address()); + // TODO: Trampoline addresses are currently inserted at compile-time, using + // __xray_FunctionEntry and __xray_FunctionExit only. + // To support DSO instrumentation, trampolines have to be written during + // patching (see implementation on X86_64, e.g.). + if (Enable) { + // The resulting code is: + // stmg %r2, %r15, 16(%r15) + // llilf %2, FuncID + // j __xray_FunctionExit@GOT + // The FuncId and the stmg instruction must be written. + + // Write FuncId into llilf. + Address[2] = FuncId; + // Write last part of stmg. + reinterpret_cast<uint16_t *>(Address)[2] = 0x24; + // Write first part of stmg. + Address[0] = 0xeb2ff010; + } else { + // br %r14 instruction. + reinterpret_cast<uint16_t *>(Address)[0] = 0x07fe; + } + return true; +} + +bool __xray::patchFunctionTailExit( + const bool Enable, const uint32_t FuncId, const XRaySledEntry &Sled, + const XRayTrampolines &Trampolines) XRAY_NEVER_INSTRUMENT { + return patchFunctionExit(Enable, FuncId, Sled, Trampolines); +} + +bool __xray::patchCustomEvent(const bool Enable, const uint32_t FuncId, + const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { + // TODO: Implement. + return false; +} + +bool __xray::patchTypedEvent(const bool Enable, const uint32_t FuncId, + const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { + // TODO: Implement. + return false; +} + +extern "C" void __xray_ArgLoggerEntry() XRAY_NEVER_INSTRUMENT { + // TODO: This will have to be implemented in the trampoline assembly file. +} + +extern "C" void __xray_FunctionTailExit() XRAY_NEVER_INSTRUMENT { + // On s390x, calls to __xray_FunctionEntry and __xray_FunctionExit + // are statically inserted into the sled. Tail exits are handled like normal + // function exits. This trampoline is therefore not implemented. + // This stub is placed here to avoid linking issues. +} diff --git compiler-rt/lib/xray/xray_trampoline_s390x.S compiler-rt/lib/xray/xray_trampoline_s390x.S new file mode 100644 index 000000000000..4073943641b9 --- /dev/null +++ compiler-rt/lib/xray/xray_trampoline_s390x.S @@ -0,0 +1,176 @@ +//===-- xray_trampoline_s390x.S ---------------------------------*- ASM -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// This implements the s390x-specific assembler for the trampolines. +// Two versions of the functions are provided: one which does not store the +// vector registers, and one which does store them. The compiler decides +// which to call based on the availability of the vector extension. +// +//===----------------------------------------------------------------------===// + + .text + +// Minimal stack frame size. +#define STACKSZ 160 + +// Minimal stack frame size (160) plus space for 8 vector registers of 16 bytes each.
+#define STACKSZ_VEC 288 + +//===----------------------------------------------------------------------===// + + .globl __xray_FunctionEntry + .p2align 4 + .type __xray_FunctionEntry,@function +__xray_FunctionEntry: + # The registers r2-15 of the instrumented function are already saved in the + # stack frame. On entry, r2 contains the function id, and %r14 the address + # of the first instruction of the instrumented function. + # Register r14 will be stored in the slot reserved for compiler use. + stg %r14, 8(%r15) + std %f0, 128(%r15) + std %f2, 136(%r15) + std %f4, 144(%r15) + std %f6, 152(%r15) + aghi %r15, -STACKSZ + + lgrl %r1, _ZN6__xray19XRayPatchedFunctionE@GOT + ltg %r1, 0(%r1) + je .Lrestore0 + + # Set r3 to XRayEntryType::ENTRY = 0. + # The FuncId is still stored in r2. + lghi %r3, 0 + basr %r14, %r1 + +.Lrestore0: + ld %f6, STACKSZ+152(%r15) + ld %f4, STACKSZ+144(%r15) + ld %f2, STACKSZ+136(%r15) + ld %f0, STACKSZ+128(%r15) + lmg %r1, %r15, STACKSZ+8(%r15) + br %r1 +.Lfunc_end0: + .size __xray_FunctionEntry, .Lfunc_end0-__xray_FunctionEntry + +//===----------------------------------------------------------------------===// + + .globl __xray_FunctionEntryVec + .p2align 4 + .type __xray_FunctionEntryVec,@function +__xray_FunctionEntryVec: + # The registers r2-15 of the instrumented function are already saved in the + # stack frame. On entry, r2 contains the function id, and %r14 the address + # of the first instruction of the instrumented function. + # Register r14 will be stored in the slot reserved for compiler use. + stg %r14, 8(%r15) + std %f0, 128(%r15) + std %f2, 136(%r15) + std %f4, 144(%r15) + std %f6, 152(%r15) + aghi %r15, -STACKSZ_VEC + vstm %v24, %v31, 160(%r15) + + lgrl %r1, _ZN6__xray19XRayPatchedFunctionE@GOT + ltg %r1, 0(%r1) + je .Lrestore1 + + # Set r3 to XRayEntryType::ENTRY = 0. + # The FuncId is still stored in r2. + lghi %r3, 0 + basr %r14, %r1 + +.Lrestore1: + vlm %v24, %v31, 160(%r15) + ld %f6, STACKSZ_VEC+152(%r15) + ld %f4, STACKSZ_VEC+144(%r15) + ld %f2, STACKSZ_VEC+136(%r15) + ld %f0, STACKSZ_VEC+128(%r15) + lmg %r1, %r15, STACKSZ_VEC+8(%r15) + br %r1 +.Lfunc_end1: + .size __xray_FunctionEntryVec, .Lfunc_end1-__xray_FunctionEntryVec + +//===----------------------------------------------------------------------===// + + .globl __xray_FunctionExit + .p2align 4 + .type __xray_FunctionExit,@function +__xray_FunctionExit: + # The registers r2-15 of the instrumented function are already saved in the + # stack frame. On entry, the register r2 contains the function id. + # At the end, the function jumps to the address saved in the slot for r14, + # which contains the return address into the caller of the instrumented + # function. + std %f0, 128(%r15) + std %f2, 136(%r15) + std %f4, 144(%r15) + std %f6, 152(%r15) + aghi %r15, -STACKSZ + + lgrl %r1, _ZN6__xray19XRayPatchedFunctionE@GOT + ltg %r1, 0(%r1) + je .Lrestore2 + + # Set r3 to XRayEntryType::EXIT = 1. + # The FuncId is still stored in r2. + lghi %r3, 1 + basr %r14, %r1 + +.Lrestore2: + ld %f6, STACKSZ+152(%r15) + ld %f4, STACKSZ+144(%r15) + ld %f2, STACKSZ+136(%r15) + ld %f0, STACKSZ+128(%r15) + lmg %r2, %r15, STACKSZ+16(%r15) + br %r14 +.Lfunc_end2: + .size __xray_FunctionExit, .Lfunc_end2-__xray_FunctionExit + +//===----------------------------------------------------------------------===// + + .globl __xray_FunctionExitVec + .p2align 4 + .type __xray_FunctionExitVec,@function +__xray_FunctionExitVec: + # The registers r2-15 of the instrumented function are already saved in the + # stack frame. 
On entry, the register r2 contains the function id. + # At the end, the function jumps to the address saved in the slot for r14, + # which contains the return address into the caller of the instrumented + # function. + std %f0, 128(%r15) + std %f2, 136(%r15) + std %f4, 144(%r15) + std %f6, 152(%r15) + aghi %r15, -STACKSZ_VEC + vstm %v24, %v31, 160(%r15) + + lgrl %r1, _ZN6__xray19XRayPatchedFunctionE@GOT + ltg %r1, 0(%r1) + je .Lrestore3 + + # Set r3 to XRayEntryType::EXIT = 1. + # The FuncId is still stored in r2. + lghi %r3, 1 + basr %r14, %r1 + +.Lrestore3: + vlm %v24, %v31, 160(%r15) + ld %f6, STACKSZ_VEC+152(%r15) + ld %f4, STACKSZ_VEC+144(%r15) + ld %f2, STACKSZ_VEC+136(%r15) + ld %f0, STACKSZ_VEC+128(%r15) + lmg %r2, %r15, STACKSZ_VEC+16(%r15) + br %r14 +.Lfunc_end3: + .size __xray_FunctionExitVec, .Lfunc_end3-__xray_FunctionExitVec + +//===----------------------------------------------------------------------===// + + .section ".note.GNU-stack","",@progbits diff --git compiler-rt/lib/xray/xray_tsc.h compiler-rt/lib/xray/xray_tsc.h index b62a686d6ce0..17e06c7035d8 100644 --- compiler-rt/lib/xray/xray_tsc.h +++ compiler-rt/lib/xray/xray_tsc.h @@ -83,6 +83,28 @@ inline uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT { } // namespace __xray +#elif defined(__s390x__) +#include "sanitizer_common/sanitizer_common.h" +#include "sanitizer_common/sanitizer_internal_defs.h" +#include "xray_defs.h" +#include <cerrno> +#include <cstdint> +#include <time.h> + +namespace __xray { + +inline bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { return true; } + +ALWAYS_INLINE uint64_t readTSC(uint8_t &CPU) XRAY_NEVER_INSTRUMENT { + return __builtin_readcyclecounter(); +} + +inline uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT { + return NanosecondsPerSecond; +} + +} // namespace __xray + #else #error Target architecture is not supported. #endif // CPU architecture diff --git compiler-rt/test/asan/TestCases/suppressions-alloc-dealloc-mismatch.cpp compiler-rt/test/asan/TestCases/suppressions-alloc-dealloc-mismatch.cpp index 43478ec2f345..6ab796b1c76a 100644 --- compiler-rt/test/asan/TestCases/suppressions-alloc-dealloc-mismatch.cpp +++ compiler-rt/test/asan/TestCases/suppressions-alloc-dealloc-mismatch.cpp @@ -9,6 +9,10 @@ // FIXME: Upload suppressions to device. // XFAIL: android +// FIXME: atos does not work for inlined functions, yet llvm-symbolizer +// does not always work with debug info on Darwin.
+// UNSUPPORTED: darwin + #include <stdio.h> #include <stdlib.h> #include <string.h> diff --git compiler-rt/test/orc/TestCases/Darwin/Generic/exceptions.cpp compiler-rt/test/orc/TestCases/Darwin/Generic/exceptions.cpp deleted file mode 100644 index 7e9c40c724ae..000000000000 --- compiler-rt/test/orc/TestCases/Darwin/Generic/exceptions.cpp +++ /dev/null @@ -1,13 +0,0 @@ -// RUN: %clangxx -c -o %t %s -// RUN: %llvm_jitlink -slab-allocate=20Mb %t -// -// REQUIRES: system-darwin && host-arch-compatible - -int main(int argc, char *argv[]) { - try { - throw 42; - } catch (int E) { - return 42 - E; - } - return 1; -} diff --git flang-rt/lib/cuda/allocatable.cpp flang-rt/lib/cuda/allocatable.cpp index 2f549b604fe5..b773e802c90f 100644 --- flang-rt/lib/cuda/allocatable.cpp +++ flang-rt/lib/cuda/allocatable.cpp @@ -24,10 +24,10 @@ extern "C" { RT_EXT_API_GROUP_BEGIN int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, int64_t stream, - bool hasStat, const Descriptor *errMsg, const char *sourceFile, - int sourceLine) { + bool *pinned, bool hasStat, const Descriptor *errMsg, + const char *sourceFile, int sourceLine) { int stat{RTNAME(CUFAllocatableAllocate)( - desc, stream, hasStat, errMsg, sourceFile, sourceLine)}; + desc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)}; #ifndef RT_DEVICE_COMPILATION // Descriptor synchronization is only done when the allocation is done // from the host. @@ -42,8 +42,8 @@ int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, int64_t stream, } int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int64_t stream, - bool hasStat, const Descriptor *errMsg, const char *sourceFile, - int sourceLine) { + bool *pinned, bool hasStat, const Descriptor *errMsg, + const char *sourceFile, int sourceLine) { if (desc.HasAddendum()) { Terminator terminator{sourceFile, sourceLine}; // TODO: This requires a bit more work to set the correct type descriptor @@ -54,14 +54,19 @@ int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int64_t stream, // Perform the standard allocation. int stat{RTNAME(AllocatableAllocate)( desc, hasStat, errMsg, sourceFile, sourceLine)}; + if (pinned) { + // Set pinned according to stat. More infrastructure is needed to set it + // closer to the actual allocation call.
+ *pinned = (stat == StatOk); + } return stat; } int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc, - const Descriptor &source, int64_t stream, bool hasStat, + const Descriptor &source, int64_t stream, bool *pinned, bool hasStat, const Descriptor *errMsg, const char *sourceFile, int sourceLine) { int stat{RTNAME(CUFAllocatableAllocate)( - alloc, stream, hasStat, errMsg, sourceFile, sourceLine)}; + alloc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)}; if (stat == StatOk) { Terminator terminator{sourceFile, sourceLine}; Fortran::runtime::DoFromSourceAssign( @@ -71,10 +76,10 @@ int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc, int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc, - const Descriptor &source, int64_t stream, bool hasStat, + const Descriptor &source, int64_t stream, bool *pinned, bool hasStat, const Descriptor *errMsg, const char *sourceFile, int sourceLine) { int stat{RTNAME(CUFAllocatableAllocateSync)( - alloc, stream, hasStat, errMsg, sourceFile, sourceLine)}; + alloc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)}; if (stat == StatOk) { Terminator terminator{sourceFile, sourceLine}; Fortran::runtime::DoFromSourceAssign( diff --git flang-rt/lib/cuda/pointer.cpp flang-rt/lib/cuda/pointer.cpp index d94e3958b915..c2559ecb9a6f 100644 --- flang-rt/lib/cuda/pointer.cpp +++ flang-rt/lib/cuda/pointer.cpp @@ -22,8 +22,9 @@ namespace Fortran::runtime::cuda { extern "C" { RT_EXT_API_GROUP_BEGIN -int RTDEF(CUFPointerAllocate)(Descriptor &desc, int64_t stream, bool hasStat, - const Descriptor *errMsg, const char *sourceFile, int sourceLine) { +int RTDEF(CUFPointerAllocate)(Descriptor &desc, int64_t stream, bool *pinned, + bool hasStat, const Descriptor *errMsg, const char *sourceFile, + int sourceLine) { if (desc.HasAddendum()) { Terminator terminator{sourceFile, sourceLine}; // TODO: This requires a bit more work to set the correct type descriptor @@ -34,14 +35,19 @@ int RTDEF(CUFPointerAllocate)(Descriptor &desc, int64_t stream, bool hasStat, // Perform the standard allocation. int stat{ RTNAME(PointerAllocate)(desc, hasStat, errMsg, sourceFile, sourceLine)}; + if (pinned) { + // Set pinned according to stat. More infrastructure is needed to set it + // closer to the actual allocation call. + *pinned = (stat == StatOk); + } return stat; } int RTDEF(CUFPointerAllocateSync)(Descriptor &desc, int64_t stream, - bool hasStat, const Descriptor *errMsg, const char *sourceFile, - int sourceLine) { + bool *pinned, bool hasStat, const Descriptor *errMsg, + const char *sourceFile, int sourceLine) { int stat{RTNAME(CUFPointerAllocate)( - desc, stream, hasStat, errMsg, sourceFile, sourceLine)}; + desc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)}; #ifndef RT_DEVICE_COMPILATION // Descriptor synchronization is only done when the allocation is done // from the host.
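
The signatures above thread a new bool *pinned out-parameter through every CUF allocation entry point; for now the runtime simply derives it from the allocation status. As a rough caller-side illustration only (the function below is hypothetical and not part of this patch; real call sites are generated by CUF lowering), the new parameter would be consumed like this:

  // Hypothetical sketch, not part of the patch: exercise the updated
  // CUFAllocatableAllocate entry point and inspect the reported pinning.
  #include "flang/Runtime/CUDA/allocatable.h"

  int allocateReportingPinned(Fortran::runtime::Descriptor &desc) {
    bool pinned = false;
    // stream = -1 means no stream; &pinned receives the pinning status,
    // which this patch derives from stat == StatOk.
    int stat = RTNAME(CUFAllocatableAllocate)(desc, /*stream=*/-1, &pinned,
                                              /*hasStat=*/true);
    if (stat == 0 && !pinned) {
      // Allocation succeeded but was not pinned; a caller could take a
      // fallback path here once finer-grained reporting is available.
    }
    return stat;
  }
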
@@ -56,10 +62,10 @@ int RTDEF(CUFPointerAllocateSync)(Descriptor &desc, int64_t stream, } int RTDEF(CUFPointerAllocateSource)(Descriptor &pointer, - const Descriptor &source, int64_t stream, bool hasStat, + const Descriptor &source, int64_t stream, bool *pinned, bool hasStat, const Descriptor *errMsg, const char *sourceFile, int sourceLine) { int stat{RTNAME(CUFPointerAllocate)( - pointer, stream, hasStat, errMsg, sourceFile, sourceLine)}; + pointer, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)}; if (stat == StatOk) { Terminator terminator{sourceFile, sourceLine}; Fortran::runtime::DoFromSourceAssign( @@ -69,10 +75,10 @@ int RTDEF(CUFPointerAllocateSource)(Descriptor &pointer, int RTDEF(CUFPointerAllocateSourceSync)(Descriptor &pointer, - const Descriptor &source, int64_t stream, bool hasStat, + const Descriptor &source, int64_t stream, bool *pinned, bool hasStat, const Descriptor *errMsg, const char *sourceFile, int sourceLine) { int stat{RTNAME(CUFPointerAllocateSync)( - pointer, stream, hasStat, errMsg, sourceFile, sourceLine)}; + pointer, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)}; if (stat == StatOk) { Terminator terminator{sourceFile, sourceLine}; Fortran::runtime::DoFromSourceAssign( diff --git flang-rt/lib/runtime/exceptions.cpp flang-rt/lib/runtime/exceptions.cpp index d676a0ad53aa..d25a67c8e9cb 100644 --- flang-rt/lib/runtime/exceptions.cpp +++ flang-rt/lib/runtime/exceptions.cpp @@ -11,13 +11,16 @@ #include "flang/Runtime/exceptions.h" #include "flang-rt/runtime/terminator.h" #include <cfenv> -#if defined(__aarch64__) && !defined(_WIN32) +#if defined(__aarch64__) && defined(__GLIBC__) #include <fpu_control.h> -#elif defined(__x86_64__) +#elif defined(__x86_64__) && !defined(_WIN32) #include <xmmintrin.h> #endif -// fenv.h may not define exception macros. +// File fenv.h usually, but not always, defines standard exceptions as both +// enumerator values and preprocessor #defines. Some x86 environments also +// define a nonstandard __FE_DENORM enumerator, but without a corresponding +// #define, which makes it difficult to determine whether it is present.
#ifndef FE_INVALID #define FE_INVALID 0 #endif @@ -33,6 +36,12 @@ #ifndef FE_INEXACT #define FE_INEXACT 0 #endif +#if FE_INVALID == 1 && FE_DIVBYZERO == 4 && FE_OVERFLOW == 8 && \ + FE_UNDERFLOW == 16 && FE_INEXACT == 32 +#define __FE_DENORM 2 +#else +#define __FE_DENORM 0 +#endif namespace Fortran::runtime { @@ -44,11 +53,7 @@ uint32_t RTNAME(MapException)(uint32_t excepts) { Terminator terminator{__FILE__, __LINE__}; static constexpr uint32_t v{FE_INVALID}; -#if __x86_64__ - static constexpr uint32_t s{__FE_DENORM}; // nonstandard, not a #define -#else - static constexpr uint32_t s{0}; -#endif + static constexpr uint32_t s{__FE_DENORM}; static constexpr uint32_t z{FE_DIVBYZERO}; static constexpr uint32_t o{FE_OVERFLOW}; static constexpr uint32_t u{FE_UNDERFLOW}; diff --git flang-rt/lib/runtime/io-api.cpp flang-rt/lib/runtime/io-api.cpp index 72041ae42946..0355734c67fc 100644 --- flang-rt/lib/runtime/io-api.cpp +++ flang-rt/lib/runtime/io-api.cpp @@ -887,7 +887,7 @@ bool IODEF(SetForm)(Cookie cookie, const char *keyword, std::size_t length) { io.GetIoErrorHandler().Crash( "SetForm() called after GetNewUnit() for an OPEN statement"); } - static const char *keywords[]{"FORMATTED", "UNFORMATTED", nullptr}; + static const char *keywords[]{"FORMATTED", "UNFORMATTED", "BINARY", nullptr}; switch (IdentifyValue(keyword, length, keywords)) { case 0: open->set_isUnformatted(false); @@ -895,6 +895,10 @@ bool IODEF(SetForm)(Cookie cookie, const char *keyword, std::size_t length) { case 1: open->set_isUnformatted(true); break; + case 2: // legacy FORM='BINARY' means an unformatted stream + open->set_isUnformatted(true); + open->set_access(Access::Stream); + break; default: open->SignalError(IostatErrorInKeyword, "Invalid FORM='%.*s'", static_cast<int>(length), keyword); diff --git flang/docs/Extensions.md flang/docs/Extensions.md index f25f0d1e0ca3..e84bceee5ca7 100644 --- flang/docs/Extensions.md +++ flang/docs/Extensions.md @@ -411,6 +411,8 @@ end is accepted before an array specification (`ch*3(2)`) as well as afterwards. * A zero field width is allowed for logical formatted output (`L0`). +* `OPEN(..., FORM='BINARY')` is accepted as a legacy synonym for + the standard `OPEN(..., FORM='UNFORMATTED', ACCESS='STREAM')`. 
### Extensions supported when enabled by options diff --git flang/examples/FeatureList/FeatureList.cpp flang/examples/FeatureList/FeatureList.cpp index 3a689c335c81..e35f120d8661 100644 --- flang/examples/FeatureList/FeatureList.cpp +++ flang/examples/FeatureList/FeatureList.cpp @@ -514,7 +514,6 @@ public: READ_FEATURE(OmpReductionClause) READ_FEATURE(OmpInReductionClause) READ_FEATURE(OmpReductionCombiner) - READ_FEATURE(OmpReductionCombiner::FunctionCombiner) READ_FEATURE(OmpReductionInitializerClause) READ_FEATURE(OmpReductionIdentifier) READ_FEATURE(OmpAllocateClause) diff --git flang/examples/FlangOmpReport/FlangOmpReportVisitor.cpp flang/examples/FlangOmpReport/FlangOmpReportVisitor.cpp index c78dd7f14e50..b0a632247fe1 100644 --- flang/examples/FlangOmpReport/FlangOmpReportVisitor.cpp +++ flang/examples/FlangOmpReport/FlangOmpReportVisitor.cpp @@ -112,6 +112,10 @@ std::string OpenMPCounterVisitor::getName(const OpenMPDeclarativeConstruct &c) { const CharBlock &source{o.source}; return normalize_construct_name(source.ToString()); }, + [&](const OmpMetadirectiveDirective &o) -> std::string { + const CharBlock &source{o.source}; + return normalize_construct_name(source.ToString()); + }, [&](const auto &o) -> std::string { const CharBlock &source{std::get<Verbatim>(o.t).source}; return normalize_construct_name(source.ToString()); diff --git flang/include/flang/Lower/Support/Utils.h flang/include/flang/Lower/Support/Utils.h index 1cc74521e22d..baaf644dd6ef 100644 --- flang/include/flang/Lower/Support/Utils.h +++ flang/include/flang/Lower/Support/Utils.h @@ -14,13 +14,13 @@ #define FORTRAN_LOWER_SUPPORT_UTILS_H #include "flang/Common/indirection.h" +#include "flang/Lower/IterationSpace.h" #include "flang/Parser/char-block.h" #include "flang/Semantics/tools.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/IR/BuiltinAttributes.h" #include "llvm/ADT/StringRef.h" -#include <cstdint> namespace Fortran::lower { using SomeExpr = Fortran::evaluate::Expr<Fortran::evaluate::SomeType>; @@ -87,555 +87,13 @@ A flatZip(const A &container1, const A &container2) { } namespace Fortran::lower { -// Fortran::evaluate::Expr are functional values organized like an AST. A -// Fortran::evaluate::Expr is meant to be moved and cloned. Using the front end -// tools can often cause copies and extra wrapper classes to be added to any -// Fortran::evaluate::Expr. These values should not be assumed or relied upon to -// have an *object* identity. They are deeply recursive, irregular structures -// built from a large number of classes which do not use inheritance and -// necessitate a large volume of boilerplate code as a result. -// -// Contrastingly, LLVM data structures make ubiquitous assumptions about an -// object's identity via pointers to the object. An object's location in memory -// is thus very often an identifying relation. - -// This class defines a hash computation of a Fortran::evaluate::Expr tree value -// so it can be used with llvm::DenseMap. The Fortran::evaluate::Expr need not -// have the same address. -class HashEvaluateExpr { -public: - // A Se::Symbol is the only part of an Fortran::evaluate::Expr with an - // identity property. 
- static unsigned getHashValue(const Fortran::semantics::Symbol &x) { - return static_cast<unsigned>(reinterpret_cast<std::intptr_t>(&x)); - } - template <typename A, bool COPY> - static unsigned getHashValue(const Fortran::common::Indirection<A, COPY> &x) { - return getHashValue(x.value()); - } - template <typename A> - static unsigned getHashValue(const std::optional<A> &x) { - if (x.has_value()) - return getHashValue(x.value()); - return 0u; - } - static unsigned getHashValue(const Fortran::evaluate::Subscript &x) { - return Fortran::common::visit( - [&](const auto &v) { return getHashValue(v); }, x.u); - } - static unsigned getHashValue(const Fortran::evaluate::Triplet &x) { - return getHashValue(x.lower()) - getHashValue(x.upper()) * 5u - - getHashValue(x.stride()) * 11u; - } - static unsigned getHashValue(const Fortran::evaluate::Component &x) { - return getHashValue(x.base()) * 83u - getHashValue(x.GetLastSymbol()); - } - static unsigned getHashValue(const Fortran::evaluate::ArrayRef &x) { - unsigned subs = 1u; - for (const Fortran::evaluate::Subscript &v : x.subscript()) - subs -= getHashValue(v); - return getHashValue(x.base()) * 89u - subs; - } - static unsigned getHashValue(const Fortran::evaluate::CoarrayRef &x) { - unsigned subs = 1u; - for (const Fortran::evaluate::Subscript &v : x.subscript()) - subs -= getHashValue(v); - unsigned cosubs = 3u; - for (const Fortran::evaluate::Expr<Fortran::evaluate::SubscriptInteger> &v : - x.cosubscript()) - cosubs -= getHashValue(v); - unsigned syms = 7u; - for (const Fortran::evaluate::SymbolRef &v : x.base()) - syms += getHashValue(v); - return syms * 97u - subs - cosubs + getHashValue(x.stat()) + 257u + - getHashValue(x.team()); - } - static unsigned getHashValue(const Fortran::evaluate::NamedEntity &x) { - if (x.IsSymbol()) - return getHashValue(x.GetFirstSymbol()) * 11u; - return getHashValue(x.GetComponent()) * 13u; - } - static unsigned getHashValue(const Fortran::evaluate::DataRef &x) { - return Fortran::common::visit( - [&](const auto &v) { return getHashValue(v); }, x.u); - } - static unsigned getHashValue(const Fortran::evaluate::ComplexPart &x) { - return getHashValue(x.complex()) - static_cast<unsigned>(x.part()); - } - template <Fortran::common::TypeCategory TC1, int KIND, - Fortran::common::TypeCategory TC2> - static unsigned getHashValue( - const Fortran::evaluate::Convert<Fortran::evaluate::Type<TC1, KIND>, TC2> - &x) { - return getHashValue(x.left()) - (static_cast<unsigned>(TC1) + 2u) - - (static_cast<unsigned>(KIND) + 5u); - } - template <int KIND> - static unsigned - getHashValue(const Fortran::evaluate::ComplexComponent<KIND> &x) { - return getHashValue(x.left()) - - (static_cast<unsigned>(x.isImaginaryPart) + 1u) * 3u; - } - template <typename T> - static unsigned getHashValue(const Fortran::evaluate::Parentheses<T> &x) { - return getHashValue(x.left()) * 17u; - } - template <Fortran::common::TypeCategory TC, int KIND> - static unsigned getHashValue( - const Fortran::evaluate::Negate<Fortran::evaluate::Type<TC, KIND>> &x) { - return getHashValue(x.left()) - (static_cast<unsigned>(TC) + 5u) - - (static_cast<unsigned>(KIND) + 7u); - } - template <Fortran::common::TypeCategory TC, int KIND> - static unsigned getHashValue( - const Fortran::evaluate::Add<Fortran::evaluate::Type<TC, KIND>> &x) { - return (getHashValue(x.left()) + getHashValue(x.right())) * 23u + - static_cast<unsigned>(TC) + static_cast<unsigned>(KIND); - } - template <Fortran::common::TypeCategory TC, int KIND> - static unsigned getHashValue( - const 
Fortran::evaluate::Subtract<Fortran::evaluate::Type<TC, KIND>> &x) { - return (getHashValue(x.left()) - getHashValue(x.right())) * 19u + - static_cast<unsigned>(TC) + static_cast<unsigned>(KIND); - } - template <Fortran::common::TypeCategory TC, int KIND> - static unsigned getHashValue( - const Fortran::evaluate::Multiply<Fortran::evaluate::Type<TC, KIND>> &x) { - return (getHashValue(x.left()) + getHashValue(x.right())) * 29u + - static_cast<unsigned>(TC) + static_cast<unsigned>(KIND); - } - template <Fortran::common::TypeCategory TC, int KIND> - static unsigned getHashValue( - const Fortran::evaluate::Divide<Fortran::evaluate::Type<TC, KIND>> &x) { - return (getHashValue(x.left()) - getHashValue(x.right())) * 31u + - static_cast<unsigned>(TC) + static_cast<unsigned>(KIND); - } - template <Fortran::common::TypeCategory TC, int KIND> - static unsigned getHashValue( - const Fortran::evaluate::Power<Fortran::evaluate::Type<TC, KIND>> &x) { - return (getHashValue(x.left()) - getHashValue(x.right())) * 37u + - static_cast<unsigned>(TC) + static_cast<unsigned>(KIND); - } - template <Fortran::common::TypeCategory TC, int KIND> - static unsigned getHashValue( - const Fortran::evaluate::Extremum<Fortran::evaluate::Type<TC, KIND>> &x) { - return (getHashValue(x.left()) + getHashValue(x.right())) * 41u + - static_cast<unsigned>(TC) + static_cast<unsigned>(KIND) + - static_cast<unsigned>(x.ordering) * 7u; - } - template <Fortran::common::TypeCategory TC, int KIND> - static unsigned getHashValue( - const Fortran::evaluate::RealToIntPower<Fortran::evaluate::Type<TC, KIND>> - &x) { - return (getHashValue(x.left()) - getHashValue(x.right())) * 43u + - static_cast<unsigned>(TC) + static_cast<unsigned>(KIND); - } - template <int KIND> - static unsigned - getHashValue(const Fortran::evaluate::ComplexConstructor<KIND> &x) { - return (getHashValue(x.left()) - getHashValue(x.right())) * 47u + - static_cast<unsigned>(KIND); - } - template <int KIND> - static unsigned getHashValue(const Fortran::evaluate::Concat<KIND> &x) { - return (getHashValue(x.left()) - getHashValue(x.right())) * 53u + - static_cast<unsigned>(KIND); - } - template <int KIND> - static unsigned getHashValue(const Fortran::evaluate::SetLength<KIND> &x) { - return (getHashValue(x.left()) - getHashValue(x.right())) * 59u + - static_cast<unsigned>(KIND); - } - static unsigned getHashValue(const Fortran::semantics::SymbolRef &sym) { - return getHashValue(sym.get()); - } - static unsigned getHashValue(const Fortran::evaluate::Substring &x) { - return 61u * - Fortran::common::visit( - [&](const auto &p) { return getHashValue(p); }, x.parent()) - - getHashValue(x.lower()) - (getHashValue(x.lower()) + 1u); - } - static unsigned - getHashValue(const Fortran::evaluate::StaticDataObject::Pointer &x) { - return llvm::hash_value(x->name()); - } - static unsigned getHashValue(const Fortran::evaluate::SpecificIntrinsic &x) { - return llvm::hash_value(x.name); - } - template <typename A> - static unsigned getHashValue(const Fortran::evaluate::Constant<A> &x) { - // FIXME: Should hash the content. 
- return 103u; - } - static unsigned getHashValue(const Fortran::evaluate::ActualArgument &x) { - if (const Fortran::evaluate::Symbol *sym = x.GetAssumedTypeDummy()) - return getHashValue(*sym); - return getHashValue(*x.UnwrapExpr()); - } - static unsigned - getHashValue(const Fortran::evaluate::ProcedureDesignator &x) { - return Fortran::common::visit( - [&](const auto &v) { return getHashValue(v); }, x.u); - } - static unsigned getHashValue(const Fortran::evaluate::ProcedureRef &x) { - unsigned args = 13u; - for (const std::optional<Fortran::evaluate::ActualArgument> &v : - x.arguments()) - args -= getHashValue(v); - return getHashValue(x.proc()) * 101u - args; - } - template <typename A> - static unsigned - getHashValue(const Fortran::evaluate::ArrayConstructor<A> &x) { - // FIXME: hash the contents. - return 127u; - } - static unsigned getHashValue(const Fortran::evaluate::ImpliedDoIndex &x) { - return llvm::hash_value(toStringRef(x.name).str()) * 131u; - } - static unsigned getHashValue(const Fortran::evaluate::TypeParamInquiry &x) { - return getHashValue(x.base()) * 137u - getHashValue(x.parameter()) * 3u; - } - static unsigned getHashValue(const Fortran::evaluate::DescriptorInquiry &x) { - return getHashValue(x.base()) * 139u - - static_cast<unsigned>(x.field()) * 13u + - static_cast<unsigned>(x.dimension()); - } - static unsigned - getHashValue(const Fortran::evaluate::StructureConstructor &x) { - // FIXME: hash the contents. - return 149u; - } - template <int KIND> - static unsigned getHashValue(const Fortran::evaluate::Not<KIND> &x) { - return getHashValue(x.left()) * 61u + static_cast<unsigned>(KIND); - } - template <int KIND> - static unsigned - getHashValue(const Fortran::evaluate::LogicalOperation<KIND> &x) { - unsigned result = getHashValue(x.left()) + getHashValue(x.right()); - return result * 67u + static_cast<unsigned>(x.logicalOperator) * 5u; - } - template <Fortran::common::TypeCategory TC, int KIND> - static unsigned getHashValue( - const Fortran::evaluate::Relational<Fortran::evaluate::Type<TC, KIND>> - &x) { - return (getHashValue(x.left()) + getHashValue(x.right())) * 71u + - static_cast<unsigned>(TC) + static_cast<unsigned>(KIND) + - static_cast<unsigned>(x.opr) * 11u; - } - template <typename A> - static unsigned getHashValue(const Fortran::evaluate::Expr<A> &x) { - return Fortran::common::visit( - [&](const auto &v) { return getHashValue(v); }, x.u); - } - static unsigned getHashValue( - const Fortran::evaluate::Relational<Fortran::evaluate::SomeType> &x) { - return Fortran::common::visit( - [&](const auto &v) { return getHashValue(v); }, x.u); - } - template <typename A> - static unsigned getHashValue(const Fortran::evaluate::Designator<A> &x) { - return Fortran::common::visit( - [&](const auto &v) { return getHashValue(v); }, x.u); - } - template <int BITS> - static unsigned - getHashValue(const Fortran::evaluate::value::Integer<BITS> &x) { - return static_cast<unsigned>(x.ToSInt()); - } - static unsigned getHashValue(const Fortran::evaluate::NullPointer &x) { - return ~179u; - } -}; +unsigned getHashValue(const Fortran::lower::SomeExpr *x); +unsigned getHashValue(const Fortran::lower::ExplicitIterSpace::ArrayBases &x); -// Define the is equals test for using Fortran::evaluate::Expr values with -// llvm::DenseMap. -class IsEqualEvaluateExpr { -public: - // A Se::Symbol is the only part of an Fortran::evaluate::Expr with an - // identity property. 
- static bool isEqual(const Fortran::semantics::Symbol &x, - const Fortran::semantics::Symbol &y) { - return isEqual(&x, &y); - } - static bool isEqual(const Fortran::semantics::Symbol *x, - const Fortran::semantics::Symbol *y) { - return x == y; - } - template <typename A, bool COPY> - static bool isEqual(const Fortran::common::Indirection<A, COPY> &x, - const Fortran::common::Indirection<A, COPY> &y) { - return isEqual(x.value(), y.value()); - } - template <typename A> - static bool isEqual(const std::optional<A> &x, const std::optional<A> &y) { - if (x.has_value() && y.has_value()) - return isEqual(x.value(), y.value()); - return !x.has_value() && !y.has_value(); - } - template <typename A> - static bool isEqual(const std::vector<A> &x, const std::vector<A> &y) { - if (x.size() != y.size()) - return false; - const std::size_t size = x.size(); - for (std::remove_const_t<decltype(size)> i = 0; i < size; ++i) - if (!isEqual(x[i], y[i])) - return false; - return true; - } - static bool isEqual(const Fortran::evaluate::Subscript &x, - const Fortran::evaluate::Subscript &y) { - return Fortran::common::visit( - [&](const auto &v, const auto &w) { return isEqual(v, w); }, x.u, y.u); - } - static bool isEqual(const Fortran::evaluate::Triplet &x, - const Fortran::evaluate::Triplet &y) { - return isEqual(x.lower(), y.lower()) && isEqual(x.upper(), y.upper()) && - isEqual(x.stride(), y.stride()); - } - static bool isEqual(const Fortran::evaluate::Component &x, - const Fortran::evaluate::Component &y) { - return isEqual(x.base(), y.base()) && - isEqual(x.GetLastSymbol(), y.GetLastSymbol()); - } - static bool isEqual(const Fortran::evaluate::ArrayRef &x, - const Fortran::evaluate::ArrayRef &y) { - return isEqual(x.base(), y.base()) && isEqual(x.subscript(), y.subscript()); - } - static bool isEqual(const Fortran::evaluate::CoarrayRef &x, - const Fortran::evaluate::CoarrayRef &y) { - return isEqual(x.base(), y.base()) && - isEqual(x.subscript(), y.subscript()) && - isEqual(x.cosubscript(), y.cosubscript()) && - isEqual(x.stat(), y.stat()) && isEqual(x.team(), y.team()); - } - static bool isEqual(const Fortran::evaluate::NamedEntity &x, - const Fortran::evaluate::NamedEntity &y) { - if (x.IsSymbol() && y.IsSymbol()) - return isEqual(x.GetFirstSymbol(), y.GetFirstSymbol()); - return !x.IsSymbol() && !y.IsSymbol() && - isEqual(x.GetComponent(), y.GetComponent()); - } - static bool isEqual(const Fortran::evaluate::DataRef &x, - const Fortran::evaluate::DataRef &y) { - return Fortran::common::visit( - [&](const auto &v, const auto &w) { return isEqual(v, w); }, x.u, y.u); - } - static bool isEqual(const Fortran::evaluate::ComplexPart &x, - const Fortran::evaluate::ComplexPart &y) { - return isEqual(x.complex(), y.complex()) && x.part() == y.part(); - } - template <typename A, Fortran::common::TypeCategory TC2> - static bool isEqual(const Fortran::evaluate::Convert<A, TC2> &x, - const Fortran::evaluate::Convert<A, TC2> &y) { - return isEqual(x.left(), y.left()); - } - template <int KIND> - static bool isEqual(const Fortran::evaluate::ComplexComponent<KIND> &x, - const Fortran::evaluate::ComplexComponent<KIND> &y) { - return isEqual(x.left(), y.left()) && - x.isImaginaryPart == y.isImaginaryPart; - } - template <typename T> - static bool isEqual(const Fortran::evaluate::Parentheses<T> &x, - const Fortran::evaluate::Parentheses<T> &y) { - return isEqual(x.left(), y.left()); - } - template <typename A> - static bool isEqual(const Fortran::evaluate::Negate<A> &x, - const Fortran::evaluate::Negate<A> &y) { - 
return isEqual(x.left(), y.left()); - } - template <typename A> - static bool isBinaryEqual(const A &x, const A &y) { - return isEqual(x.left(), y.left()) && isEqual(x.right(), y.right()); - } - template <typename A> - static bool isEqual(const Fortran::evaluate::Add<A> &x, - const Fortran::evaluate::Add<A> &y) { - return isBinaryEqual(x, y); - } - template <typename A> - static bool isEqual(const Fortran::evaluate::Subtract<A> &x, - const Fortran::evaluate::Subtract<A> &y) { - return isBinaryEqual(x, y); - } - template <typename A> - static bool isEqual(const Fortran::evaluate::Multiply<A> &x, - const Fortran::evaluate::Multiply<A> &y) { - return isBinaryEqual(x, y); - } - template <typename A> - static bool isEqual(const Fortran::evaluate::Divide<A> &x, - const Fortran::evaluate::Divide<A> &y) { - return isBinaryEqual(x, y); - } - template <typename A> - static bool isEqual(const Fortran::evaluate::Power<A> &x, - const Fortran::evaluate::Power<A> &y) { - return isBinaryEqual(x, y); - } - template <typename A> - static bool isEqual(const Fortran::evaluate::Extremum<A> &x, - const Fortran::evaluate::Extremum<A> &y) { - return isBinaryEqual(x, y); - } - template <typename A> - static bool isEqual(const Fortran::evaluate::RealToIntPower<A> &x, - const Fortran::evaluate::RealToIntPower<A> &y) { - return isBinaryEqual(x, y); - } - template <int KIND> - static bool isEqual(const Fortran::evaluate::ComplexConstructor<KIND> &x, - const Fortran::evaluate::ComplexConstructor<KIND> &y) { - return isBinaryEqual(x, y); - } - template <int KIND> - static bool isEqual(const Fortran::evaluate::Concat<KIND> &x, - const Fortran::evaluate::Concat<KIND> &y) { - return isBinaryEqual(x, y); - } - template <int KIND> - static bool isEqual(const Fortran::evaluate::SetLength<KIND> &x, - const Fortran::evaluate::SetLength<KIND> &y) { - return isBinaryEqual(x, y); - } - static bool isEqual(const Fortran::semantics::SymbolRef &x, - const Fortran::semantics::SymbolRef &y) { - return isEqual(x.get(), y.get()); - } - static bool isEqual(const Fortran::evaluate::Substring &x, - const Fortran::evaluate::Substring &y) { - return Fortran::common::visit( - [&](const auto &p, const auto &q) { return isEqual(p, q); }, - x.parent(), y.parent()) && - isEqual(x.lower(), y.lower()) && isEqual(x.upper(), y.upper()); - } - static bool isEqual(const Fortran::evaluate::StaticDataObject::Pointer &x, - const Fortran::evaluate::StaticDataObject::Pointer &y) { - return x->name() == y->name(); - } - static bool isEqual(const Fortran::evaluate::SpecificIntrinsic &x, - const Fortran::evaluate::SpecificIntrinsic &y) { - return x.name == y.name; - } - template <typename A> - static bool isEqual(const Fortran::evaluate::Constant<A> &x, - const Fortran::evaluate::Constant<A> &y) { - return x == y; - } - static bool isEqual(const Fortran::evaluate::ActualArgument &x, - const Fortran::evaluate::ActualArgument &y) { - if (const Fortran::evaluate::Symbol *xs = x.GetAssumedTypeDummy()) { - if (const Fortran::evaluate::Symbol *ys = y.GetAssumedTypeDummy()) - return isEqual(*xs, *ys); - return false; - } - return !y.GetAssumedTypeDummy() && - isEqual(*x.UnwrapExpr(), *y.UnwrapExpr()); - } - static bool isEqual(const Fortran::evaluate::ProcedureDesignator &x, - const Fortran::evaluate::ProcedureDesignator &y) { - return Fortran::common::visit( - [&](const auto &v, const auto &w) { return isEqual(v, w); }, x.u, y.u); - } - static bool isEqual(const Fortran::evaluate::ProcedureRef &x, - const Fortran::evaluate::ProcedureRef &y) { - return 
isEqual(x.proc(), y.proc()) && isEqual(x.arguments(), y.arguments()); - } - template <typename A> - static bool isEqual(const Fortran::evaluate::ArrayConstructor<A> &x, - const Fortran::evaluate::ArrayConstructor<A> &y) { - llvm::report_fatal_error("not implemented"); - } - static bool isEqual(const Fortran::evaluate::ImpliedDoIndex &x, - const Fortran::evaluate::ImpliedDoIndex &y) { - return toStringRef(x.name) == toStringRef(y.name); - } - static bool isEqual(const Fortran::evaluate::TypeParamInquiry &x, - const Fortran::evaluate::TypeParamInquiry &y) { - return isEqual(x.base(), y.base()) && isEqual(x.parameter(), y.parameter()); - } - static bool isEqual(const Fortran::evaluate::DescriptorInquiry &x, - const Fortran::evaluate::DescriptorInquiry &y) { - return isEqual(x.base(), y.base()) && x.field() == y.field() && - x.dimension() == y.dimension(); - } - static bool isEqual(const Fortran::evaluate::StructureConstructor &x, - const Fortran::evaluate::StructureConstructor &y) { - const auto &xValues = x.values(); - const auto &yValues = y.values(); - if (xValues.size() != yValues.size()) - return false; - if (x.derivedTypeSpec() != y.derivedTypeSpec()) - return false; - for (const auto &[xSymbol, xValue] : xValues) { - auto yIt = yValues.find(xSymbol); - // This should probably never happen, since the derived type - // should be the same. - if (yIt == yValues.end()) - return false; - if (!isEqual(xValue, yIt->second)) - return false; - } - return true; - } - template <int KIND> - static bool isEqual(const Fortran::evaluate::Not<KIND> &x, - const Fortran::evaluate::Not<KIND> &y) { - return isEqual(x.left(), y.left()); - } - template <int KIND> - static bool isEqual(const Fortran::evaluate::LogicalOperation<KIND> &x, - const Fortran::evaluate::LogicalOperation<KIND> &y) { - return isEqual(x.left(), y.left()) && isEqual(x.right(), y.right()); - } - template <typename A> - static bool isEqual(const Fortran::evaluate::Relational<A> &x, - const Fortran::evaluate::Relational<A> &y) { - return isEqual(x.left(), y.left()) && isEqual(x.right(), y.right()); - } - template <typename A> - static bool isEqual(const Fortran::evaluate::Expr<A> &x, - const Fortran::evaluate::Expr<A> &y) { - return Fortran::common::visit( - [&](const auto &v, const auto &w) { return isEqual(v, w); }, x.u, y.u); - } - static bool - isEqual(const Fortran::evaluate::Relational<Fortran::evaluate::SomeType> &x, - const Fortran::evaluate::Relational<Fortran::evaluate::SomeType> &y) { - return Fortran::common::visit( - [&](const auto &v, const auto &w) { return isEqual(v, w); }, x.u, y.u); - } - template <typename A> - static bool isEqual(const Fortran::evaluate::Designator<A> &x, - const Fortran::evaluate::Designator<A> &y) { - return Fortran::common::visit( - [&](const auto &v, const auto &w) { return isEqual(v, w); }, x.u, y.u); - } - template <int BITS> - static bool isEqual(const Fortran::evaluate::value::Integer<BITS> &x, - const Fortran::evaluate::value::Integer<BITS> &y) { - return x == y; - } - static bool isEqual(const Fortran::evaluate::NullPointer &x, - const Fortran::evaluate::NullPointer &y) { - return true; - } - template <typename A, typename B, - std::enable_if_t<!std::is_same_v<A, B>, bool> = true> - static bool isEqual(const A &, const B &) { - return false; - } -}; - -static inline unsigned getHashValue(const Fortran::lower::SomeExpr *x) { - return HashEvaluateExpr::getHashValue(*x); -} - -static bool isEqual(const Fortran::lower::SomeExpr *x, - const Fortran::lower::SomeExpr *y); +bool isEqual(const 
Fortran::lower::SomeExpr *x, + const Fortran::lower::SomeExpr *y); +bool isEqual(const Fortran::lower::ExplicitIterSpace::ArrayBases &x, + const Fortran::lower::ExplicitIterSpace::ArrayBases &y); } // end namespace Fortran::lower // DenseMapInfo for pointers to Fortran::lower::SomeExpr. @@ -658,17 +116,4 @@ struct DenseMapInfo<const Fortran::lower::SomeExpr *> { }; } // namespace llvm -namespace Fortran::lower { -static inline bool isEqual(const Fortran::lower::SomeExpr *x, - const Fortran::lower::SomeExpr *y) { - const auto *empty = - llvm::DenseMapInfo<const Fortran::lower::SomeExpr *>::getEmptyKey(); - const auto *tombstone = - llvm::DenseMapInfo<const Fortran::lower::SomeExpr *>::getTombstoneKey(); - if (x == empty || y == empty || x == tombstone || y == tombstone) - return x == y; - return x == y || IsEqualEvaluateExpr::isEqual(*x, *y); -} -} // end namespace Fortran::lower - #endif // FORTRAN_LOWER_SUPPORT_UTILS_H diff --git flang/include/flang/Optimizer/Builder/HLFIRTools.h flang/include/flang/Optimizer/Builder/HLFIRTools.h index 0684ad0f926e..8b1235b50cc6 100644 --- flang/include/flang/Optimizer/Builder/HLFIRTools.h +++ flang/include/flang/Optimizer/Builder/HLFIRTools.h @@ -125,7 +125,7 @@ public: bool isSimplyContiguous() const { // If this can be described without a fir.box in FIR, this must // be contiguous. - if (!hlfir::isBoxAddressOrValueType(getFirBase().getType())) + if (!hlfir::isBoxAddressOrValueType(getFirBase().getType()) || isScalar()) return true; // Otherwise, if this entity has a visible declaration in FIR, // or is the dereference of an allocatable or contiguous pointer @@ -150,10 +150,7 @@ public: return base.getDefiningOp<fir::FortranVariableOpInterface>(); } - bool isOptional() const { - auto varIface = getIfVariableInterface(); - return varIface ? varIface.isOptional() : false; - } + bool mayBeOptional() const; bool isParameter() const { auto varIface = getIfVariableInterface(); @@ -210,7 +207,8 @@ public: using CleanupFunction = std::function<void()>; std::pair<fir::ExtendedValue, std::optional<CleanupFunction>> translateToExtendedValue(mlir::Location loc, fir::FirOpBuilder &builder, - Entity entity, bool contiguousHint = false); + Entity entity, bool contiguousHint = false, + bool keepScalarOptionalBoxed = false); /// Function to translate FortranVariableOpInterface to fir::ExtendedValue. 
/// It may generate IR to unbox fir.boxchar, but has otherwise no side effects diff --git flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h index be07e8d2a939..1ffc354d6b80 100644 --- flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h +++ flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h @@ -401,6 +401,13 @@ constexpr TypeBuilderFunc getModel<bool &>() { }; } template <> +constexpr TypeBuilderFunc getModel<bool *>() { + return [](mlir::MLIRContext *context) -> mlir::Type { + TypeBuilderFunc f{getModel<bool>()}; + return fir::ReferenceType::get(f(context)); + }; +} +template <> constexpr TypeBuilderFunc getModel<unsigned short>() { return [](mlir::MLIRContext *context) -> mlir::Type { return mlir::IntegerType::get( diff --git flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td index a270e69b3941..c1021da0cfb2 100644 --- flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td +++ flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td @@ -207,7 +207,9 @@ def cuf_KernelLaunchOp : cuf_Op<"kernel_launch", [CallOpInterface, I32:$block_z, Optional<I32>:$bytes, Optional<I32>:$stream, - Variadic<AnyType>:$args + Variadic<AnyType>:$args, + OptionalAttr<DictArrayAttr>:$arg_attrs, + OptionalAttr<DictArrayAttr>:$res_attrs ); let assemblyFormat = [{ diff --git flang/include/flang/Optimizer/Dialect/FIROps.td flang/include/flang/Optimizer/Dialect/FIROps.td index 5f0f0b48e892..8dbc9df9f553 100644 --- flang/include/flang/Optimizer/Dialect/FIROps.td +++ flang/include/flang/Optimizer/Dialect/FIROps.td @@ -2432,6 +2432,8 @@ def fir_CallOp : fir_Op<"call", let arguments = (ins OptionalAttr<SymbolRefAttr>:$callee, Variadic<AnyType>:$args, + OptionalAttr<DictArrayAttr>:$arg_attrs, + OptionalAttr<DictArrayAttr>:$res_attrs, OptionalAttr<fir_FortranProcedureFlagsAttr>:$procedure_attrs, DefaultValuedAttr<Arith_FastMathAttr, "::mlir::arith::FastMathFlags::none">:$fastmath @@ -2518,6 +2520,8 @@ def fir_DispatchOp : fir_Op<"dispatch", []> { fir_ClassType:$object, Variadic<AnyType>:$args, OptionalAttr<I32Attr>:$pass_arg_pos, + OptionalAttr<DictArrayAttr>:$arg_attrs, + OptionalAttr<DictArrayAttr>:$res_attrs, OptionalAttr<fir_FortranProcedureFlagsAttr>:$procedure_attrs ); diff --git flang/include/flang/Optimizer/Support/DataLayout.h flang/include/flang/Optimizer/Support/DataLayout.h index 6072425b7d63..957ea99162c5 100644 --- flang/include/flang/Optimizer/Support/DataLayout.h +++ flang/include/flang/Optimizer/Support/DataLayout.h @@ -18,10 +18,14 @@ namespace mlir { class ModuleOp; -} +namespace gpu { +class GPUModuleOp; +} // namespace gpu +} // namespace mlir + namespace llvm { class DataLayout; -} +} // namespace llvm namespace fir::support { /// Create an mlir::DataLayoutSpecInterface attribute from an llvm::DataLayout /// the llvm::DataLayout on the module. /// These attributes are replaced if they were already set. void setMLIRDataLayout(mlir::ModuleOp mlirModule, const llvm::DataLayout &dl); +void setMLIRDataLayout(mlir::gpu::GPUModuleOp mlirModule, + const llvm::DataLayout &dl); /// Create an mlir::DataLayoutSpecInterface from the llvm.data_layout attribute /// if one is provided. If such an attribute is not available, create a default @@ -37,6 +43,8 @@ void setMLIRDataLayoutFromAttributes(mlir::ModuleOp mlirModule, /// nothing.
void setMLIRDataLayoutFromAttributes(mlir::ModuleOp mlirModule, bool allowDefaultLayout); +void setMLIRDataLayoutFromAttributes(mlir::gpu::GPUModuleOp mlirModule, + bool allowDefaultLayout); /// Create mlir::DataLayout from the data layout information on the /// mlir::Module. Creates the data layout information attributes with @@ -44,7 +52,12 @@ void setMLIRDataLayoutFromAttributes(mlir::ModuleOp mlirModule, /// information is present at all and \p allowDefaultLayout is false, returns /// std::nullopt. std::optional<mlir::DataLayout> -getOrSetDataLayout(mlir::ModuleOp mlirModule, bool allowDefaultLayout = false); +getOrSetMLIRDataLayout(mlir::ModuleOp mlirModule, + bool allowDefaultLayout = false); +std::optional<mlir::DataLayout> +getOrSetMLIRDataLayout(mlir::gpu::GPUModuleOp mlirModule, + bool allowDefaultLayout = false); + } // namespace fir::support #endif // FORTRAN_OPTIMIZER_SUPPORT_DATALAYOUT_H diff --git flang/include/flang/Parser/dump-parse-tree.h flang/include/flang/Parser/dump-parse-tree.h index a43b32daa2bd..21ee1d051784 100644 --- flang/include/flang/Parser/dump-parse-tree.h +++ flang/include/flang/Parser/dump-parse-tree.h @@ -477,6 +477,12 @@ public: NODE(parser, NullInit) NODE(parser, ObjectDecl) NODE(parser, OldParameterStmt) + NODE(parser, OmpTypeSpecifier) + NODE(parser, OmpTypeNameList) + NODE(parser, OmpLocator) + NODE(parser, OmpLocatorList) + NODE(parser, OmpReductionSpecifier) + NODE(parser, OmpArgument) NODE(parser, OmpMetadirectiveDirective) NODE(parser, OmpMatchClause) NODE(parser, OmpOtherwiseClause) @@ -542,7 +548,7 @@ public: NODE(parser, OmpDeclareTargetSpecifier) NODE(parser, OmpDeclareTargetWithClause) NODE(parser, OmpDeclareTargetWithList) - NODE(parser, OmpDeclareMapperSpecifier) + NODE(parser, OmpMapperSpecifier) NODE(parser, OmpDefaultClause) NODE_ENUM(OmpDefaultClause, DataSharingAttribute) NODE(parser, OmpVariableCategory) @@ -625,7 +631,6 @@ public: NODE(parser, OmpReductionCombiner) NODE(parser, OmpTaskReductionClause) NODE(OmpTaskReductionClause, Modifier) - NODE(OmpReductionCombiner, FunctionCombiner) NODE(parser, OmpReductionInitializerClause) NODE(parser, OmpReductionIdentifier) NODE(parser, OmpAllocateClause) diff --git flang/include/flang/Parser/parse-tree.h flang/include/flang/Parser/parse-tree.h index 4df5760f08c5..c3a02fca5ade 100644 --- flang/include/flang/Parser/parse-tree.h +++ flang/include/flang/Parser/parse-tree.h @@ -3457,15 +3457,7 @@ WRAPPER_CLASS(PauseStmt, std::optional<StopCode>); // --- Common definitions struct OmpClause; -struct OmpClauseList; - -struct OmpDirectiveSpecification { - TUPLE_CLASS_BOILERPLATE(OmpDirectiveSpecification); - std::tuple<llvm::omp::Directive, - std::optional<common::Indirection<OmpClauseList>>> - t; - CharBlock source; -}; +struct OmpDirectiveSpecification; // 2.1 Directives or clauses may accept a list or extended-list. // A list item is a variable, array section or common block name (enclosed @@ -3478,15 +3470,76 @@ struct OmpObject { WRAPPER_CLASS(OmpObjectList, std::list<OmpObject>); -#define MODIFIER_BOILERPLATE(...) \ - struct Modifier { \ - using Variant = std::variant<__VA_ARGS__>; \ - UNION_CLASS_BOILERPLATE(Modifier); \ - CharBlock source; \ - Variant u; \ - } +// Ref: [4.5:201-207], [5.0:293-299], [5.1:325-331], [5.2:124] +// +// reduction-identifier -> +// base-language-identifier | // since 4.5 +// - | // since 4.5, until 5.2 +// + | * | .AND. | .OR. | .EQV. | .NEQV. 
| // since 4.5 +// MIN | MAX | IAND | IOR | IEOR // since 4.5 +struct OmpReductionIdentifier { + UNION_CLASS_BOILERPLATE(OmpReductionIdentifier); + std::variant<DefinedOperator, ProcedureDesignator> u; +}; -#define MODIFIERS() std::optional<std::list<Modifier>> +// Ref: [4.5:222:6], [5.0:305:27], [5.1:337:19], [5.2:126:3-4], [6.0:240:27-28] +// +// combiner-expression -> // since 4.5 +// assignment-statement | +// function-reference +struct OmpReductionCombiner { + UNION_CLASS_BOILERPLATE(OmpReductionCombiner); + std::variant<AssignmentStmt, FunctionReference> u; +}; + +inline namespace arguments { +struct OmpTypeSpecifier { + UNION_CLASS_BOILERPLATE(OmpTypeSpecifier); + std::variant<TypeSpec, DeclarationTypeSpec> u; +}; + +WRAPPER_CLASS(OmpTypeNameList, std::list<OmpTypeSpecifier>); + +struct OmpLocator { + UNION_CLASS_BOILERPLATE(OmpLocator); + std::variant<OmpObject, FunctionReference> u; +}; + +WRAPPER_CLASS(OmpLocatorList, std::list<OmpLocator>); + +// Ref: [5.0:326:10-16], [5.1:359:5-11], [5.2:163:2-7], [6.0:293:16-21] +// +// mapper-specifier -> +// [mapper-identifier :] type :: var | // since 5.0 +// DEFAULT type :: var +struct OmpMapperSpecifier { + // Absent mapper-identifier is equivalent to DEFAULT. + TUPLE_CLASS_BOILERPLATE(OmpMapperSpecifier); + std::tuple<std::optional<Name>, TypeSpec, Name> t; +}; + +// Ref: [4.5:222:1-5], [5.0:305:20-27], [5.1:337:11-19], [5.2:139:18-23], +// [6.0:260:16-20] +// +// reduction-specifier -> +// reduction-identifier : typename-list +// : combiner-expression // since 4.5, until 5.2 +// reduction-identifier : typename-list // since 6.0 +struct OmpReductionSpecifier { + TUPLE_CLASS_BOILERPLATE(OmpReductionSpecifier); + std::tuple<OmpReductionIdentifier, OmpTypeNameList, + std::optional<OmpReductionCombiner>> + t; +}; + +struct OmpArgument { + CharBlock source; + UNION_CLASS_BOILERPLATE(OmpArgument); + std::variant<OmpLocator, // {variable, extended, locator}-list-item + OmpMapperSpecifier, OmpReductionSpecifier> + u; +}; +} // namespace arguments inline namespace traits { // trait-property-name -> @@ -3572,6 +3625,7 @@ struct OmpTraitProperty { // Trait-set-selectors: // [D]evice, [T]arget_device, [C]onstruct, [I]mplementation, [U]ser. struct OmpTraitSelectorName { + std::string ToString() const; CharBlock source; UNION_CLASS_BOILERPLATE(OmpTraitSelectorName); ENUM_CLASS(Value, Arch, Atomic_Default_Mem_Order, Condition, Device_Num, @@ -3596,6 +3650,7 @@ struct OmpTraitSelector { // CONSTRUCT | DEVICE | IMPLEMENTATION | USER | // since 5.0 // TARGET_DEVICE // since 5.1 struct OmpTraitSetSelectorName { + std::string ToString() const; CharBlock source; ENUM_CLASS(Value, Construct, Device, Implementation, Target_Device, User) WRAPPER_CLASS_BOILERPLATE(OmpTraitSetSelectorName, Value); @@ -3618,6 +3673,16 @@ struct OmpContextSelectorSpecification { // Modifier }; } // namespace traits +#define MODIFIER_BOILERPLATE(...) \ + struct Modifier { \ + using Variant = std::variant<__VA_ARGS__>; \ + UNION_CLASS_BOILERPLATE(Modifier); \ + CharBlock source; \ + Variant u; \ + } + +#define MODIFIERS() std::optional<std::list<Modifier>> + inline namespace modifier { // For uniformity, in all keyword modifiers the name of the type defined // by ENUM_CLASS is "Value", e.g. 
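
The OmpArgument node introduced above is a plain tagged union over OmpLocator, OmpMapperSpecifier, and OmpReductionSpecifier, so consumers dispatch on it with Fortran::common::visit, the same pattern the FlangOmpReportVisitor change earlier in this patch uses. A minimal sketch (the classify helper below is hypothetical, not part of this patch):

  // Hypothetical sketch, not part of the patch: dispatch over the three
  // OmpArgument alternatives using common::visit and the visitors helper.
  #include "flang/Parser/parse-tree.h"

  static const char *classifyOmpArgument(
      const Fortran::parser::OmpArgument &arg) {
    return Fortran::common::visit(
        Fortran::common::visitors{
            [](const Fortran::parser::OmpLocator &) { return "locator"; },
            [](const Fortran::parser::OmpMapperSpecifier &) {
              return "mapper-specifier";
            },
            [](const Fortran::parser::OmpReductionSpecifier &) {
              return "reduction-specifier";
            },
        },
        arg.u);
  }
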
@@ -3830,18 +3895,6 @@ struct OmpPrescriptiveness { WRAPPER_CLASS_BOILERPLATE(OmpPrescriptiveness, Value); }; -// Ref: [4.5:201-207], [5.0:293-299], [5.1:325-331], [5.2:124] -// -// reduction-identifier -> -// base-language-identifier | // since 4.5 -// - | // since 4.5, until 5.2 -// + | * | .AND. | .OR. | .EQV. | .NEQV. | // since 4.5 -// MIN | MAX | IAND | IOR | IEOR // since 4.5 -struct OmpReductionIdentifier { - UNION_CLASS_BOILERPLATE(OmpReductionIdentifier); - std::variant<DefinedOperator, ProcedureDesignator> u; -}; - // Ref: [5.0:300-302], [5.1:332-334], [5.2:134-137] // // reduction-modifier -> @@ -3984,7 +4037,9 @@ struct OmpBindClause { struct OmpDefaultClause { ENUM_CLASS(DataSharingAttribute, Private, Firstprivate, Shared, None) UNION_CLASS_BOILERPLATE(OmpDefaultClause); - std::variant<DataSharingAttribute, OmpDirectiveSpecification> u; + std::variant<DataSharingAttribute, + common::Indirection<OmpDirectiveSpecification>> + u; }; // Ref: [4.5:103-107], [5.0:324-325], [5.1:357-358], [5.2:161-162] @@ -4249,8 +4304,8 @@ struct OmpOrderClause { // otherwise-clause -> // OTHERWISE ([directive-specification])] // since 5.2 struct OmpOtherwiseClause { - WRAPPER_CLASS_BOILERPLATE( - OmpOtherwiseClause, std::optional<OmpDirectiveSpecification>); + WRAPPER_CLASS_BOILERPLATE(OmpOtherwiseClause, + std::optional<common::Indirection<OmpDirectiveSpecification>>); }; // Ref: [4.5:46-50], [5.0:74-78], [5.1:92-96], [5.2:229-230] @@ -4346,7 +4401,9 @@ struct OmpUpdateClause { struct OmpWhenClause { TUPLE_CLASS_BOILERPLATE(OmpWhenClause); MODIFIER_BOILERPLATE(OmpContextSelector); - std::tuple<MODIFIERS(), std::optional<OmpDirectiveSpecification>> t; + std::tuple<MODIFIERS(), + std::optional<common::Indirection<OmpDirectiveSpecification>>> + t; }; // OpenMP Clauses @@ -4373,6 +4430,14 @@ struct OmpClauseList { // --- Directives and constructs +struct OmpDirectiveSpecification { + CharBlock source; + TUPLE_CLASS_BOILERPLATE(OmpDirectiveSpecification); + std::tuple<llvm::omp::Directive, std::optional<std::list<OmpArgument>>, + std::optional<OmpClauseList>> + t; +}; + struct OmpMetadirectiveDirective { TUPLE_CLASS_BOILERPLATE(OmpMetadirectiveDirective); std::tuple<OmpClauseList> t; @@ -4473,27 +4538,16 @@ struct OpenMPDeclareTargetConstruct { std::tuple<Verbatim, OmpDeclareTargetSpecifier> t; }; -struct OmpDeclareMapperSpecifier { - TUPLE_CLASS_BOILERPLATE(OmpDeclareMapperSpecifier); - std::tuple<std::optional<Name>, TypeSpec, Name> t; -}; - // OMP v5.2: 5.8.8 // declare-mapper -> DECLARE MAPPER ([mapper-name :] type :: var) map-clauses struct OpenMPDeclareMapperConstruct { TUPLE_CLASS_BOILERPLATE(OpenMPDeclareMapperConstruct); CharBlock source; - std::tuple<Verbatim, OmpDeclareMapperSpecifier, OmpClauseList> t; + std::tuple<Verbatim, OmpMapperSpecifier, OmpClauseList> t; }; // 2.16 declare-reduction -> DECLARE REDUCTION (reduction-identifier : type-list // : combiner) [initializer-clause] -struct OmpReductionCombiner { - UNION_CLASS_BOILERPLATE(OmpReductionCombiner); - WRAPPER_CLASS(FunctionCombiner, Call); - std::variant<AssignmentStmt, FunctionCombiner> u; -}; - WRAPPER_CLASS(OmpReductionInitializerClause, Expr); struct OpenMPDeclareReductionConstruct { @@ -4538,8 +4592,8 @@ struct OpenMPDeclarativeConstruct { CharBlock source; std::variant<OpenMPDeclarativeAllocate, OpenMPDeclareMapperConstruct, OpenMPDeclareReductionConstruct, OpenMPDeclareSimdConstruct, - OpenMPDeclareTargetConstruct, OpenMPThreadprivate, - OpenMPRequiresConstruct, OpenMPUtilityConstruct> + OpenMPThreadprivate, 
OpenMPRequiresConstruct, OpenMPUtilityConstruct, + OpenMPDeclareTargetConstruct, OmpMetadirectiveDirective> u; }; diff --git flang/include/flang/Runtime/CUDA/allocatable.h flang/include/flang/Runtime/CUDA/allocatable.h index 0a96f73b6be4..822f2d4a2b29 100644 --- flang/include/flang/Runtime/CUDA/allocatable.h +++ flang/include/flang/Runtime/CUDA/allocatable.h @@ -18,28 +18,30 @@ extern "C" { /// Perform allocation of the descriptor. int RTDECL(CUFAllocatableAllocate)(Descriptor &, int64_t stream = -1, - bool hasStat = false, const Descriptor *errMsg = nullptr, - const char *sourceFile = nullptr, int sourceLine = 0); + bool *pinned = nullptr, bool hasStat = false, + const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr, + int sourceLine = 0); /// Perform allocation of the descriptor with synchronization of it when /// necessary. int RTDECL(CUFAllocatableAllocateSync)(Descriptor &, int64_t stream = -1, - bool hasStat = false, const Descriptor *errMsg = nullptr, - const char *sourceFile = nullptr, int sourceLine = 0); + bool *pinned = nullptr, bool hasStat = false, + const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr, + int sourceLine = 0); /// Perform allocation of the descriptor without synchronization. Assign data /// from source. int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc, - const Descriptor &source, int64_t stream = -1, bool hasStat = false, - const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr, - int sourceLine = 0); + const Descriptor &source, int64_t stream = -1, bool *pinned = nullptr, + bool hasStat = false, const Descriptor *errMsg = nullptr, + const char *sourceFile = nullptr, int sourceLine = 0); /// Perform allocation of the descriptor with synchronization of it when /// necessary. Assign data from source. int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc, - const Descriptor &source, int64_t stream = -1, bool hasStat = false, - const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr, - int sourceLine = 0); + const Descriptor &source, int64_t stream = -1, bool *pinned = nullptr, + bool hasStat = false, const Descriptor *errMsg = nullptr, + const char *sourceFile = nullptr, int sourceLine = 0); /// Perform deallocation of the descriptor with synchronization of it when /// necessary. diff --git flang/include/flang/Runtime/CUDA/pointer.h flang/include/flang/Runtime/CUDA/pointer.h index 78c7a1a92b7e..7fbd8f8e061f 100644 --- flang/include/flang/Runtime/CUDA/pointer.h +++ flang/include/flang/Runtime/CUDA/pointer.h @@ -18,28 +18,30 @@ extern "C" { /// Perform allocation of the descriptor. int RTDECL(CUFPointerAllocate)(Descriptor &, int64_t stream = -1, - bool hasStat = false, const Descriptor *errMsg = nullptr, - const char *sourceFile = nullptr, int sourceLine = 0); + bool *pinned = nullptr, bool hasStat = false, + const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr, + int sourceLine = 0); /// Perform allocation of the descriptor with synchronization of it when /// necessary. int RTDECL(CUFPointerAllocateSync)(Descriptor &, int64_t stream = -1, - bool hasStat = false, const Descriptor *errMsg = nullptr, - const char *sourceFile = nullptr, int sourceLine = 0); + bool *pinned = nullptr, bool hasStat = false, + const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr, + int sourceLine = 0); /// Perform allocation of the descriptor without synchronization. Assign data /// from source. 
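+/// (Presumed semantics of the new parameter, mirroring CUDA Fortran
+/// ALLOCATE(..., PINNED=): when non-null, `pinned` receives whether the
+/// allocation was satisfied with page-locked host memory.)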
int RTDEF(CUFPointerAllocateSource)(Descriptor &pointer, - const Descriptor &source, int64_t stream = -1, bool hasStat = false, - const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr, - int sourceLine = 0); + const Descriptor &source, int64_t stream = -1, bool *pinned = nullptr, + bool hasStat = false, const Descriptor *errMsg = nullptr, + const char *sourceFile = nullptr, int sourceLine = 0); /// Perform allocation of the descriptor with synchronization of it when /// necessary. Assign data from source. int RTDEF(CUFPointerAllocateSourceSync)(Descriptor &pointer, - const Descriptor &source, int64_t stream = -1, bool hasStat = false, - const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr, - int sourceLine = 0); + const Descriptor &source, int64_t stream = -1, bool *pinned = nullptr, + bool hasStat = false, const Descriptor *errMsg = nullptr, + const char *sourceFile = nullptr, int sourceLine = 0); } // extern "C" diff --git flang/include/flang/Semantics/expression.h flang/include/flang/Semantics/expression.h index 7b68ea728cbd..ce253e6f52e8 100644 --- flang/include/flang/Semantics/expression.h +++ flang/include/flang/Semantics/expression.h @@ -348,7 +348,8 @@ private: bool CheckDataRef(const DataRef &); // ditto std::optional<Expr<SubscriptInteger>> GetSubstringBound( const std::optional<parser::ScalarIntExpr> &); - MaybeExpr AnalyzeDefinedOp(const parser::Name &, ActualArguments &&); + MaybeExpr AnalyzeDefinedOp( + const parser::Name &, ActualArguments &&, const Symbol *&); MaybeExpr FixMisparsedSubstring(const parser::Designator &); struct CalleeAndArguments { diff --git flang/include/flang/Semantics/openmp-modifiers.h flang/include/flang/Semantics/openmp-modifiers.h index 5d5c5e97faf4..7cdbf65adebe 100644 --- flang/include/flang/Semantics/openmp-modifiers.h +++ flang/include/flang/Semantics/openmp-modifiers.h @@ -72,6 +72,7 @@ DECLARE_DESCRIPTOR(parser::OmpAlignModifier); DECLARE_DESCRIPTOR(parser::OmpAllocatorComplexModifier); DECLARE_DESCRIPTOR(parser::OmpAllocatorSimpleModifier); DECLARE_DESCRIPTOR(parser::OmpChunkModifier); +DECLARE_DESCRIPTOR(parser::OmpContextSelector); DECLARE_DESCRIPTOR(parser::OmpDependenceType); DECLARE_DESCRIPTOR(parser::OmpDeviceModifier); DECLARE_DESCRIPTOR(parser::OmpDirectiveNameModifier); diff --git flang/include/flang/Support/Fortran-features.h flang/include/flang/Support/Fortran-features.h index a4bb6a55812b..aeaeb44f9039 100644 --- flang/include/flang/Support/Fortran-features.h +++ flang/include/flang/Support/Fortran-features.h @@ -71,7 +71,7 @@ ENUM_CLASS(UsageWarning, Portability, PointerToUndefinable, ZeroDoStep, UnusedForallIndex, OpenMPUsage, DataLength, IgnoredDirective, HomonymousSpecific, HomonymousResult, IgnoredIntrinsicFunctionType, PreviousScalarUse, RedeclaredInaccessibleComponent, ImplicitShared, - IndexVarRedefinition, IncompatibleImplicitInterfaces, BadTypeForTarget, + IndexVarRedefinition, IncompatibleImplicitInterfaces, VectorSubscriptFinalization, UndefinedFunctionResult, UselessIomsg, MismatchingDummyProcedure, SubscriptedEmptyArray, UnsignedLiteralTruncation, CompatibleDeclarationsFromDistinctModules) diff --git flang/lib/Evaluate/intrinsics.cpp flang/lib/Evaluate/intrinsics.cpp index 5f3a39e9fe6c..5c9527cec2f6 100644 --- flang/lib/Evaluate/intrinsics.cpp +++ flang/lib/Evaluate/intrinsics.cpp @@ -2036,11 +2036,16 @@ std::optional<SpecificCall> IntrinsicInterface::Match( if (!sameArg) { sameArg = arg; } - // Check both ways so that a CLASS(*) actuals to - // MOVE_ALLOC and EOSHIFT both work. 
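+          // Examples (illustrative) of what the per-intrinsic rules below
+          // accept and reject:
+          //   class(*), allocatable :: to
+          //   integer,  allocatable :: from
+          //   call move_alloc(from, to)  ! ok: TO may be more general
+          //   call move_alloc(to, from)  ! no: TO must not be less general
+          // MERGE requires compatibility in both directions, so TSOURCE and
+          // FSOURCE of distinct types remain rejected.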
auto sameType{sameArg->GetType().value()}; - argOk = sameType.IsTkLenCompatibleWith(*type) || - type->IsTkLenCompatibleWith(sameType); + if (name == "move_alloc"s) { + // second argument can be more general + argOk = type->IsTkLenCompatibleWith(sameType); + } else if (name == "merge"s) { + argOk = type->IsTkLenCompatibleWith(sameType) && + sameType.IsTkLenCompatibleWith(*type); + } else { + argOk = sameType.IsTkLenCompatibleWith(*type); + } } break; case KindCode::sameKind: if (!sameArg) { @@ -2364,7 +2369,7 @@ std::optional<SpecificCall> IntrinsicInterface::Match( if (kindArg) { if (auto *expr{kindArg->UnwrapExpr()}) { CHECK(expr->Rank() == 0); - if (auto code{ToInt64(*expr)}) { + if (auto code{ToInt64(Fold(context, common::Clone(*expr)))}) { if (context.targetCharacteristics().IsTypeEnabled( *category, *code)) { if (*category == TypeCategory::Character) { // ACHAR & CHAR @@ -2376,9 +2381,8 @@ std::optional<SpecificCall> IntrinsicInterface::Match( } } } - messages.Say("'kind=' argument must be a constant scalar integer " - "whose value is a supported kind for the " - "intrinsic result type"_err_en_US); + messages.Say( + "'kind=' argument must be a constant scalar integer whose value is a supported kind for the intrinsic result type"_err_en_US); // use default kind below for error recovery } else if (kindDummyArg->flags.test(ArgFlag::defaultsToSameKind)) { CHECK(sameArg); diff --git flang/lib/Lower/CMakeLists.txt flang/lib/Lower/CMakeLists.txt index 0bc708a16363..c9b249781552 100644 --- flang/lib/Lower/CMakeLists.txt +++ flang/lib/Lower/CMakeLists.txt @@ -34,6 +34,7 @@ add_flang_library(FortranLower OpenMP/Utils.cpp PFTBuilder.cpp Runtime.cpp + Support/Utils.cpp SymbolMap.cpp VectorSubscripts.cpp diff --git flang/lib/Lower/ConvertCall.cpp flang/lib/Lower/ConvertCall.cpp index 40cd106e6301..7ca2baf0193c 100644 --- flang/lib/Lower/ConvertCall.cpp +++ flang/lib/Lower/ConvertCall.cpp @@ -594,7 +594,8 @@ Fortran::lower::genCallOpAndResult( builder.create<cuf::KernelLaunchOp>( loc, funcType.getResults(), funcSymbolAttr, grid_x, grid_y, grid_z, - block_x, block_y, block_z, bytes, stream, operands); + block_x, block_y, block_z, bytes, stream, operands, + /*arg_attrs=*/nullptr, /*res_attrs=*/nullptr); callNumResults = 0; } else if (caller.requireDispatchCall()) { // Procedure call requiring a dynamic dispatch. Call is created with @@ -621,7 +622,8 @@ Fortran::lower::genCallOpAndResult( dispatch = builder.create<fir::DispatchOp>( loc, funcType.getResults(), builder.getStringAttr(procName), caller.getInputs()[*passArg], operands, - builder.getI32IntegerAttr(*passArg), procAttrs); + builder.getI32IntegerAttr(*passArg), /*arg_attrs=*/nullptr, + /*res_attrs=*/nullptr, procAttrs); } else { // NOPASS const Fortran::evaluate::Component *component = @@ -636,7 +638,8 @@ Fortran::lower::genCallOpAndResult( passObject = builder.create<fir::LoadOp>(loc, passObject); dispatch = builder.create<fir::DispatchOp>( loc, funcType.getResults(), builder.getStringAttr(procName), - passObject, operands, nullptr, procAttrs); + passObject, operands, nullptr, /*arg_attrs=*/nullptr, + /*res_attrs=*/nullptr, procAttrs); } callNumResults = dispatch.getNumResults(); if (callNumResults != 0) @@ -644,7 +647,8 @@ Fortran::lower::genCallOpAndResult( } else { // Standard procedure call with fir.call. 
      auto call = builder.create<fir::CallOp>(
-          loc, funcType.getResults(), funcSymbolAttr, operands, procAttrs);
+          loc, funcType.getResults(), funcSymbolAttr, operands,
+          /*arg_attrs=*/nullptr, /*res_attrs=*/nullptr, procAttrs);
       callNumResults = call.getNumResults();
       if (callNumResults != 0)
diff --git flang/lib/Lower/ConvertVariable.cpp flang/lib/Lower/ConvertVariable.cpp
index 87236dc293eb..81d14fbb1d77 100644
--- flang/lib/Lower/ConvertVariable.cpp
+++ flang/lib/Lower/ConvertVariable.cpp
@@ -956,7 +956,15 @@ static void instantiateLocal(Fortran::lower::AbstractConverter &converter,
                              Fortran::lower::SymMap &symMap) {
   assert(!var.isAlias());
   Fortran::lower::StatementContext stmtCtx;
+  // isUnusedEntryDummy must be computed before mapSymbolAttributes.
+  const bool isUnusedEntryDummy =
+      var.hasSymbol() && Fortran::semantics::IsDummy(var.getSymbol()) &&
+      !symMap.lookupSymbol(var.getSymbol()).getAddr();
   mapSymbolAttributes(converter, var, symMap, stmtCtx);
+  // Do not generate code to initialize/finalize/destroy dummy arguments that
+  // are not part of the current ENTRY. They do not have backing storage.
+  if (isUnusedEntryDummy)
+    return;
   deallocateIntentOut(converter, var, symMap);
   if (needDummyIntentoutFinalization(var))
     finalizeAtRuntime(converter, var, symMap);
@@ -999,7 +1007,6 @@ static void instantiateLocal(Fortran::lower::AbstractConverter &converter,
           "trying to deallocate entity not lowered as allocatable");
       Fortran::lower::genDeallocateIfAllocated(*converterPtr, *mutableBox, loc,
                                                sym);
-
   });
 }
 }
diff --git flang/lib/Lower/IterationSpace.cpp flang/lib/Lower/IterationSpace.cpp
index 63011483022b..b011b3ab9a24 100644
--- flang/lib/Lower/IterationSpace.cpp
+++ flang/lib/Lower/IterationSpace.cpp
@@ -19,36 +19,6 @@
 #define DEBUG_TYPE "flang-lower-iteration-space"
 
-unsigned Fortran::lower::getHashValue(
-    const Fortran::lower::ExplicitIterSpace::ArrayBases &x) {
-  return Fortran::common::visit(
-      [&](const auto *p) { return HashEvaluateExpr::getHashValue(*p); }, x);
-}
-
-bool Fortran::lower::isEqual(
-    const Fortran::lower::ExplicitIterSpace::ArrayBases &x,
-    const Fortran::lower::ExplicitIterSpace::ArrayBases &y) {
-  return Fortran::common::visit(
-      Fortran::common::visitors{
-          // Fortran::semantics::Symbol * are the exception here. These pointers
-          // have identity; if two Symbol * values are the same (different) then
-          // they are the same (different) logical symbol.
-          [&](Fortran::lower::FrontEndSymbol p,
-              Fortran::lower::FrontEndSymbol q) { return p == q; },
-          [&](const auto *p, const auto *q) {
-            if constexpr (std::is_same_v<decltype(p), decltype(q)>) {
-              LLVM_DEBUG(llvm::dbgs()
-                         << "is equal: " << p << ' ' << q << ' '
-                         << IsEqualEvaluateExpr::isEqual(*p, *q) << '\n');
-              return IsEqualEvaluateExpr::isEqual(*p, *q);
-            } else {
-              // Different subtree types are never equal.
-              return false;
-            }
-          }},
-      x, y);
-}
-
 namespace {
 
 /// This class can recover the base array in an expression that contains
diff --git flang/lib/Lower/OpenMP/DataSharingProcessor.cpp flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
index 5b89816850be..36a8efd43f8c 100644
--- flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
+++ flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
@@ -12,13 +12,17 @@
 
 #include "DataSharingProcessor.h"
 
+#include "PrivateReductionUtils.h"
 #include "Utils.h"
 #include "flang/Lower/ConvertVariable.h"
 #include "flang/Lower/PFTBuilder.h"
 #include "flang/Lower/SymbolMap.h"
+#include "flang/Optimizer/Builder/BoxValue.h"
 #include "flang/Optimizer/Builder/HLFIRTools.h"
 #include "flang/Optimizer/Builder/Todo.h"
+#include "flang/Optimizer/HLFIR/HLFIRDialect.h"
 #include "flang/Optimizer/HLFIR/HLFIROps.h"
+#include "flang/Semantics/attr.h"
 #include "flang/Semantics/tools.h"
 
 namespace Fortran {
@@ -85,35 +89,65 @@ void DataSharingProcessor::insertDeallocs() {
       converter.createHostAssociateVarCloneDealloc(*sym);
       continue;
     }
-
-    lower::SymbolBox hsb = converter.lookupOneLevelUpSymbol(*sym);
-    assert(hsb && "Host symbol box not found");
-    mlir::Type symType = hsb.getAddr().getType();
-    mlir::Location symLoc = hsb.getAddr().getLoc();
-    fir::ExtendedValue symExV = converter.getSymbolExtendedValue(*sym);
-    mlir::omp::PrivateClauseOp privatizer = symToPrivatizer.at(sym);
-
-    lower::SymMapScope scope(symTable);
-    mlir::OpBuilder::InsertionGuard guard(firOpBuilder);
-
-    mlir::Region &deallocRegion = privatizer.getDeallocRegion();
-    fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
-    mlir::Block *deallocEntryBlock = firOpBuilder.createBlock(
-        &deallocRegion, /*insertPt=*/{}, symType, symLoc);
-
-    firOpBuilder.setInsertionPointToEnd(deallocEntryBlock);
-    symTable.addSymbol(*sym,
-                       fir::substBase(symExV, deallocRegion.getArgument(0)));
-
-    converter.createHostAssociateVarCloneDealloc(*sym);
-    firOpBuilder.create<mlir::omp::YieldOp>(hsb.getAddr().getLoc());
+    // For delayed privatization, deallocs are created by
+    // populateByRefInitAndCleanupRegions.
   }
 }
 
 void DataSharingProcessor::cloneSymbol(const semantics::Symbol *sym) {
   bool isFirstPrivate = sym->test(semantics::Symbol::Flag::OmpFirstPrivate);
-  bool success = converter.createHostAssociateVarClone(
-      *sym, /*skipDefaultInit=*/isFirstPrivate);
+
+  // If we are doing eager privatization on a symbol created using delayed
+  // privatization, there could be incompatible types here, e.g.
+  // fir.ref<fir.box<fir.array<>>>
+  bool success = [&]() -> bool {
+    const auto *details =
+        sym->detailsIf<Fortran::semantics::HostAssocDetails>();
+    assert(details && "No host-association found");
+    const Fortran::semantics::Symbol &hsym = details->symbol();
+    mlir::Value addr = converter.getSymbolAddress(hsym);
+
+    if (auto refTy = mlir::dyn_cast<fir::ReferenceType>(addr.getType())) {
+      if (auto boxTy = mlir::dyn_cast<fir::BoxType>(refTy.getElementType())) {
+        if (auto arrayTy =
+                mlir::dyn_cast<fir::SequenceType>(boxTy.getElementType())) {
+          // FirConverter/fir::ExtendedValue considers all references to boxes
+          // as mutable boxes. Outside of OpenMP it doesn't make sense to have a
+          // mutable box of an array. Work around this here by loading the
+          // reference so it is a normal boxed array.
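+          // (Illustrative: the host binding has a type like
+          //  !fir.ref<!fir.box<!fir.array<?xi32>>>, which fir::ExtendedValue
+          //  classifies as a MutableBoxValue; the code below rebinds the
+          //  private symbol as a plain ArrayBoxValue over fresh local
+          //  storage instead.)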
+ fir::FirOpBuilder &builder = converter.getFirOpBuilder(); + mlir::Location loc = converter.genLocation(hsym.name()); + fir::ExtendedValue hexv = converter.getSymbolExtendedValue(hsym); + + llvm::SmallVector<mlir::Value> extents = + fir::factory::getExtents(loc, builder, hexv); + + // TODO: uniqName, name + mlir::Value allocVal = + builder.allocateLocal(loc, arrayTy, /*uniqName=*/"", + /*name=*/"", extents, /*typeParams=*/{}, + sym->GetUltimate().attrs().test( + Fortran::semantics::Attr::TARGET)); + mlir::Value shape = builder.genShape(loc, extents); + mlir::Value box = builder.createBox(loc, boxTy, allocVal, shape, + nullptr, {}, nullptr); + + // This can't be a CharArrayBoxValue because otherwise + // boxTy.getElementType() would be a character type. + // Assume the array element type isn't polymorphic because we are + // privatizing. + fir::ExtendedValue newExv = fir::ArrayBoxValue{box, extents}; + + converter.bindSymbol(*sym, newExv); + return true; + } + } + } + + // Normal case: + return converter.createHostAssociateVarClone( + *sym, /*skipDefaultInit=*/isFirstPrivate); + }(); (void)success; assert(success && "Privatization failed due to existing binding"); @@ -132,7 +166,7 @@ void DataSharingProcessor::cloneSymbol(const semantics::Symbol *sym) { if (needInitClone()) { Fortran::lower::initializeCloneAtRuntime(converter, *sym, symTable); - callsInitClone = true; + mightHaveReadHostSym = true; } } @@ -184,7 +218,8 @@ bool DataSharingProcessor::needBarrier() { // Emit implicit barrier for linear clause. Maybe on somewhere else. for (const semantics::Symbol *sym : allPrivatizedSymbols) { if (sym->test(semantics::Symbol::Flag::OmpLastPrivate) && - (sym->test(semantics::Symbol::Flag::OmpFirstPrivate) || callsInitClone)) + (sym->test(semantics::Symbol::Flag::OmpFirstPrivate) || + mightHaveReadHostSym)) return true; } return false; @@ -468,15 +503,47 @@ void DataSharingProcessor::doPrivatize(const semantics::Symbol *sym, lower::SymbolBox hsb = converter.lookupOneLevelUpSymbol(*sym); assert(hsb && "Host symbol box not found"); - mlir::Type symType = hsb.getAddr().getType(); mlir::Location symLoc = hsb.getAddr().getLoc(); std::string privatizerName = sym->name().ToString() + ".privatizer"; bool isFirstPrivate = sym->test(semantics::Symbol::Flag::OmpFirstPrivate); + mlir::Value privVal = hsb.getAddr(); + mlir::Type allocType = privVal.getType(); + if (!mlir::isa<fir::PointerType>(privVal.getType())) + allocType = fir::unwrapRefType(privVal.getType()); + + if (auto poly = mlir::dyn_cast<fir::ClassType>(allocType)) { + if (!mlir::isa<fir::PointerType>(poly.getEleTy()) && isFirstPrivate) + TODO(symLoc, "create polymorphic host associated copy"); + } + + // fir.array<> cannot be converted to any single llvm type and fir helpers + // are not available in openmp to llvmir translation so we cannot generate + // an alloca for a fir.array type there. Get around this by boxing all + // arrays. 
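+  // (Type-level effect, illustrative: a privatized value of type
+  //  !fir.ref<!fir.array<10xf32>> is rebound through a descriptor as
+  //  !fir.ref<!fir.box<!fir.array<10xf32>>>; a box always lowers to a fixed
+  //  LLVM struct, so the translation can always allocate a private copy.)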
+ if (mlir::isa<fir::SequenceType>(allocType)) { + hlfir::Entity entity{hsb.getAddr()}; + entity = genVariableBox(symLoc, firOpBuilder, entity); + privVal = entity.getBase(); + allocType = privVal.getType(); + } + + if (mlir::isa<fir::BaseBoxType>(privVal.getType())) { + // Boxes should be passed by reference into nested regions: + auto oldIP = firOpBuilder.saveInsertionPoint(); + firOpBuilder.setInsertionPointToStart(firOpBuilder.getAllocaBlock()); + auto alloca = firOpBuilder.create<fir::AllocaOp>(symLoc, privVal.getType()); + firOpBuilder.restoreInsertionPoint(oldIP); + firOpBuilder.create<fir::StoreOp>(symLoc, privVal, alloca); + privVal = alloca; + } + + mlir::Type argType = privVal.getType(); + mlir::omp::PrivateClauseOp privatizerOp = [&]() { auto moduleOp = firOpBuilder.getModule(); auto uniquePrivatizerName = fir::getTypeAsString( - symType, converter.getKindMap(), + allocType, converter.getKindMap(), converter.mangleName(*sym) + (isFirstPrivate ? "_firstprivate" : "_private")); @@ -488,44 +555,40 @@ void DataSharingProcessor::doPrivatize(const semantics::Symbol *sym, mlir::OpBuilder::InsertionGuard guard(firOpBuilder); firOpBuilder.setInsertionPointToStart(moduleOp.getBody()); auto result = firOpBuilder.create<mlir::omp::PrivateClauseOp>( - symLoc, uniquePrivatizerName, symType, + symLoc, uniquePrivatizerName, allocType, isFirstPrivate ? mlir::omp::DataSharingClauseType::FirstPrivate : mlir::omp::DataSharingClauseType::Private); fir::ExtendedValue symExV = converter.getSymbolExtendedValue(*sym); lower::SymMapScope outerScope(symTable); - // Populate the `alloc` region. - { - mlir::Region &allocRegion = result.getAllocRegion(); - mlir::Block *allocEntryBlock = firOpBuilder.createBlock( - &allocRegion, /*insertPt=*/{}, symType, symLoc); - - firOpBuilder.setInsertionPointToEnd(allocEntryBlock); - - fir::ExtendedValue localExV = - hlfir::translateToExtendedValue( - symLoc, firOpBuilder, hlfir::Entity{allocRegion.getArgument(0)}, - /*contiguousHint=*/ - evaluate::IsSimplyContiguous(*sym, converter.getFoldingContext())) - .first; - - symTable.addSymbol(*sym, localExV); - lower::SymMapScope innerScope(symTable); - cloneSymbol(sym); - mlir::Value cloneAddr = symTable.shallowLookupSymbol(*sym).getAddr(); - mlir::Type cloneType = cloneAddr.getType(); - - // A `convert` op is required for variables that are storage associated - // via `equivalence`. The problem is that these variables are declared as - // `fir.ptr`s while their privatized storage is declared as `fir.ref`, - // therefore we convert to proper symbol type. - mlir::Value yieldedValue = - (symType == cloneType) ? cloneAddr - : firOpBuilder.createConvert( - cloneAddr.getLoc(), symType, cloneAddr); - - firOpBuilder.create<mlir::omp::YieldOp>(hsb.getAddr().getLoc(), - yieldedValue); + // Populate the `init` region. + // We need to initialize in the following cases: + // 1. The allocation was for a derived type which requires initialization + // (this can be skipped if it will be initialized anyway by the copy + // region, unless the derived type has allocatable components) + // 2. The allocation was for any kind of box + // 3. 
The allocation was for a boxed character + const bool needsInitialization = + (Fortran::lower::hasDefaultInitialization(sym->GetUltimate()) && + (!isFirstPrivate || hlfir::mayHaveAllocatableComponent(allocType))) || + mlir::isa<fir::BaseBoxType>(allocType) || + mlir::isa<fir::BoxCharType>(allocType); + if (needsInitialization) { + mlir::Region &initRegion = result.getInitRegion(); + mlir::Block *initBlock = firOpBuilder.createBlock( + &initRegion, /*insertPt=*/{}, {argType, argType}, {symLoc, symLoc}); + + populateByRefInitAndCleanupRegions( + converter, symLoc, argType, /*scalarInitValue=*/nullptr, initBlock, + result.getInitPrivateArg(), result.getInitMoldArg(), + result.getDeallocRegion(), + isFirstPrivate ? DeclOperationKind::FirstPrivate + : DeclOperationKind::Private, + sym); + // TODO: currently there are false positives from dead uses of the mold + // arg + if (!result.getInitMoldArg().getUses().empty()) + mightHaveReadHostSym = true; } // Populate the `copy` region if this is a `firstprivate`. @@ -534,7 +597,7 @@ void DataSharingProcessor::doPrivatize(const semantics::Symbol *sym, // First block argument corresponding to the original/host value while // second block argument corresponding to the privatized value. mlir::Block *copyEntryBlock = firOpBuilder.createBlock( - ©Region, /*insertPt=*/{}, {symType, symType}, {symLoc, symLoc}); + ©Region, /*insertPt=*/{}, {argType, argType}, {symLoc, symLoc}); firOpBuilder.setInsertionPointToEnd(copyEntryBlock); auto addSymbol = [&](unsigned argIdx, bool force = false) { @@ -565,7 +628,7 @@ void DataSharingProcessor::doPrivatize(const semantics::Symbol *sym, if (clauseOps) { clauseOps->privateSyms.push_back(mlir::SymbolRefAttr::get(privatizerOp)); - clauseOps->privateVars.push_back(hsb.getAddr()); + clauseOps->privateVars.push_back(privVal); } symToPrivatizer[sym] = privatizerOp; diff --git flang/lib/Lower/OpenMP/DataSharingProcessor.h flang/lib/Lower/OpenMP/DataSharingProcessor.h index 8c7a222ec939..8e15c6d26038 100644 --- flang/lib/Lower/OpenMP/DataSharingProcessor.h +++ flang/lib/Lower/OpenMP/DataSharingProcessor.h @@ -86,7 +86,7 @@ private: lower::pft::Evaluation &eval; bool shouldCollectPreDeterminedSymbols; bool useDelayedPrivatization; - bool callsInitClone = false; + bool mightHaveReadHostSym = false; lower::SymMap &symTable; OMPConstructSymbolVisitor visitor; diff --git flang/lib/Lower/OpenMP/OpenMP.cpp flang/lib/Lower/OpenMP/OpenMP.cpp index b0cc2f591f3d..f088e4b5e79e 100644 --- flang/lib/Lower/OpenMP/OpenMP.cpp +++ flang/lib/Lower/OpenMP/OpenMP.cpp @@ -3142,6 +3142,13 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, // support the case of threadprivate variable declared in module. 
} +static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, + semantics::SemanticsContext &semaCtx, + lower::pft::Evaluation &eval, + const parser::OmpMetadirectiveDirective &meta) { + TODO(converter.getCurrentLocation(), "METADIRECTIVE"); +} + static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, @@ -3234,11 +3241,6 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, TODO(converter.getCurrentLocation(), "OpenMPDepobjConstruct"); } -static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, - semantics::SemanticsContext &semaCtx, - lower::pft::Evaluation &eval, - const parser::OmpMetadirectiveDirective &construct) {} - static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, diff --git flang/lib/Lower/OpenMP/PrivateReductionUtils.cpp flang/lib/Lower/OpenMP/PrivateReductionUtils.cpp index 83f0d4e93ca5..61706d082410 100644 --- flang/lib/Lower/OpenMP/PrivateReductionUtils.cpp +++ flang/lib/Lower/OpenMP/PrivateReductionUtils.cpp @@ -12,17 +12,38 @@ #include "PrivateReductionUtils.h" +#include "flang/Lower/AbstractConverter.h" +#include "flang/Lower/Allocatable.h" +#include "flang/Lower/ConvertVariable.h" +#include "flang/Optimizer/Builder/BoxValue.h" +#include "flang/Optimizer/Builder/Character.h" #include "flang/Optimizer/Builder/FIRBuilder.h" #include "flang/Optimizer/Builder/HLFIRTools.h" +#include "flang/Optimizer/Builder/Runtime/Derived.h" #include "flang/Optimizer/Builder/Todo.h" +#include "flang/Optimizer/Dialect/FIROps.h" +#include "flang/Optimizer/Dialect/FIRType.h" +#include "flang/Optimizer/HLFIR/HLFIRDialect.h" #include "flang/Optimizer/HLFIR/HLFIROps.h" #include "flang/Optimizer/Support/FatalError.h" +#include "flang/Semantics/symbol.h" #include "mlir/Dialect/OpenMP/OpenMPDialect.h" #include "mlir/IR/Location.h" -static void createCleanupRegion(fir::FirOpBuilder &builder, mlir::Location loc, - mlir::Type argType, - mlir::Region &cleanupRegion) { +static bool hasFinalization(const Fortran::semantics::Symbol &sym) { + if (sym.has<Fortran::semantics::ObjectEntityDetails>()) + if (const Fortran::semantics::DeclTypeSpec *declTypeSpec = sym.GetType()) + if (const Fortran::semantics::DerivedTypeSpec *derivedTypeSpec = + declTypeSpec->AsDerived()) + return Fortran::semantics::IsFinalizable(*derivedTypeSpec); + return false; +} + +static void createCleanupRegion(Fortran::lower::AbstractConverter &converter, + mlir::Location loc, mlir::Type argType, + mlir::Region &cleanupRegion, + const Fortran::semantics::Symbol *sym) { + fir::FirOpBuilder &builder = converter.getFirOpBuilder(); assert(cleanupRegion.empty()); mlir::Block *block = builder.createBlock(&cleanupRegion, cleanupRegion.end(), {argType}, {loc}); @@ -37,12 +58,29 @@ static void createCleanupRegion(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Type valTy = fir::unwrapRefType(argType); if (auto boxTy = mlir::dyn_cast_or_null<fir::BaseBoxType>(valTy)) { - if (!mlir::isa<fir::HeapType, fir::PointerType>(boxTy.getEleTy())) { - mlir::Type innerTy = fir::extractSequenceType(boxTy); - if (!mlir::isa<fir::SequenceType>(innerTy)) - typeError(); + // TODO: what about undoing init of unboxed derived types? 
+  if (auto recTy = mlir::dyn_cast<fir::RecordType>(
+          fir::unwrapSequenceType(fir::dyn_cast_ptrOrBoxEleTy(boxTy)))) {
+    mlir::Type eleTy = boxTy.getEleTy();
+    if (mlir::isa<fir::PointerType, fir::HeapType>(eleTy)) {
+      mlir::Type mutableBoxTy =
+          fir::ReferenceType::get(fir::BoxType::get(eleTy));
+      mlir::Value converted =
+          builder.createConvert(loc, mutableBoxTy, block->getArgument(0));
+      if (recTy.getNumLenParams() > 0)
+        TODO(loc, "Deallocate box with length parameters");
+      fir::MutableBoxValue mutableBox{converted, /*lenParameters=*/{},
+                                      /*mutableProperties=*/{}};
+      Fortran::lower::genDeallocateIfAllocated(converter, mutableBox, loc);
+      builder.create<mlir::omp::YieldOp>(loc);
+      return;
+    }
  }
+  // TODO: just replace this whole body with
+  // Fortran::lower::genDeallocateIfAllocated (not done now to avoid test
+  // churn)
+
   mlir::Value arg = builder.loadIfRef(loc, block->getArgument(0));
   assert(mlir::isa<fir::BaseBoxType>(arg.getType()));
@@ -66,6 +104,21 @@ static void createCleanupRegion(fir::FirOpBuilder &builder, mlir::Location loc,
     return;
   }
 
+  if (auto boxCharTy = mlir::dyn_cast<fir::BoxCharType>(argType)) {
+    auto [addr, len] =
+        fir::factory::CharacterExprHelper{builder, loc}.createUnboxChar(
+            block->getArgument(0));
+
+    // convert addr to a heap type so it can be used with fir::FreeMemOp
+    auto refTy = mlir::cast<fir::ReferenceType>(addr.getType());
+    auto heapTy = fir::HeapType::get(refTy.getEleTy());
+    addr = builder.createConvert(loc, heapTy, addr);
+
+    builder.create<fir::FreeMemOp>(loc, addr);
+    builder.create<mlir::omp::YieldOp>(loc);
+    return;
+  }
+
   typeError();
 }
 
@@ -96,17 +149,113 @@ fir::ShapeShiftOp Fortran::lower::omp::getShapeShift(fir::FirOpBuilder &builder,
   return shapeShift;
 }
 
+// Initialize box newBox using moldBox. These should both have the same type
+// and be boxes containing derived types, e.g.
+//   fir.box<!fir.type<>>
+//   fir.box<!fir.heap<!fir.type<>>>
+//   fir.box<!fir.heap<!fir.array<!fir.type<>>>>
+//   fir.class<...<!fir.type<>>>
+// If the type doesn't match, this does nothing.
+static void initializeIfDerivedTypeBox(fir::FirOpBuilder &builder,
+                                       mlir::Location loc, mlir::Value newBox,
+                                       mlir::Value moldBox, bool hasInitializer,
+                                       bool isFirstPrivate) {
+  assert(moldBox.getType() == newBox.getType());
+  fir::BoxType boxTy = mlir::dyn_cast<fir::BoxType>(newBox.getType());
+  fir::ClassType classTy = mlir::dyn_cast<fir::ClassType>(newBox.getType());
+  if (!boxTy && !classTy)
+    return;
+
+  // remove pointer and array types in the middle
+  mlir::Type eleTy = boxTy ? boxTy.getElementType() : classTy.getEleTy();
+  mlir::Type derivedTy = fir::unwrapRefType(eleTy);
+  if (auto array = mlir::dyn_cast<fir::SequenceType>(derivedTy))
+    derivedTy = array.getElementType();
+
+  if (!fir::isa_derived(derivedTy))
+    return;
+
+  if (hasInitializer)
+    fir::runtime::genDerivedTypeInitialize(builder, loc, newBox);
+
+  if (hlfir::mayHaveAllocatableComponent(derivedTy) && !isFirstPrivate)
+    fir::runtime::genDerivedTypeInitializeClone(builder, loc, newBox, moldBox);
+}
+
+static void getLengthParameters(fir::FirOpBuilder &builder, mlir::Location loc,
+                                mlir::Value moldArg,
+                                llvm::SmallVectorImpl<mlir::Value> &lenParams) {
+  // We pass derived types unboxed, so they are not self-contained entities.
+  // Assume that unboxed derived types won't need length parameters.
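+  // Illustrative character cases for the LEN rule implemented below:
+  //   character(len=4) :: c  ->  !fir.char<1,4>: constant LEN lives in the
+  //       type, so lenParams must stay empty for the EmboxOp verifier.
+  //   character(len=n) :: d  ->  !fir.char<1,?>: dynamic LEN, so the value
+  //       recovered from the mold is required.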
+  if (!hlfir::isFortranEntity(moldArg))
+    return;
+
+  hlfir::genLengthParameters(loc, builder, hlfir::Entity{moldArg}, lenParams);
+  if (lenParams.empty())
+    return;
+
+  // The verifier for EmboxOp doesn't allow length parameters when the
+  // character already has static LEN. genLengthParameters may still return
+  // them in this case.
+  mlir::Type unwrappedType =
+      fir::unwrapRefType(fir::unwrapSeqOrBoxedSeqType(moldArg.getType()));
+  if (auto strTy = mlir::dyn_cast<fir::CharacterType>(unwrappedType)) {
+    if (strTy.hasConstantLen())
+      lenParams.resize(0);
+  }
+}
+
+static bool
+isDerivedTypeNeedingInitialization(const Fortran::semantics::Symbol &sym) {
+  // Fortran::lower::hasDefaultInitialization returns false for ALLOCATABLE, so
+  // re-implement here.
+  // ignorePointer=true because either the pointer points to the same target as
+  // the original variable, or it is uninitialized.
+  if (const Fortran::semantics::DeclTypeSpec *declTypeSpec = sym.GetType())
+    if (const Fortran::semantics::DerivedTypeSpec *derivedTypeSpec =
+            declTypeSpec->AsDerived())
+      return derivedTypeSpec->HasDefaultInitialization(
+          /*ignoreAllocatable=*/false, /*ignorePointer=*/true);
+  return false;
+}
+
+static mlir::Value generateZeroShapeForRank(fir::FirOpBuilder &builder,
+                                            mlir::Location loc,
+                                            mlir::Value moldArg) {
+  mlir::Type moldType = fir::unwrapRefType(moldArg.getType());
+  mlir::Type eleType = fir::dyn_cast_ptrOrBoxEleTy(moldType);
+  fir::SequenceType seqTy =
+      mlir::dyn_cast_if_present<fir::SequenceType>(eleType);
+  if (!seqTy)
+    return mlir::Value{};
+
+  unsigned rank = seqTy.getShape().size();
+  mlir::Value zero =
+      builder.createIntegerConstant(loc, builder.getIndexType(), 0);
+  mlir::SmallVector<mlir::Value> dims;
+  dims.resize(rank, zero);
+  mlir::Type shapeTy = fir::ShapeType::get(builder.getContext(), rank);
+  return builder.create<fir::ShapeOp>(loc, shapeTy, dims);
+}
+
 void Fortran::lower::omp::populateByRefInitAndCleanupRegions(
-    fir::FirOpBuilder &builder, mlir::Location loc, mlir::Type argType,
-    mlir::Value scalarInitValue, mlir::Block *initBlock,
+    Fortran::lower::AbstractConverter &converter, mlir::Location loc,
+    mlir::Type argType, mlir::Value scalarInitValue, mlir::Block *initBlock,
     mlir::Value allocatedPrivVarArg, mlir::Value moldArg,
-    mlir::Region &cleanupRegion) {
+    mlir::Region &cleanupRegion, DeclOperationKind kind,
+    const Fortran::semantics::Symbol *sym) {
+  fir::FirOpBuilder &builder = converter.getFirOpBuilder();
   mlir::Type ty = fir::unwrapRefType(argType);
   builder.setInsertionPointToEnd(initBlock);
   auto yield = [&](mlir::Value ret) {
     builder.create<mlir::omp::YieldOp>(loc, ret);
   };
 
+  if (isPrivatization(kind))
+    assert(sym && "Symbol information is needed to privatize derived types");
+  bool needsInitialization =
+      sym ? isDerivedTypeNeedingInitialization(sym->GetUltimate()) : false;
+
   if (fir::isa_trivial(ty)) {
     builder.setInsertionPointToEnd(initBlock);
@@ -128,15 +277,22 @@ void Fortran::lower::omp::populateByRefInitAndCleanupRegions(
   //   fir.store %something to %box_alloca
   // }
   // omp.yield %box_alloca
-  moldArg = builder.loadIfRef(loc, moldArg);
-  auto handleNullAllocatable = [&](mlir::Value boxAlloca) -> fir::IfOp {
-    mlir::Value addr = builder.create<fir::BoxAddrOp>(loc, moldArg);
+  mlir::SmallVector<mlir::Value> lenParams;
+  auto handleNullAllocatable = [&](mlir::Value boxAlloca,
+                                   mlir::Value loadedMold) -> fir::IfOp {
+    mlir::Value addr = builder.create<fir::BoxAddrOp>(loc, loadedMold);
     mlir::Value isNotAllocated = builder.genIsNullAddr(loc, addr);
     fir::IfOp ifOp = builder.create<fir::IfOp>(loc, isNotAllocated,
                                                /*withElseRegion=*/true);
    builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
-    // just embox the null address and return
-    mlir::Value nullBox = builder.create<fir::EmboxOp>(loc, ty, addr);
+    // Just embox the null address and return.
+    // We have to give the embox a shape so that the LLVM box structure has
+    // the right rank. This returns an empty value if the types don't match.
+    mlir::Value shape = generateZeroShapeForRank(builder, loc, moldArg);
+
+    mlir::Value nullBox =
+        builder.create<fir::EmboxOp>(loc, ty, addr, shape,
+                                     /*slice=*/mlir::Value{}, lenParams);
     builder.create<fir::StoreOp>(loc, nullBox, boxAlloca);
     return ifOp;
   };
@@ -148,34 +304,79 @@ void Fortran::lower::omp::populateByRefInitAndCleanupRegions(
     builder.setInsertionPointToEnd(initBlock);
 
     mlir::Value boxAlloca = allocatedPrivVarArg;
+
+    moldArg = builder.loadIfRef(loc, moldArg);
+    getLengthParameters(builder, loc, moldArg, lenParams);
+
+    // The initial state of a private pointer is undefined, so we don't need
+    // to match the mold argument (OpenMP 5.2, end of page 106).
+    if (isPrivatization(kind) &&
+        mlir::isa<fir::PointerType>(boxTy.getEleTy())) {
+      // We need a shape with the right rank so that the embox op is lowered
+      // to an llvm struct of the right type. This returns an empty value if
+      // the types aren't right.
+      mlir::Value shape = generateZeroShapeForRank(builder, loc, moldArg);
+      // Just in case, do initialize the box with a null value.
+      mlir::Value null = builder.createNullConstant(loc, boxTy.getEleTy());
+      mlir::Value nullBox;
+      nullBox = builder.create<fir::EmboxOp>(
+          loc, boxTy, null, shape, /*slice=*/mlir::Value{}, lenParams);
+      builder.create<fir::StoreOp>(loc, nullBox, boxAlloca);
+      yield(boxAlloca);
+      return;
+    }
+
     mlir::Type innerTy = fir::unwrapRefType(boxTy.getEleTy());
-    if (fir::isa_trivial(innerTy)) {
+    bool isDerived = fir::isa_derived(innerTy);
+    bool isChar = fir::isa_char(innerTy);
+    if (fir::isa_trivial(innerTy) || isDerived || isChar) {
      // boxed non-sequence value e.g.
!fir.box<!fir.heap<i32>> - if (!isAllocatableOrPointer) - TODO(loc, - "Reduction/Privatization of non-allocatable trivial typed box"); + if (!isAllocatableOrPointer && !isDerived) + TODO(loc, "Reduction/Privatization of non-allocatable trivial or " + "character typed box"); - fir::IfOp ifUnallocated = handleNullAllocatable(boxAlloca); + if ((isDerived || isChar) && (isReduction(kind) || scalarInitValue)) + TODO(loc, "Reduction of an unsupported boxed type"); - builder.setInsertionPointToStart(&ifUnallocated.getElseRegion().front()); - mlir::Value valAlloc = builder.create<fir::AllocMemOp>(loc, innerTy); + fir::IfOp ifUnallocated{nullptr}; + if (isAllocatableOrPointer) { + ifUnallocated = handleNullAllocatable(boxAlloca, moldArg); + builder.setInsertionPointToStart( + &ifUnallocated.getElseRegion().front()); + } + + mlir::Value valAlloc = builder.createHeapTemporary( + loc, innerTy, /*name=*/{}, /*shape=*/{}, lenParams); if (scalarInitValue) builder.createStoreWithConvert(loc, scalarInitValue, valAlloc); - mlir::Value box = builder.create<fir::EmboxOp>(loc, ty, valAlloc); - builder.create<fir::StoreOp>(loc, box, boxAlloca); + mlir::Value box = builder.create<fir::EmboxOp>( + loc, ty, valAlloc, /*shape=*/mlir::Value{}, /*slice=*/mlir::Value{}, + lenParams); + initializeIfDerivedTypeBox( + builder, loc, box, moldArg, needsInitialization, + /*isFirstPrivate=*/kind == DeclOperationKind::FirstPrivate); + fir::StoreOp lastOp = builder.create<fir::StoreOp>(loc, box, boxAlloca); - createCleanupRegion(builder, loc, argType, cleanupRegion); - builder.setInsertionPointAfter(ifUnallocated); + createCleanupRegion(converter, loc, argType, cleanupRegion, sym); + + if (ifUnallocated) + builder.setInsertionPointAfter(ifUnallocated); + else + builder.setInsertionPointAfter(lastOp); yield(boxAlloca); return; } + innerTy = fir::extractSequenceType(boxTy); - if (!mlir::isa<fir::SequenceType>(innerTy)) + if (!innerTy || !mlir::isa<fir::SequenceType>(innerTy)) TODO(loc, "Unsupported boxed type for reduction/privatization"); + moldArg = builder.loadIfRef(loc, moldArg); + getLengthParameters(builder, loc, moldArg, lenParams); + fir::IfOp ifUnallocated{nullptr}; if (isAllocatableOrPointer) { - ifUnallocated = handleNullAllocatable(boxAlloca); + ifUnallocated = handleNullAllocatable(boxAlloca, moldArg); builder.setInsertionPointToStart(&ifUnallocated.getElseRegion().front()); } @@ -183,6 +384,28 @@ void Fortran::lower::omp::populateByRefInitAndCleanupRegions( mlir::Value loadedBox = builder.loadIfRef(loc, moldArg); hlfir::Entity source = hlfir::Entity{loadedBox}; + // Special case for (possibly allocatable) arrays of polymorphic types + // e.g. 
!fir.class<!fir.heap<!fir.array<?x!fir.type<>>>>
+    if (source.isPolymorphic()) {
+      fir::ShapeShiftOp shape = getShapeShift(builder, loc, source);
+      mlir::Type arrayType = source.getElementOrSequenceType();
+      mlir::Value allocatedArray = builder.create<fir::AllocMemOp>(
+          loc, arrayType, /*typeparams=*/mlir::ValueRange{},
+          shape.getExtents());
+      mlir::Value firClass = builder.create<fir::EmboxOp>(
+          loc, source.getType(), allocatedArray, shape);
+      initializeIfDerivedTypeBox(
+          builder, loc, firClass, source, needsInitialization,
+          /*isFirstPrivate=*/kind == DeclOperationKind::FirstPrivate);
+      builder.create<fir::StoreOp>(loc, firClass, allocatedPrivVarArg);
+      if (ifUnallocated)
+        builder.setInsertionPointAfter(ifUnallocated);
+      yield(allocatedPrivVarArg);
+      mlir::OpBuilder::InsertionGuard guard(builder);
+      createCleanupRegion(converter, loc, argType, cleanupRegion, sym);
+      return;
+    }
+
    // Allocating on the heap in case the whole reduction is nested inside of a
    // loop
    // TODO: compare performance here to using allocas - this could be made to
@@ -199,7 +422,7 @@ void Fortran::lower::omp::populateByRefInitAndCleanupRegions(
            "createTempFromMold decides this statically");
    if (cstNeedsDealloc.has_value() && *cstNeedsDealloc != false) {
      mlir::OpBuilder::InsertionGuard guard(builder);
-      createCleanupRegion(builder, loc, argType, cleanupRegion);
+      createCleanupRegion(converter, loc, argType, cleanupRegion, sym);
    } else {
      assert(!isAllocatableOrPointer &&
             "Pointer-like arrays must be heap allocated");
@@ -223,6 +446,11 @@ void Fortran::lower::omp::populateByRefInitAndCleanupRegions(
 
    if (scalarInitValue)
      builder.create<hlfir::AssignOp>(loc, scalarInitValue, box);
+
+    initializeIfDerivedTypeBox(builder, loc, box, moldArg, needsInitialization,
+                               /*isFirstPrivate=*/kind ==
+                                   DeclOperationKind::FirstPrivate);
+
    builder.create<fir::StoreOp>(loc, box, boxAlloca);
    if (ifUnallocated)
      builder.setInsertionPointAfter(ifUnallocated);
@@ -230,6 +458,49 @@ void Fortran::lower::omp::populateByRefInitAndCleanupRegions(
    return;
  }
 
+  if (auto boxCharTy = mlir::dyn_cast<fir::BoxCharType>(argType)) {
+    mlir::Type eleTy = boxCharTy.getEleTy();
+    builder.setInsertionPointToStart(initBlock);
+    fir::factory::CharacterExprHelper charExprHelper{builder, loc};
+    auto [addr, len] = charExprHelper.createUnboxChar(moldArg);
+
+    // Using a heap temporary so that
+    // 1) It is safe to use privatization inside of big loops.
+    // 2) The lifetime can outlive the current stack frame for delayed task
+    //    execution.
+    // We can't always allocate a boxchar implicitly as the type of the
+    // omp.private because the allocation potentially needs the length
+    // parameters fetched above.
+    // TODO: this deviates from the intended design for delayed task execution.
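+    // Schematically (SSA names illustrative):
+    //   host:    fir.emboxchar %host_addr, %len
+    //   private: fir.emboxchar %heap_addr, %len   (%heap_addr: fir.allocmem)
+    // Only the address changes; the length is reused from the host, so the
+    // private copy has the same extent.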
+ mlir::Value privateAddr = builder.createHeapTemporary( + loc, eleTy, /*name=*/{}, /*shape=*/{}, /*lenParams=*/len); + mlir::Value boxChar = charExprHelper.createEmboxChar(privateAddr, len); + + createCleanupRegion(converter, loc, argType, cleanupRegion, sym); + + builder.setInsertionPointToEnd(initBlock); + yield(boxChar); + return; + } + + if (fir::isa_derived(ty)) { + builder.setInsertionPointToStart(initBlock); + mlir::Type boxedTy = fir::BoxType::get(ty); + mlir::Value newBox = + builder.create<fir::EmboxOp>(loc, boxedTy, allocatedPrivVarArg); + mlir::Value moldBox = builder.create<fir::EmboxOp>(loc, boxedTy, moldArg); + initializeIfDerivedTypeBox( + builder, loc, newBox, moldBox, needsInitialization, + /*isFirstPrivate=*/kind == DeclOperationKind::FirstPrivate); + + if (sym && hasFinalization(*sym)) + createCleanupRegion(converter, loc, argType, cleanupRegion, sym); + + builder.setInsertionPointToEnd(initBlock); + yield(allocatedPrivVarArg); + return; + } + TODO(loc, "creating reduction/privatization init region for unsupported type"); return; diff --git flang/lib/Lower/OpenMP/PrivateReductionUtils.h flang/lib/Lower/OpenMP/PrivateReductionUtils.h index b4abc40cd4b6..fcd36392a29e 100644 --- flang/lib/Lower/OpenMP/PrivateReductionUtils.h +++ flang/lib/Lower/OpenMP/PrivateReductionUtils.h @@ -20,6 +20,12 @@ namespace mlir { class Region; } // namespace mlir +namespace Fortran { +namespace semantics { +class Symbol; +} // namespace semantics +} // namespace Fortran + namespace fir { class FirOpBuilder; class ShapeShiftOp; @@ -27,18 +33,29 @@ class ShapeShiftOp; namespace Fortran { namespace lower { +class AbstractConverter; + namespace omp { +enum class DeclOperationKind { Private, FirstPrivate, Reduction }; +inline bool isPrivatization(DeclOperationKind kind) { + return (kind == DeclOperationKind::FirstPrivate) || + (kind == DeclOperationKind::Private); +} +inline bool isReduction(DeclOperationKind kind) { + return kind == DeclOperationKind::Reduction; +} + /// Generate init and cleanup regions suitable for reduction or privatizer /// declarations. `scalarInitValue` may be nullptr if there is no default -/// initialization (for privatization). -void populateByRefInitAndCleanupRegions(fir::FirOpBuilder &builder, - mlir::Location loc, mlir::Type argType, - mlir::Value scalarInitValue, - mlir::Block *initBlock, - mlir::Value allocatedPrivVarArg, - mlir::Value moldArg, - mlir::Region &cleanupRegion); +/// initialization (for privatization). `kind` should be set to indicate +/// what kind of operation definition this initialization belongs to. +void populateByRefInitAndCleanupRegions( + AbstractConverter &converter, mlir::Location loc, mlir::Type argType, + mlir::Value scalarInitValue, mlir::Block *initBlock, + mlir::Value allocatedPrivVarArg, mlir::Value moldArg, + mlir::Region &cleanupRegion, DeclOperationKind kind, + const Fortran::semantics::Symbol *sym = nullptr); /// Generate a fir::ShapeShift op describing the provided boxed array. 
fir::ShapeShiftOp getShapeShift(fir::FirOpBuilder &builder, mlir::Location loc, diff --git flang/lib/Lower/OpenMP/ReductionProcessor.cpp flang/lib/Lower/OpenMP/ReductionProcessor.cpp index 2cd21107a916..4a811f1bdfdf 100644 --- flang/lib/Lower/OpenMP/ReductionProcessor.cpp +++ flang/lib/Lower/OpenMP/ReductionProcessor.cpp @@ -410,10 +410,11 @@ static mlir::Type unwrapSeqOrBoxedType(mlir::Type ty) { } static void createReductionAllocAndInitRegions( - fir::FirOpBuilder &builder, mlir::Location loc, + AbstractConverter &converter, mlir::Location loc, mlir::omp::DeclareReductionOp &reductionDecl, const ReductionProcessor::ReductionIdentifier redId, mlir::Type type, bool isByRef) { + fir::FirOpBuilder &builder = converter.getFirOpBuilder(); auto yield = [&](mlir::Value ret) { builder.create<mlir::omp::YieldOp>(loc, ret); }; @@ -439,10 +440,11 @@ static void createReductionAllocAndInitRegions( loc, unwrapSeqOrBoxedType(ty), redId, builder); if (isByRef) { - populateByRefInitAndCleanupRegions(builder, loc, type, initValue, initBlock, - reductionDecl.getInitializerAllocArg(), - reductionDecl.getInitializerMoldArg(), - reductionDecl.getCleanupRegion()); + populateByRefInitAndCleanupRegions( + converter, loc, type, initValue, initBlock, + reductionDecl.getInitializerAllocArg(), + reductionDecl.getInitializerMoldArg(), reductionDecl.getCleanupRegion(), + DeclOperationKind::Reduction); } if (fir::isa_trivial(ty)) { @@ -466,9 +468,10 @@ static void createReductionAllocAndInitRegions( } mlir::omp::DeclareReductionOp ReductionProcessor::createDeclareReduction( - fir::FirOpBuilder &builder, llvm::StringRef reductionOpName, + AbstractConverter &converter, llvm::StringRef reductionOpName, const ReductionIdentifier redId, mlir::Type type, mlir::Location loc, bool isByRef) { + fir::FirOpBuilder &builder = converter.getFirOpBuilder(); mlir::OpBuilder::InsertionGuard guard(builder); mlir::ModuleOp module = builder.getModule(); @@ -486,7 +489,8 @@ mlir::omp::DeclareReductionOp ReductionProcessor::createDeclareReduction( decl = modBuilder.create<mlir::omp::DeclareReductionOp>(loc, reductionOpName, type); - createReductionAllocAndInitRegions(builder, loc, decl, redId, type, isByRef); + createReductionAllocAndInitRegions(converter, loc, decl, redId, type, + isByRef); builder.createBlock(&decl.getReductionRegion(), decl.getReductionRegion().end(), {type, type}, @@ -645,7 +649,7 @@ void ReductionProcessor::addDeclareReduction( TODO(currentLocation, "Unexpected reduction type"); } - decl = createDeclareReduction(firOpBuilder, reductionName, redId, redType, + decl = createDeclareReduction(converter, reductionName, redId, redType, currentLocation, isByRef); reductionDeclSymbols.push_back( mlir::SymbolRefAttr::get(firOpBuilder.getContext(), decl.getSymName())); diff --git flang/lib/Lower/OpenMP/ReductionProcessor.h flang/lib/Lower/OpenMP/ReductionProcessor.h index 5f4d742b62cb..d7d9b067e0ba 100644 --- flang/lib/Lower/OpenMP/ReductionProcessor.h +++ flang/lib/Lower/OpenMP/ReductionProcessor.h @@ -113,7 +113,7 @@ public: /// value `initValue`, and the reduction combiner carried over from `reduce`. /// TODO: add atomic region. 
  static mlir::omp::DeclareReductionOp
-  createDeclareReduction(fir::FirOpBuilder &builder,
+  createDeclareReduction(AbstractConverter &converter,
                          llvm::StringRef reductionOpName,
                          const ReductionIdentifier redId, mlir::Type type,
                          mlir::Location loc, bool isByRef);
diff --git flang/lib/Lower/Support/Utils.cpp flang/lib/Lower/Support/Utils.cpp
new file mode 100644
index 000000000000..5a9a83933036
--- /dev/null
+++ flang/lib/Lower/Support/Utils.cpp
@@ -0,0 +1,605 @@
+//===-- Lower/Support/Utils.cpp -- utilities --------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Coding style: https://mlir.llvm.org/getting_started/DeveloperGuide/
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Lower/Support/Utils.h"
+
+#include "flang/Common/indirection.h"
+#include "flang/Lower/IterationSpace.h"
+#include "flang/Semantics/tools.h"
+#include <cstdint>
+#include <optional>
+#include <type_traits>
+
+namespace Fortran::lower {
+// Fortran::evaluate::Expr are functional values organized like an AST. A
+// Fortran::evaluate::Expr is meant to be moved and cloned. Using the front end
+// tools can often cause copies and extra wrapper classes to be added to any
+// Fortran::evaluate::Expr. These values should not be assumed or relied upon to
+// have an *object* identity. They are deeply recursive, irregular structures
+// built from a large number of classes which do not use inheritance and
+// necessitate a large volume of boilerplate code as a result.
+//
+// Contrastingly, LLVM data structures make ubiquitous assumptions about an
+// object's identity via pointers to the object. An object's location in memory
+// is thus very often an identifying relation.
+
+// This class defines a hash computation of a Fortran::evaluate::Expr tree value
+// so it can be used with llvm::DenseMap. The Fortran::evaluate::Expr need not
+// have the same address.
+class HashEvaluateExpr {
+public:
+  // A Se::Symbol is the only part of a Fortran::evaluate::Expr with an
+  // identity property.
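+  // For orientation (hypothetical wiring, not defined in this file): these
+  // hashes are meant to pair with IsEqualEvaluateExpr below, e.g. in a
+  // llvm::DenseMapInfo over expression pointers:
+  //   static unsigned getHashValue(const SomeExpr *e) {
+  //     return HashEvaluateExpr::getHashValue(*e);
+  //   }
+  //   static bool isEqual(const SomeExpr *a, const SomeExpr *b) {
+  //     return IsEqualEvaluateExpr::isEqual(*a, *b);
+  //   }
+  // so structurally identical expressions at different addresses land in the
+  // same bucket on purpose.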
+ static unsigned getHashValue(const Fortran::semantics::Symbol &x) { + return static_cast<unsigned>(reinterpret_cast<std::intptr_t>(&x)); + } + template <typename A, bool COPY> + static unsigned getHashValue(const Fortran::common::Indirection<A, COPY> &x) { + return getHashValue(x.value()); + } + template <typename A> + static unsigned getHashValue(const std::optional<A> &x) { + if (x.has_value()) + return getHashValue(x.value()); + return 0u; + } + static unsigned getHashValue(const Fortran::evaluate::Subscript &x) { + return Fortran::common::visit( + [&](const auto &v) { return getHashValue(v); }, x.u); + } + static unsigned getHashValue(const Fortran::evaluate::Triplet &x) { + return getHashValue(x.lower()) - getHashValue(x.upper()) * 5u - + getHashValue(x.stride()) * 11u; + } + static unsigned getHashValue(const Fortran::evaluate::Component &x) { + return getHashValue(x.base()) * 83u - getHashValue(x.GetLastSymbol()); + } + static unsigned getHashValue(const Fortran::evaluate::ArrayRef &x) { + unsigned subs = 1u; + for (const Fortran::evaluate::Subscript &v : x.subscript()) + subs -= getHashValue(v); + return getHashValue(x.base()) * 89u - subs; + } + static unsigned getHashValue(const Fortran::evaluate::CoarrayRef &x) { + unsigned subs = 1u; + for (const Fortran::evaluate::Subscript &v : x.subscript()) + subs -= getHashValue(v); + unsigned cosubs = 3u; + for (const Fortran::evaluate::Expr<Fortran::evaluate::SubscriptInteger> &v : + x.cosubscript()) + cosubs -= getHashValue(v); + unsigned syms = 7u; + for (const Fortran::evaluate::SymbolRef &v : x.base()) + syms += getHashValue(v); + return syms * 97u - subs - cosubs + getHashValue(x.stat()) + 257u + + getHashValue(x.team()); + } + static unsigned getHashValue(const Fortran::evaluate::NamedEntity &x) { + if (x.IsSymbol()) + return getHashValue(x.GetFirstSymbol()) * 11u; + return getHashValue(x.GetComponent()) * 13u; + } + static unsigned getHashValue(const Fortran::evaluate::DataRef &x) { + return Fortran::common::visit( + [&](const auto &v) { return getHashValue(v); }, x.u); + } + static unsigned getHashValue(const Fortran::evaluate::ComplexPart &x) { + return getHashValue(x.complex()) - static_cast<unsigned>(x.part()); + } + template <Fortran::common::TypeCategory TC1, int KIND, + Fortran::common::TypeCategory TC2> + static unsigned getHashValue( + const Fortran::evaluate::Convert<Fortran::evaluate::Type<TC1, KIND>, TC2> + &x) { + return getHashValue(x.left()) - (static_cast<unsigned>(TC1) + 2u) - + (static_cast<unsigned>(KIND) + 5u); + } + template <int KIND> + static unsigned + getHashValue(const Fortran::evaluate::ComplexComponent<KIND> &x) { + return getHashValue(x.left()) - + (static_cast<unsigned>(x.isImaginaryPart) + 1u) * 3u; + } + template <typename T> + static unsigned getHashValue(const Fortran::evaluate::Parentheses<T> &x) { + return getHashValue(x.left()) * 17u; + } + template <Fortran::common::TypeCategory TC, int KIND> + static unsigned getHashValue( + const Fortran::evaluate::Negate<Fortran::evaluate::Type<TC, KIND>> &x) { + return getHashValue(x.left()) - (static_cast<unsigned>(TC) + 5u) - + (static_cast<unsigned>(KIND) + 7u); + } + template <Fortran::common::TypeCategory TC, int KIND> + static unsigned getHashValue( + const Fortran::evaluate::Add<Fortran::evaluate::Type<TC, KIND>> &x) { + return (getHashValue(x.left()) + getHashValue(x.right())) * 23u + + static_cast<unsigned>(TC) + static_cast<unsigned>(KIND); + } + template <Fortran::common::TypeCategory TC, int KIND> + static unsigned getHashValue( + const 
Fortran::evaluate::Subtract<Fortran::evaluate::Type<TC, KIND>> &x) { + return (getHashValue(x.left()) - getHashValue(x.right())) * 19u + + static_cast<unsigned>(TC) + static_cast<unsigned>(KIND); + } + template <Fortran::common::TypeCategory TC, int KIND> + static unsigned getHashValue( + const Fortran::evaluate::Multiply<Fortran::evaluate::Type<TC, KIND>> &x) { + return (getHashValue(x.left()) + getHashValue(x.right())) * 29u + + static_cast<unsigned>(TC) + static_cast<unsigned>(KIND); + } + template <Fortran::common::TypeCategory TC, int KIND> + static unsigned getHashValue( + const Fortran::evaluate::Divide<Fortran::evaluate::Type<TC, KIND>> &x) { + return (getHashValue(x.left()) - getHashValue(x.right())) * 31u + + static_cast<unsigned>(TC) + static_cast<unsigned>(KIND); + } + template <Fortran::common::TypeCategory TC, int KIND> + static unsigned getHashValue( + const Fortran::evaluate::Power<Fortran::evaluate::Type<TC, KIND>> &x) { + return (getHashValue(x.left()) - getHashValue(x.right())) * 37u + + static_cast<unsigned>(TC) + static_cast<unsigned>(KIND); + } + template <Fortran::common::TypeCategory TC, int KIND> + static unsigned getHashValue( + const Fortran::evaluate::Extremum<Fortran::evaluate::Type<TC, KIND>> &x) { + return (getHashValue(x.left()) + getHashValue(x.right())) * 41u + + static_cast<unsigned>(TC) + static_cast<unsigned>(KIND) + + static_cast<unsigned>(x.ordering) * 7u; + } + template <Fortran::common::TypeCategory TC, int KIND> + static unsigned getHashValue( + const Fortran::evaluate::RealToIntPower<Fortran::evaluate::Type<TC, KIND>> + &x) { + return (getHashValue(x.left()) - getHashValue(x.right())) * 43u + + static_cast<unsigned>(TC) + static_cast<unsigned>(KIND); + } + template <int KIND> + static unsigned + getHashValue(const Fortran::evaluate::ComplexConstructor<KIND> &x) { + return (getHashValue(x.left()) - getHashValue(x.right())) * 47u + + static_cast<unsigned>(KIND); + } + template <int KIND> + static unsigned getHashValue(const Fortran::evaluate::Concat<KIND> &x) { + return (getHashValue(x.left()) - getHashValue(x.right())) * 53u + + static_cast<unsigned>(KIND); + } + template <int KIND> + static unsigned getHashValue(const Fortran::evaluate::SetLength<KIND> &x) { + return (getHashValue(x.left()) - getHashValue(x.right())) * 59u + + static_cast<unsigned>(KIND); + } + static unsigned getHashValue(const Fortran::semantics::SymbolRef &sym) { + return getHashValue(sym.get()); + } + static unsigned getHashValue(const Fortran::evaluate::Substring &x) { + return 61u * + Fortran::common::visit( + [&](const auto &p) { return getHashValue(p); }, x.parent()) - + getHashValue(x.lower()) - (getHashValue(x.lower()) + 1u); + } + static unsigned + getHashValue(const Fortran::evaluate::StaticDataObject::Pointer &x) { + return llvm::hash_value(x->name()); + } + static unsigned getHashValue(const Fortran::evaluate::SpecificIntrinsic &x) { + return llvm::hash_value(x.name); + } + template <typename A> + static unsigned getHashValue(const Fortran::evaluate::Constant<A> &x) { + // FIXME: Should hash the content. 
+    return 103u;
+  }
+  static unsigned getHashValue(const Fortran::evaluate::ActualArgument &x) {
+    if (const Fortran::evaluate::Symbol *sym = x.GetAssumedTypeDummy())
+      return getHashValue(*sym);
+    return getHashValue(*x.UnwrapExpr());
+  }
+  static unsigned
+  getHashValue(const Fortran::evaluate::ProcedureDesignator &x) {
+    return Fortran::common::visit(
+        [&](const auto &v) { return getHashValue(v); }, x.u);
+  }
+  static unsigned getHashValue(const Fortran::evaluate::ProcedureRef &x) {
+    unsigned args = 13u;
+    for (const std::optional<Fortran::evaluate::ActualArgument> &v :
+         x.arguments())
+      args -= getHashValue(v);
+    return getHashValue(x.proc()) * 101u - args;
+  }
+  template <typename A>
+  static unsigned
+  getHashValue(const Fortran::evaluate::ArrayConstructor<A> &x) {
+    // FIXME: hash the contents.
+    return 127u;
+  }
+  static unsigned getHashValue(const Fortran::evaluate::ImpliedDoIndex &x) {
+    return llvm::hash_value(toStringRef(x.name).str()) * 131u;
+  }
+  static unsigned getHashValue(const Fortran::evaluate::TypeParamInquiry &x) {
+    return getHashValue(x.base()) * 137u - getHashValue(x.parameter()) * 3u;
+  }
+  static unsigned getHashValue(const Fortran::evaluate::DescriptorInquiry &x) {
+    return getHashValue(x.base()) * 139u -
+           static_cast<unsigned>(x.field()) * 13u +
+           static_cast<unsigned>(x.dimension());
+  }
+  static unsigned
+  getHashValue(const Fortran::evaluate::StructureConstructor &x) {
+    // FIXME: hash the contents.
+    return 149u;
+  }
+  template <int KIND>
+  static unsigned getHashValue(const Fortran::evaluate::Not<KIND> &x) {
+    return getHashValue(x.left()) * 61u + static_cast<unsigned>(KIND);
+  }
+  template <int KIND>
+  static unsigned
+  getHashValue(const Fortran::evaluate::LogicalOperation<KIND> &x) {
+    unsigned result = getHashValue(x.left()) + getHashValue(x.right());
+    return result * 67u + static_cast<unsigned>(x.logicalOperator) * 5u;
+  }
+  template <Fortran::common::TypeCategory TC, int KIND>
+  static unsigned getHashValue(
+      const Fortran::evaluate::Relational<Fortran::evaluate::Type<TC, KIND>>
+          &x) {
+    return (getHashValue(x.left()) + getHashValue(x.right())) * 71u +
+           static_cast<unsigned>(TC) + static_cast<unsigned>(KIND) +
+           static_cast<unsigned>(x.opr) * 11u;
+  }
+  template <typename A>
+  static unsigned getHashValue(const Fortran::evaluate::Expr<A> &x) {
+    return Fortran::common::visit(
+        [&](const auto &v) { return getHashValue(v); }, x.u);
+  }
+  static unsigned getHashValue(
+      const Fortran::evaluate::Relational<Fortran::evaluate::SomeType> &x) {
+    return Fortran::common::visit(
+        [&](const auto &v) { return getHashValue(v); }, x.u);
+  }
+  template <typename A>
+  static unsigned getHashValue(const Fortran::evaluate::Designator<A> &x) {
+    return Fortran::common::visit(
+        [&](const auto &v) { return getHashValue(v); }, x.u);
+  }
+  template <int BITS>
+  static unsigned
+  getHashValue(const Fortran::evaluate::value::Integer<BITS> &x) {
+    return static_cast<unsigned>(x.ToSInt());
+  }
+  static unsigned getHashValue(const Fortran::evaluate::NullPointer &x) {
+    return ~179u;
+  }
+};
+
+// Define the equality test for using Fortran::evaluate::Expr values with
+// llvm::DenseMap.
+class IsEqualEvaluateExpr {
+public:
+  // A Se::Symbol is the only part of a Fortran::evaluate::Expr with an
+  // identity property.
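Before the overloads below: this class's dispatch scheme, visiting two sum types at once with specific overloads for matching alternatives and a templated catch-all that rejects mismatched ones, reduces to a few lines of standard C++. A toy sketch, with std::variant and the Eq struct as stand-ins for the evaluate framework's union types:

#include <cassert>
#include <variant>

struct Eq {
  static bool isEqual(int a, int b) { return a == b; }
  static bool isEqual(double a, double b) { return a == b; }
  // Catch-all: two different alternatives can never be equal.
  template <typename A, typename B>
  static bool isEqual(const A &, const B &) { return false; }
};

int main() {
  std::variant<int, double> x = 1, y = 1.0, z = 1;
  auto eq = [](const auto &u, const auto &v) {
    return std::visit(
        [](const auto &a, const auto &b) { return Eq::isEqual(a, b); }, u, v);
  };
  assert(!eq(x, y)); // int vs. double alternatives: the catch-all wins
  assert(eq(x, z));  // same alternative, equal values
  return 0;
}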
+ static bool isEqual(const Fortran::semantics::Symbol &x, + const Fortran::semantics::Symbol &y) { + return isEqual(&x, &y); + } + static bool isEqual(const Fortran::semantics::Symbol *x, + const Fortran::semantics::Symbol *y) { + return x == y; + } + template <typename A, bool COPY> + static bool isEqual(const Fortran::common::Indirection<A, COPY> &x, + const Fortran::common::Indirection<A, COPY> &y) { + return isEqual(x.value(), y.value()); + } + template <typename A> + static bool isEqual(const std::optional<A> &x, const std::optional<A> &y) { + if (x.has_value() && y.has_value()) + return isEqual(x.value(), y.value()); + return !x.has_value() && !y.has_value(); + } + template <typename A> + static bool isEqual(const std::vector<A> &x, const std::vector<A> &y) { + if (x.size() != y.size()) + return false; + const std::size_t size = x.size(); + for (std::remove_const_t<decltype(size)> i = 0; i < size; ++i) + if (!isEqual(x[i], y[i])) + return false; + return true; + } + static bool isEqual(const Fortran::evaluate::Subscript &x, + const Fortran::evaluate::Subscript &y) { + return Fortran::common::visit( + [&](const auto &v, const auto &w) { return isEqual(v, w); }, x.u, y.u); + } + static bool isEqual(const Fortran::evaluate::Triplet &x, + const Fortran::evaluate::Triplet &y) { + return isEqual(x.lower(), y.lower()) && isEqual(x.upper(), y.upper()) && + isEqual(x.stride(), y.stride()); + } + static bool isEqual(const Fortran::evaluate::Component &x, + const Fortran::evaluate::Component &y) { + return isEqual(x.base(), y.base()) && + isEqual(x.GetLastSymbol(), y.GetLastSymbol()); + } + static bool isEqual(const Fortran::evaluate::ArrayRef &x, + const Fortran::evaluate::ArrayRef &y) { + return isEqual(x.base(), y.base()) && isEqual(x.subscript(), y.subscript()); + } + static bool isEqual(const Fortran::evaluate::CoarrayRef &x, + const Fortran::evaluate::CoarrayRef &y) { + return isEqual(x.base(), y.base()) && + isEqual(x.subscript(), y.subscript()) && + isEqual(x.cosubscript(), y.cosubscript()) && + isEqual(x.stat(), y.stat()) && isEqual(x.team(), y.team()); + } + static bool isEqual(const Fortran::evaluate::NamedEntity &x, + const Fortran::evaluate::NamedEntity &y) { + if (x.IsSymbol() && y.IsSymbol()) + return isEqual(x.GetFirstSymbol(), y.GetFirstSymbol()); + return !x.IsSymbol() && !y.IsSymbol() && + isEqual(x.GetComponent(), y.GetComponent()); + } + static bool isEqual(const Fortran::evaluate::DataRef &x, + const Fortran::evaluate::DataRef &y) { + return Fortran::common::visit( + [&](const auto &v, const auto &w) { return isEqual(v, w); }, x.u, y.u); + } + static bool isEqual(const Fortran::evaluate::ComplexPart &x, + const Fortran::evaluate::ComplexPart &y) { + return isEqual(x.complex(), y.complex()) && x.part() == y.part(); + } + template <typename A, Fortran::common::TypeCategory TC2> + static bool isEqual(const Fortran::evaluate::Convert<A, TC2> &x, + const Fortran::evaluate::Convert<A, TC2> &y) { + return isEqual(x.left(), y.left()); + } + template <int KIND> + static bool isEqual(const Fortran::evaluate::ComplexComponent<KIND> &x, + const Fortran::evaluate::ComplexComponent<KIND> &y) { + return isEqual(x.left(), y.left()) && + x.isImaginaryPart == y.isImaginaryPart; + } + template <typename T> + static bool isEqual(const Fortran::evaluate::Parentheses<T> &x, + const Fortran::evaluate::Parentheses<T> &y) { + return isEqual(x.left(), y.left()); + } + template <typename A> + static bool isEqual(const Fortran::evaluate::Negate<A> &x, + const Fortran::evaluate::Negate<A> &y) { + 
return isEqual(x.left(), y.left()); + } + template <typename A> + static bool isBinaryEqual(const A &x, const A &y) { + return isEqual(x.left(), y.left()) && isEqual(x.right(), y.right()); + } + template <typename A> + static bool isEqual(const Fortran::evaluate::Add<A> &x, + const Fortran::evaluate::Add<A> &y) { + return isBinaryEqual(x, y); + } + template <typename A> + static bool isEqual(const Fortran::evaluate::Subtract<A> &x, + const Fortran::evaluate::Subtract<A> &y) { + return isBinaryEqual(x, y); + } + template <typename A> + static bool isEqual(const Fortran::evaluate::Multiply<A> &x, + const Fortran::evaluate::Multiply<A> &y) { + return isBinaryEqual(x, y); + } + template <typename A> + static bool isEqual(const Fortran::evaluate::Divide<A> &x, + const Fortran::evaluate::Divide<A> &y) { + return isBinaryEqual(x, y); + } + template <typename A> + static bool isEqual(const Fortran::evaluate::Power<A> &x, + const Fortran::evaluate::Power<A> &y) { + return isBinaryEqual(x, y); + } + template <typename A> + static bool isEqual(const Fortran::evaluate::Extremum<A> &x, + const Fortran::evaluate::Extremum<A> &y) { + return isBinaryEqual(x, y); + } + template <typename A> + static bool isEqual(const Fortran::evaluate::RealToIntPower<A> &x, + const Fortran::evaluate::RealToIntPower<A> &y) { + return isBinaryEqual(x, y); + } + template <int KIND> + static bool isEqual(const Fortran::evaluate::ComplexConstructor<KIND> &x, + const Fortran::evaluate::ComplexConstructor<KIND> &y) { + return isBinaryEqual(x, y); + } + template <int KIND> + static bool isEqual(const Fortran::evaluate::Concat<KIND> &x, + const Fortran::evaluate::Concat<KIND> &y) { + return isBinaryEqual(x, y); + } + template <int KIND> + static bool isEqual(const Fortran::evaluate::SetLength<KIND> &x, + const Fortran::evaluate::SetLength<KIND> &y) { + return isBinaryEqual(x, y); + } + static bool isEqual(const Fortran::semantics::SymbolRef &x, + const Fortran::semantics::SymbolRef &y) { + return isEqual(x.get(), y.get()); + } + static bool isEqual(const Fortran::evaluate::Substring &x, + const Fortran::evaluate::Substring &y) { + return Fortran::common::visit( + [&](const auto &p, const auto &q) { return isEqual(p, q); }, + x.parent(), y.parent()) && + isEqual(x.lower(), y.lower()) && isEqual(x.upper(), y.upper()); + } + static bool isEqual(const Fortran::evaluate::StaticDataObject::Pointer &x, + const Fortran::evaluate::StaticDataObject::Pointer &y) { + return x->name() == y->name(); + } + static bool isEqual(const Fortran::evaluate::SpecificIntrinsic &x, + const Fortran::evaluate::SpecificIntrinsic &y) { + return x.name == y.name; + } + template <typename A> + static bool isEqual(const Fortran::evaluate::Constant<A> &x, + const Fortran::evaluate::Constant<A> &y) { + return x == y; + } + static bool isEqual(const Fortran::evaluate::ActualArgument &x, + const Fortran::evaluate::ActualArgument &y) { + if (const Fortran::evaluate::Symbol *xs = x.GetAssumedTypeDummy()) { + if (const Fortran::evaluate::Symbol *ys = y.GetAssumedTypeDummy()) + return isEqual(*xs, *ys); + return false; + } + return !y.GetAssumedTypeDummy() && + isEqual(*x.UnwrapExpr(), *y.UnwrapExpr()); + } + static bool isEqual(const Fortran::evaluate::ProcedureDesignator &x, + const Fortran::evaluate::ProcedureDesignator &y) { + return Fortran::common::visit( + [&](const auto &v, const auto &w) { return isEqual(v, w); }, x.u, y.u); + } + static bool isEqual(const Fortran::evaluate::ProcedureRef &x, + const Fortran::evaluate::ProcedureRef &y) { + return 
isEqual(x.proc(), y.proc()) && isEqual(x.arguments(), y.arguments()); + } + template <typename A> + static bool isEqual(const Fortran::evaluate::ArrayConstructor<A> &x, + const Fortran::evaluate::ArrayConstructor<A> &y) { + llvm::report_fatal_error("not implemented"); + } + static bool isEqual(const Fortran::evaluate::ImpliedDoIndex &x, + const Fortran::evaluate::ImpliedDoIndex &y) { + return toStringRef(x.name) == toStringRef(y.name); + } + static bool isEqual(const Fortran::evaluate::TypeParamInquiry &x, + const Fortran::evaluate::TypeParamInquiry &y) { + return isEqual(x.base(), y.base()) && isEqual(x.parameter(), y.parameter()); + } + static bool isEqual(const Fortran::evaluate::DescriptorInquiry &x, + const Fortran::evaluate::DescriptorInquiry &y) { + return isEqual(x.base(), y.base()) && x.field() == y.field() && + x.dimension() == y.dimension(); + } + static bool isEqual(const Fortran::evaluate::StructureConstructor &x, + const Fortran::evaluate::StructureConstructor &y) { + const auto &xValues = x.values(); + const auto &yValues = y.values(); + if (xValues.size() != yValues.size()) + return false; + if (x.derivedTypeSpec() != y.derivedTypeSpec()) + return false; + for (const auto &[xSymbol, xValue] : xValues) { + auto yIt = yValues.find(xSymbol); + // This should probably never happen, since the derived type + // should be the same. + if (yIt == yValues.end()) + return false; + if (!isEqual(xValue, yIt->second)) + return false; + } + return true; + } + template <int KIND> + static bool isEqual(const Fortran::evaluate::Not<KIND> &x, + const Fortran::evaluate::Not<KIND> &y) { + return isEqual(x.left(), y.left()); + } + template <int KIND> + static bool isEqual(const Fortran::evaluate::LogicalOperation<KIND> &x, + const Fortran::evaluate::LogicalOperation<KIND> &y) { + return isEqual(x.left(), y.left()) && isEqual(x.right(), y.right()); + } + template <typename A> + static bool isEqual(const Fortran::evaluate::Relational<A> &x, + const Fortran::evaluate::Relational<A> &y) { + return isEqual(x.left(), y.left()) && isEqual(x.right(), y.right()); + } + template <typename A> + static bool isEqual(const Fortran::evaluate::Expr<A> &x, + const Fortran::evaluate::Expr<A> &y) { + return Fortran::common::visit( + [&](const auto &v, const auto &w) { return isEqual(v, w); }, x.u, y.u); + } + static bool + isEqual(const Fortran::evaluate::Relational<Fortran::evaluate::SomeType> &x, + const Fortran::evaluate::Relational<Fortran::evaluate::SomeType> &y) { + return Fortran::common::visit( + [&](const auto &v, const auto &w) { return isEqual(v, w); }, x.u, y.u); + } + template <typename A> + static bool isEqual(const Fortran::evaluate::Designator<A> &x, + const Fortran::evaluate::Designator<A> &y) { + return Fortran::common::visit( + [&](const auto &v, const auto &w) { return isEqual(v, w); }, x.u, y.u); + } + template <int BITS> + static bool isEqual(const Fortran::evaluate::value::Integer<BITS> &x, + const Fortran::evaluate::value::Integer<BITS> &y) { + return x == y; + } + static bool isEqual(const Fortran::evaluate::NullPointer &x, + const Fortran::evaluate::NullPointer &y) { + return true; + } + template <typename A, typename B, + std::enable_if_t<!std::is_same_v<A, B>, bool> = true> + static bool isEqual(const A &, const B &) { + return false; + } +}; + +unsigned getHashValue(const Fortran::lower::SomeExpr *x) { + return HashEvaluateExpr::getHashValue(*x); +} + +unsigned getHashValue(const Fortran::lower::ExplicitIterSpace::ArrayBases &x) { + return Fortran::common::visit( + [&](const auto *p) { 
return HashEvaluateExpr::getHashValue(*p); }, x); +} + +bool isEqual(const Fortran::lower::SomeExpr *x, + const Fortran::lower::SomeExpr *y) { + const auto *empty = + llvm::DenseMapInfo<const Fortran::lower::SomeExpr *>::getEmptyKey(); + const auto *tombstone = + llvm::DenseMapInfo<const Fortran::lower::SomeExpr *>::getTombstoneKey(); + if (x == empty || y == empty || x == tombstone || y == tombstone) + return x == y; + return x == y || IsEqualEvaluateExpr::isEqual(*x, *y); +} + +bool isEqual(const Fortran::lower::ExplicitIterSpace::ArrayBases &x, + const Fortran::lower::ExplicitIterSpace::ArrayBases &y) { + return Fortran::common::visit( + Fortran::common::visitors{ + // Fortran::semantics::Symbol * are the exception here. These pointers + // have identity; if two Symbol * values are the same (different) then + // they are the same (different) logical symbol. + [&](Fortran::lower::FrontEndSymbol p, + Fortran::lower::FrontEndSymbol q) { return p == q; }, + [&](const auto *p, const auto *q) { + if constexpr (std::is_same_v<decltype(p), decltype(q)>) { + return IsEqualEvaluateExpr::isEqual(*p, *q); + } else { + // Different subtree types are never equal. + return false; + } + }}, + x, y); +} +} // end namespace Fortran::lower diff --git flang/lib/Optimizer/Builder/FIRBuilder.cpp flang/lib/Optimizer/Builder/FIRBuilder.cpp index af350d1331e5..d9779c46ae79 100644 --- flang/lib/Optimizer/Builder/FIRBuilder.cpp +++ flang/lib/Optimizer/Builder/FIRBuilder.cpp @@ -1778,7 +1778,7 @@ llvm::SmallVector<mlir::Value> fir::factory::updateRuntimeExtentsForEmptyArrays( isEmpty = builder.create<mlir::arith::OrIOp>(loc, isEmpty, isZero); } - llvm::SmallVector<mlir::Value, Fortran::common::maxRank> newExtents; + llvm::SmallVector<mlir::Value> newExtents; for (auto [zero, extent] : llvm::zip_equal(zeroes, extents)) { newExtents.push_back( builder.create<mlir::arith::SelectOp>(loc, isEmpty, zero, extent)); diff --git flang/lib/Optimizer/Builder/HLFIRTools.cpp flang/lib/Optimizer/Builder/HLFIRTools.cpp index f71adf123511..8993065c2bb6 100644 --- flang/lib/Optimizer/Builder/HLFIRTools.cpp +++ flang/lib/Optimizer/Builder/HLFIRTools.cpp @@ -221,6 +221,25 @@ bool hlfir::Entity::mayHaveNonDefaultLowerBounds() const { return true; } +mlir::Operation *traverseConverts(mlir::Operation *op) { + while (auto convert = llvm::dyn_cast_or_null<fir::ConvertOp>(op)) + op = convert.getValue().getDefiningOp(); + return op; +} + +bool hlfir::Entity::mayBeOptional() const { + if (!isVariable()) + return false; + // TODO: introduce a fir type to better identify optionals. + if (mlir::Operation *op = traverseConverts(getDefiningOp())) { + if (auto varIface = llvm::dyn_cast<fir::FortranVariableOpInterface>(op)) + return varIface.isOptional(); + return !llvm::isa<fir::AllocaOp, fir::AllocMemOp, fir::ReboxOp, + fir::EmboxOp, fir::LoadOp>(op); + } + return true; +} + fir::FortranVariableOpInterface hlfir::genDeclare(mlir::Location loc, fir::FirOpBuilder &builder, const fir::ExtendedValue &exv, llvm::StringRef name, @@ -963,9 +982,69 @@ llvm::SmallVector<mlir::Value> hlfir::genLoopNestWithReductions( return outerLoop->getResults(); } +template <typename Lambda> +static fir::ExtendedValue +conditionallyEvaluate(mlir::Location loc, fir::FirOpBuilder &builder, + mlir::Value condition, const Lambda &genIfTrue) { + mlir::OpBuilder::InsertPoint insertPt = builder.saveInsertionPoint(); + + // Evaluate in some region that will be moved into the actual ifOp (the actual + // ifOp can only be created when the result types are known). 
+ auto badIfOp = builder.create<fir::IfOp>(loc, condition.getType(), condition, + /*withElseRegion=*/false); + mlir::Block *preparationBlock = &badIfOp.getThenRegion().front(); + builder.setInsertionPointToStart(preparationBlock); + fir::ExtendedValue result = genIfTrue(); + fir::ResultOp resultOp = result.match( + [&](const fir::CharBoxValue &box) -> fir::ResultOp { + return builder.create<fir::ResultOp>( + loc, mlir::ValueRange{box.getAddr(), box.getLen()}); + }, + [&](const mlir::Value &addr) -> fir::ResultOp { + return builder.create<fir::ResultOp>(loc, addr); + }, + [&](const auto &) -> fir::ResultOp { + TODO(loc, "unboxing non scalar optional fir.box"); + }); + builder.restoreInsertionPoint(insertPt); + + // Create actual fir.if operation. + auto ifOp = + builder.create<fir::IfOp>(loc, resultOp->getOperandTypes(), condition, + /*withElseRegion=*/true); + // Move evaluation into Then block, + preparationBlock->moveBefore(&ifOp.getThenRegion().back()); + ifOp.getThenRegion().back().erase(); + // Create absent result in the Else block. + builder.setInsertionPointToStart(&ifOp.getElseRegion().front()); + llvm::SmallVector<mlir::Value> absentValues; + for (mlir::Type resTy : ifOp->getResultTypes()) { + if (fir::isa_ref_type(resTy) || fir::isa_box_type(resTy)) + absentValues.emplace_back(builder.create<fir::AbsentOp>(loc, resTy)); + else + absentValues.emplace_back(builder.create<fir::ZeroOp>(loc, resTy)); + } + builder.create<fir::ResultOp>(loc, absentValues); + badIfOp->erase(); + + // Build fir::ExtendedValue from the result values. + builder.setInsertionPointAfter(ifOp); + return result.match( + [&](const fir::CharBoxValue &box) -> fir::ExtendedValue { + return fir::CharBoxValue{ifOp.getResult(0), ifOp.getResult(1)}; + }, + [&](const mlir::Value &) -> fir::ExtendedValue { + return ifOp.getResult(0); + }, + [&](const auto &) -> fir::ExtendedValue { + TODO(loc, "unboxing non scalar optional fir.box"); + }); +} + static fir::ExtendedValue translateVariableToExtendedValue( mlir::Location loc, fir::FirOpBuilder &builder, hlfir::Entity variable, - bool forceHlfirBase = false, bool contiguousHint = false) { + bool forceHlfirBase = false, bool contiguousHint = false, + bool keepScalarOptionalBoxed = false) { assert(variable.isVariable() && "must be a variable"); // When going towards FIR, use the original base value to avoid // introducing descriptors at runtime when they are not required. 
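Stated in plain C++ terms, the conditionallyEvaluate helper above computes the following. This is a behavioural sketch only; the real helper builds a fir.if in two phases (first into a placeholder op to discover the result types, then into the final op) and yields fir.absent/fir.zero values in the else branch rather than an empty optional.

#include <cassert>
#include <optional>

// Run the generator only when the presence test holds; otherwise produce
// an "absent" placeholder result.
template <typename Gen>
auto conditionallyEvaluate(bool isPresent, const Gen &genIfTrue)
    -> std::optional<decltype(genIfTrue())> {
  if (isPresent)
    return genIfTrue();
  return std::nullopt; // models the fir.absent/fir.zero else branch
}

int main() {
  auto genAddress = [] { return 42; }; // stand-in for the address computation
  assert(conditionallyEvaluate(true, genAddress) == 42);
  assert(!conditionallyEvaluate(false, genAddress).has_value());
  return 0;
}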
@@ -984,14 +1063,33 @@ static fir::ExtendedValue translateVariableToExtendedValue( const bool contiguous = variable.isSimplyContiguous() || contiguousHint; const bool isAssumedRank = variable.isAssumedRank(); if (!contiguous || variable.isPolymorphic() || - variable.isDerivedWithLengthParameters() || variable.isOptional() || - isAssumedRank) { + variable.isDerivedWithLengthParameters() || isAssumedRank) { llvm::SmallVector<mlir::Value> nonDefaultLbounds; if (!isAssumedRank) nonDefaultLbounds = getNonDefaultLowerBounds(loc, builder, variable); return fir::BoxValue(base, nonDefaultLbounds, getExplicitTypeParams(variable)); } + if (variable.mayBeOptional()) { + if (!keepScalarOptionalBoxed && variable.isScalar()) { + mlir::Value isPresent = builder.create<fir::IsPresentOp>( + loc, builder.getI1Type(), variable); + return conditionallyEvaluate( + loc, builder, isPresent, [&]() -> fir::ExtendedValue { + mlir::Value base = genVariableRawAddress(loc, builder, variable); + if (variable.isCharacter()) { + mlir::Value len = + genCharacterVariableLength(loc, builder, variable); + return fir::CharBoxValue{base, len}; + } + return base; + }); + } + llvm::SmallVector<mlir::Value> nonDefaultLbounds = + getNonDefaultLowerBounds(loc, builder, variable); + return fir::BoxValue(base, nonDefaultLbounds, + getExplicitTypeParams(variable)); + } // Otherwise, the variable can be represented in a fir::ExtendedValue // without the overhead of a fir.box. base = genVariableRawAddress(loc, builder, variable); @@ -1035,10 +1133,12 @@ hlfir::translateToExtendedValue(mlir::Location loc, fir::FirOpBuilder &builder, std::pair<fir::ExtendedValue, std::optional<hlfir::CleanupFunction>> hlfir::translateToExtendedValue(mlir::Location loc, fir::FirOpBuilder &builder, - hlfir::Entity entity, bool contiguousHint) { + hlfir::Entity entity, bool contiguousHint, + bool keepScalarOptionalBoxed) { if (entity.isVariable()) return {translateVariableToExtendedValue(loc, builder, entity, false, - contiguousHint), + contiguousHint, + keepScalarOptionalBoxed), std::nullopt}; if (entity.isProcedure()) { @@ -1094,7 +1194,9 @@ hlfir::convertToBox(mlir::Location loc, fir::FirOpBuilder &builder, if (entity.isProcedurePointer()) entity = hlfir::derefPointersAndAllocatables(loc, builder, entity); - auto [exv, cleanup] = translateToExtendedValue(loc, builder, entity); + auto [exv, cleanup] = + translateToExtendedValue(loc, builder, entity, /*contiguousHint=*/false, + /*keepScalarOptionalBoxed=*/true); // Procedure entities should not go through createBoxValue that embox // object entities. Return the fir.boxproc directly. if (entity.isProcedure()) diff --git flang/lib/Optimizer/Builder/IntrinsicCall.cpp flang/lib/Optimizer/Builder/IntrinsicCall.cpp index d7d1471535db..ec1fc24bf819 100644 --- flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -6366,10 +6366,12 @@ mlir::Value IntrinsicLibrary::genNearest(mlir::Type resultType, mlir::FloatType xType = mlir::dyn_cast<mlir::FloatType>(x.getType()); const unsigned xBitWidth = xType.getWidth(); mlir::Type i1Ty = builder.getI1Type(); - if constexpr (proc == NearestProc::NextAfter) + if constexpr (proc == NearestProc::NextAfter) { // If isNan(Y), set X to a qNaN that will propagate to the resultIsX result. 
-    x = builder.create<mlir::arith::SelectOp>(
-        loc, genIsFPClass(i1Ty, args[1], nanTest), genQNan(xType), x);
+    mlir::Value qNan = genQNan(xType);
+    mlir::Value isFPClass = genIsFPClass(i1Ty, args[1], nanTest);
+    x = builder.create<mlir::arith::SelectOp>(loc, isFPClass, qNan, x);
+  }
   mlir::Value resultIsX = genIsFPClass(i1Ty, x, nanTest);
   mlir::Type intType = builder.getIntegerType(xBitWidth);
   mlir::Value one = builder.createIntegerConstant(loc, intType, 1);
@@ -6489,12 +6491,11 @@ mlir::Value IntrinsicLibrary::genNearest(mlir::Type resultType,
   } else {
     // Kind 2, 3, 4, 8, 16. Increment or decrement X cast to integer.
     mlir::Value intX = builder.create<mlir::arith::BitcastOp>(loc, intType, x);
+    mlir::Value add = builder.create<mlir::arith::AddIOp>(loc, intX, one);
+    mlir::Value sub = builder.create<mlir::arith::SubIOp>(loc, intX, one);
     result = builder.create<mlir::arith::BitcastOp>(
         loc, resultType,
-        builder.create<mlir::arith::SelectOp>(
-            loc, magnitudeUp,
-            builder.create<mlir::arith::AddIOp>(loc, intX, one),
-            builder.create<mlir::arith::SubIOp>(loc, intX, one)));
+        builder.create<mlir::arith::SelectOp>(loc, magnitudeUp, add, sub));
     if constexpr (proc == NearestProc::Nearest ||
                   proc == NearestProc::NextAfter) {
       genRaiseExcept(_FORTRAN_RUNTIME_IEEE_OVERFLOW |
diff --git flang/lib/Optimizer/CodeGen/CodeGen.cpp flang/lib/Optimizer/CodeGen/CodeGen.cpp
index 1b078be7bb1c..cb4eb8303a49 100644
--- flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -1190,7 +1190,7 @@ genCUFAllocDescriptor(mlir::Location loc, mlir::ModuleOp mod,
                       fir::BaseBoxType boxTy,
                       const fir::LLVMTypeConverter &typeConverter) {
   std::optional<mlir::DataLayout> dl =
-      fir::support::getOrSetDataLayout(mod, /*allowDefaultLayout=*/true);
+      fir::support::getOrSetMLIRDataLayout(mod, /*allowDefaultLayout=*/true);
   if (!dl)
     mlir::emitError(mod.getLoc(),
                     "module operation must carry a data layout attribute "
@@ -3942,7 +3942,7 @@ public:
       return signalPassFailure();
     std::optional<mlir::DataLayout> dl =
-        fir::support::getOrSetDataLayout(mod, /*allowDefaultLayout=*/true);
+        fir::support::getOrSetMLIRDataLayout(mod, /*allowDefaultLayout=*/true);
     if (!dl) {
       mlir::emitError(mod.getLoc(),
                       "module operation must carry a data layout attribute "
diff --git flang/lib/Optimizer/CodeGen/CodeGenOpenMP.cpp flang/lib/Optimizer/CodeGen/CodeGenOpenMP.cpp
index da13ed648e44..37f1c9f97e1c 100644
--- flang/lib/Optimizer/CodeGen/CodeGenOpenMP.cpp
+++ flang/lib/Optimizer/CodeGen/CodeGenOpenMP.cpp
@@ -90,9 +90,45 @@ struct MapInfoOpConversion
     return mlir::success();
   }
 };
+
+// FIR op specific conversion for PrivateClauseOp that overrides the default
+// OpenMP dialect lowering; this allows FIR-aware lowering of types, required
+// for boxes because the OpenMP dialect conversion doesn't know anything about
+// FIR types.
+struct PrivateClauseOpConversion + : public OpenMPFIROpConversion<mlir::omp::PrivateClauseOp> { + using OpenMPFIROpConversion::OpenMPFIROpConversion; + + llvm::LogicalResult + matchAndRewrite(mlir::omp::PrivateClauseOp curOp, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const override { + const fir::LLVMTypeConverter &converter = lowerTy(); + mlir::Type convertedAllocType; + if (auto box = mlir::dyn_cast<fir::BaseBoxType>(curOp.getType())) { + // In LLVM codegen fir.box<> == fir.ref<fir.box<>> == llvm.ptr + // Here we really do want the actual structure + if (box.isAssumedRank()) + TODO(curOp->getLoc(), "Privatize an assumed rank array"); + unsigned rank = 0; + if (auto seqTy = mlir::dyn_cast<fir::SequenceType>( + fir::unwrapRefType(box.getEleTy()))) + rank = seqTy.getShape().size(); + convertedAllocType = converter.convertBoxTypeAsStruct(box, rank); + } else { + convertedAllocType = converter.convertType(adaptor.getType()); + } + if (!convertedAllocType) + return mlir::failure(); + rewriter.startOpModification(curOp); + curOp.setType(convertedAllocType); + rewriter.finalizeOpModification(curOp); + return mlir::success(); + } +}; } // namespace void fir::populateOpenMPFIRToLLVMConversionPatterns( const LLVMTypeConverter &converter, mlir::RewritePatternSet &patterns) { patterns.add<MapInfoOpConversion>(converter); + patterns.add<PrivateClauseOpConversion>(converter); } diff --git flang/lib/Optimizer/CodeGen/TargetRewrite.cpp flang/lib/Optimizer/CodeGen/TargetRewrite.cpp index b0b9499557e2..c099a08ffd30 100644 --- flang/lib/Optimizer/CodeGen/TargetRewrite.cpp +++ flang/lib/Optimizer/CodeGen/TargetRewrite.cpp @@ -107,7 +107,7 @@ public: // TargetRewrite will require querying the type storage sizes, if it was // not set already, create a DataLayoutSpec for the ModuleOp now. std::optional<mlir::DataLayout> dl = - fir::support::getOrSetDataLayout(mod, /*allowDefaultLayout=*/true); + fir::support::getOrSetMLIRDataLayout(mod, /*allowDefaultLayout=*/true); if (!dl) { mlir::emitError(mod.getLoc(), "module operation must carry a data layout attribute " @@ -518,6 +518,7 @@ public: newOpers.insert(newOpers.end(), trailingOpers.begin(), trailingOpers.end()); llvm::SmallVector<mlir::Value, 1> newCallResults; + // TODO propagate/update call argument and result attributes. if constexpr (std::is_same_v<std::decay_t<A>, mlir::gpu::LaunchFuncOp>) { auto newCall = rewriter->create<A>( loc, callOp.getKernel(), callOp.getGridSizeOperandValues(), @@ -557,6 +558,7 @@ public: loc, newResTys, rewriter->getStringAttr(callOp.getMethod()), callOp.getOperands()[0], newOpers, rewriter->getI32IntegerAttr(*callOp.getPassArgPos() + passArgShift), + /*arg_attrs=*/nullptr, /*res_attrs=*/nullptr, callOp.getProcedureAttrsAttr()); if (wrap) newCallResults.push_back((*wrap)(dispatchOp.getOperation())); diff --git flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp index bd12700f1383..7c0fcba80686 100644 --- flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp +++ flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp @@ -121,8 +121,14 @@ protected: // simplified since the fir.box lowered here are now guarenteed to // contain the local lower bounds thanks to the hlfir.declare (the extra // rebox can be removed). 
-      auto [exv, cleanup] =
-          hlfir::translateToExtendedValue(loc, builder, entity);
+      // When taking arguments as descriptors, the runtime expects absent
+      // OPTIONAL to be a null pointer to a descriptor; lowering has already
+      // prepared such descriptors as needed, hence set
+      // keepScalarOptionalBoxed to avoid building descriptors with a null
+      // address for them.
+      auto [exv, cleanup] = hlfir::translateToExtendedValue(
+          loc, builder, entity, /*contiguous=*/false,
+          /*keepScalarOptionalBoxed=*/true);
       if (cleanup)
         cleanupFns.push_back(*cleanup);
       ret.emplace_back(exv);
diff --git flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp
index fe7ae0eeed3c..c1c3839c47e1 100644
--- flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp
+++ flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp
@@ -951,6 +951,219 @@ private:
   }
 };
+
+class ReshapeAsElementalConversion
+    : public mlir::OpRewritePattern<hlfir::ReshapeOp> {
+public:
+  using mlir::OpRewritePattern<hlfir::ReshapeOp>::OpRewritePattern;
+
+  llvm::LogicalResult
+  matchAndRewrite(hlfir::ReshapeOp reshape,
+                  mlir::PatternRewriter &rewriter) const override {
+    // Do not inline RESHAPE with ORDER yet. The runtime implementation
+    // may be good enough, unless the temporary creation overhead
+    // is high.
+    // TODO: If ORDER is constant, then we can still easily inline.
+    // TODO: If the result's rank is 1, then we can assume ORDER == (/1/).
+    if (reshape.getOrder())
+      return rewriter.notifyMatchFailure(reshape,
+                                         "RESHAPE with ORDER argument");
+
+    // Verify that the element types of ARRAY, PAD and the result
+    // match before doing any transformations. For example,
+    // the character types of different lengths may appear in the dead
+    // code, and it just does not make sense to inline hlfir.reshape
+    // in this case (a runtime call might have a smaller code size
+    // footprint).
+    hlfir::Entity result = hlfir::Entity{reshape};
+    hlfir::Entity array = hlfir::Entity{reshape.getArray()};
+    mlir::Type elementType = array.getFortranElementType();
+    if (result.getFortranElementType() != elementType)
+      return rewriter.notifyMatchFailure(
+          reshape, "ARRAY and result have different types");
+    mlir::Value pad = reshape.getPad();
+    if (pad && hlfir::getFortranElementType(pad.getType()) != elementType)
+      return rewriter.notifyMatchFailure(reshape,
+                                         "ARRAY and PAD have different types");
+
+    // TODO: selecting between ARRAY and PAD of non-trivial element types
+    // requires more work. We have to select between two references
+    // to elements in ARRAY and PAD. This requires conditional
+    // bufferization of the element, if ARRAY/PAD is an expression.
+    if (pad && !fir::isa_trivial(elementType))
+      return rewriter.notifyMatchFailure(reshape,
+                                         "PAD present with non-trivial type");
+
+    mlir::Location loc = reshape.getLoc();
+    fir::FirOpBuilder builder{rewriter, reshape.getOperation()};
+    // Assume that all the index arithmetic does not overflow
+    // the IndexType.
+    builder.setIntegerOverflowFlags(mlir::arith::IntegerOverflowFlags::nuw);
+
+    llvm::SmallVector<mlir::Value, 1> typeParams;
+    hlfir::genLengthParameters(loc, builder, array, typeParams);
+
+    // Fetch the extents of ARRAY, PAD and result beforehand.
+    llvm::SmallVector<mlir::Value, Fortran::common::maxRank> arrayExtents =
+        hlfir::genExtentsVector(loc, builder, array);
+
+    // If PAD is present, we have to use the array size to start taking
+    // elements from the PAD array.
+    mlir::Value arraySize =
+        pad ?
computeArraySize(loc, builder, arrayExtents) : nullptr; + hlfir::Entity shape = hlfir::Entity{reshape.getShape()}; + llvm::SmallVector<mlir::Value, Fortran::common::maxRank> resultExtents; + mlir::Type indexType = builder.getIndexType(); + for (int idx = 0; idx < result.getRank(); ++idx) + resultExtents.push_back(hlfir::loadElementAt( + loc, builder, shape, + builder.createIntegerConstant(loc, indexType, idx + 1))); + auto resultShape = builder.create<fir::ShapeOp>(loc, resultExtents); + + auto genKernel = [&](mlir::Location loc, fir::FirOpBuilder &builder, + mlir::ValueRange inputIndices) -> hlfir::Entity { + mlir::Value linearIndex = + computeLinearIndex(loc, builder, resultExtents, inputIndices); + fir::IfOp ifOp; + if (pad) { + // PAD is present. Check if this element comes from the PAD array. + mlir::Value isInsideArray = builder.create<mlir::arith::CmpIOp>( + loc, mlir::arith::CmpIPredicate::ult, linearIndex, arraySize); + ifOp = builder.create<fir::IfOp>(loc, elementType, isInsideArray, + /*withElseRegion=*/true); + + // In the 'else' block, return an element from the PAD. + builder.setInsertionPointToStart(&ifOp.getElseRegion().front()); + // PAD is dynamically optional, but we can unconditionally access it + // in the 'else' block. If we have to start taking elements from it, + // then it must be present in a valid program. + llvm::SmallVector<mlir::Value, Fortran::common::maxRank> padExtents = + hlfir::genExtentsVector(loc, builder, hlfir::Entity{pad}); + // Subtract the ARRAY size from the zero-based linear index + // to get the zero-based linear index into PAD. + mlir::Value padLinearIndex = + builder.create<mlir::arith::SubIOp>(loc, linearIndex, arraySize); + llvm::SmallVector<mlir::Value, Fortran::common::maxRank> padIndices = + delinearizeIndex(loc, builder, padExtents, padLinearIndex, + /*wrapAround=*/true); + mlir::Value padElement = + hlfir::loadElementAt(loc, builder, hlfir::Entity{pad}, padIndices); + builder.create<fir::ResultOp>(loc, padElement); + + // In the 'then' block, return an element from the ARRAY. + builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); + } + + llvm::SmallVector<mlir::Value, Fortran::common::maxRank> arrayIndices = + delinearizeIndex(loc, builder, arrayExtents, linearIndex, + /*wrapAround=*/false); + mlir::Value arrayElement = + hlfir::loadElementAt(loc, builder, array, arrayIndices); + + if (ifOp) { + builder.create<fir::ResultOp>(loc, arrayElement); + builder.setInsertionPointAfter(ifOp); + arrayElement = ifOp.getResult(0); + } + + return hlfir::Entity{arrayElement}; + }; + hlfir::ElementalOp elementalOp = hlfir::genElementalOp( + loc, builder, elementType, resultShape, typeParams, genKernel, + /*isUnordered=*/true, + /*polymorphicMold=*/result.isPolymorphic() ? 
array : mlir::Value{}, + reshape.getResult().getType()); + assert(elementalOp.getResult().getType() == reshape.getResult().getType()); + rewriter.replaceOp(reshape, elementalOp); + return mlir::success(); + } + +private: + /// Compute zero-based linear index given an array extents + /// and one-based indices: + /// \p extents: [e0, e1, ..., en] + /// \p indices: [i0, i1, ..., in] + /// + /// linear-index := + /// (...((in-1)*e(n-1)+(i(n-1)-1))*e(n-2)+...)*e0+(i0-1) + static mlir::Value computeLinearIndex(mlir::Location loc, + fir::FirOpBuilder &builder, + mlir::ValueRange extents, + mlir::ValueRange indices) { + std::size_t rank = extents.size(); + assert(rank == indices.size()); + mlir::Type indexType = builder.getIndexType(); + mlir::Value zero = builder.createIntegerConstant(loc, indexType, 0); + mlir::Value one = builder.createIntegerConstant(loc, indexType, 1); + mlir::Value linearIndex = zero; + std::size_t idx = 0; + for (auto index : llvm::reverse(indices)) { + mlir::Value tmp = builder.create<mlir::arith::SubIOp>( + loc, builder.createConvert(loc, indexType, index), one); + tmp = builder.create<mlir::arith::AddIOp>(loc, linearIndex, tmp); + if (idx + 1 < rank) + tmp = builder.create<mlir::arith::MulIOp>( + loc, tmp, + builder.createConvert(loc, indexType, extents[rank - idx - 2])); + + linearIndex = tmp; + ++idx; + } + return linearIndex; + } + + /// Compute one-based array indices from the given zero-based \p linearIndex + /// and the array \p extents [e0, e1, ..., en]. + /// i0 := linearIndex % e0 + 1 + /// linearIndex := linearIndex / e0 + /// i1 := linearIndex % e1 + 1 + /// linearIndex := linearIndex / e1 + /// ... + /// i(n-1) := linearIndex % e(n-1) + 1 + /// linearIndex := linearIndex / e(n-1) + /// if (wrapAround) { + /// // If the index is allowed to wrap around, then + /// // we need to modulo it by the last dimension's extent. + /// in := linearIndex % en + 1 + /// } else { + /// in := linearIndex + 1 + /// } + static llvm::SmallVector<mlir::Value, Fortran::common::maxRank> + delinearizeIndex(mlir::Location loc, fir::FirOpBuilder &builder, + mlir::ValueRange extents, mlir::Value linearIndex, + bool wrapAround) { + llvm::SmallVector<mlir::Value, Fortran::common::maxRank> indices; + mlir::Type indexType = builder.getIndexType(); + mlir::Value one = builder.createIntegerConstant(loc, indexType, 1); + linearIndex = builder.createConvert(loc, indexType, linearIndex); + + for (std::size_t dim = 0; dim < extents.size(); ++dim) { + mlir::Value extent = builder.createConvert(loc, indexType, extents[dim]); + // Avoid the modulo for the last index, unless wrap around is allowed. + mlir::Value currentIndex = linearIndex; + if (dim != extents.size() - 1 || wrapAround) + currentIndex = + builder.create<mlir::arith::RemUIOp>(loc, linearIndex, extent); + // The result of the last division is unused, so it will be DCEd. + linearIndex = + builder.create<mlir::arith::DivUIOp>(loc, linearIndex, extent); + indices.push_back( + builder.create<mlir::arith::AddIOp>(loc, currentIndex, one)); + } + return indices; + } + + /// Return size of an array given its extents. 
+ static mlir::Value computeArraySize(mlir::Location loc, + fir::FirOpBuilder &builder, + mlir::ValueRange extents) { + mlir::Type indexType = builder.getIndexType(); + mlir::Value size = builder.createIntegerConstant(loc, indexType, 1); + for (auto extent : extents) + size = builder.create<mlir::arith::MulIOp>( + loc, size, builder.createConvert(loc, indexType, extent)); + return size; + } +}; + class SimplifyHLFIRIntrinsics : public hlfir::impl::SimplifyHLFIRIntrinsicsBase<SimplifyHLFIRIntrinsics> { public: @@ -987,6 +1200,7 @@ public: patterns.insert<MatmulConversion<hlfir::MatmulOp>>(context); patterns.insert<DotProductConversion>(context); + patterns.insert<ReshapeAsElementalConversion>(context); if (mlir::failed(mlir::applyPatternsGreedily( getOperation(), std::move(patterns), config))) { diff --git flang/lib/Optimizer/OpenMP/MapsForPrivatizedSymbols.cpp flang/lib/Optimizer/OpenMP/MapsForPrivatizedSymbols.cpp index c990bebcabde..963ae863c1fc 100644 --- flang/lib/Optimizer/OpenMP/MapsForPrivatizedSymbols.cpp +++ flang/lib/Optimizer/OpenMP/MapsForPrivatizedSymbols.cpp @@ -55,15 +55,17 @@ class MapsForPrivatizedSymbolsPass std::underlying_type_t<llvm::omp::OpenMPOffloadMappingFlags>>( llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO); Operation *definingOp = var.getDefiningOp(); - auto declOp = llvm::dyn_cast_or_null<hlfir::DeclareOp>(definingOp); - assert(declOp && - "Expected defining Op of privatized var to be hlfir.declare"); + Value varPtr = var; // We want the first result of the hlfir.declare op because our goal // is to map the descriptor (fir.box or fir.boxchar) and the first // result for hlfir.declare is the descriptor if a the symbol being // decalred needs a descriptor. - Value varPtr = declOp.getBase(); + // Some types are boxed immediately before privatization. These have other + // operations in between the privatization and the declaration. It is safe + // to use var directly here because they will be boxed anyway. + if (auto declOp = llvm::dyn_cast_if_present<hlfir::DeclareOp>(definingOp)) + varPtr = declOp.getBase(); // If we do not have a reference to descritor, but the descriptor itself // then we need to store that on the stack so that we can map the diff --git flang/lib/Optimizer/Passes/Pipelines.cpp flang/lib/Optimizer/Passes/Pipelines.cpp index 1cc3f0b81c20..d55ad9e603ff 100644 --- flang/lib/Optimizer/Passes/Pipelines.cpp +++ flang/lib/Optimizer/Passes/Pipelines.cpp @@ -245,7 +245,15 @@ void createHLFIRToFIRPassPipeline(mlir::PassManager &pm, bool enableOpenMP, } pm.addPass(hlfir::createLowerHLFIROrderedAssignments()); pm.addPass(hlfir::createLowerHLFIRIntrinsics()); - pm.addPass(hlfir::createBufferizeHLFIR()); + + hlfir::BufferizeHLFIROptions bufferizeOptions; + // For opt-for-speed, avoid running any of the loops resulting + // from hlfir.elemental lowering, if the result is an empty array. + // This helps to avoid long running loops for elementals with + // shapes like (0, HUGE). + if (optLevel.isOptimizingForSpeed()) + bufferizeOptions.optimizeEmptyElementals = true; + pm.addPass(hlfir::createBufferizeHLFIR(bufferizeOptions)); // Run hlfir.assign inlining again after BufferizeHLFIR, // because the latter may introduce new hlfir.assign operations, // e.g. 
for copying an array into a temporary due to diff --git flang/lib/Optimizer/Support/DataLayout.cpp flang/lib/Optimizer/Support/DataLayout.cpp index 93a3b92d0810..f89ce5984b91 100644 --- flang/lib/Optimizer/Support/DataLayout.cpp +++ flang/lib/Optimizer/Support/DataLayout.cpp @@ -10,6 +10,7 @@ #include "flang/Optimizer/Dialect/Support/FIRContext.h" #include "flang/Optimizer/Support/FatalError.h" #include "mlir/Dialect/DLTI/DLTI.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/IR/BuiltinOps.h" #include "mlir/Interfaces/DataLayoutInterfaces.h" @@ -20,8 +21,9 @@ #include "llvm/Support/TargetSelect.h" #include "llvm/Target/TargetMachine.h" -void fir::support::setMLIRDataLayout(mlir::ModuleOp mlirModule, - const llvm::DataLayout &dl) { +namespace { +template <typename ModOpTy> +static void setDataLayout(ModOpTy mlirModule, const llvm::DataLayout &dl) { mlir::MLIRContext *context = mlirModule.getContext(); mlirModule->setAttr( mlir::LLVM::LLVMDialect::getDataLayoutAttrName(), @@ -30,12 +32,14 @@ void fir::support::setMLIRDataLayout(mlir::ModuleOp mlirModule, mlirModule->setAttr(mlir::DLTIDialect::kDataLayoutAttrName, dlSpec); } -void fir::support::setMLIRDataLayoutFromAttributes(mlir::ModuleOp mlirModule, - bool allowDefaultLayout) { +template <typename ModOpTy> +static void setDataLayoutFromAttributes(ModOpTy mlirModule, + bool allowDefaultLayout) { if (mlirModule.getDataLayoutSpec()) return; // Already set. - if (auto dataLayoutString = mlirModule->getAttrOfType<mlir::StringAttr>( - mlir::LLVM::LLVMDialect::getDataLayoutAttrName())) { + if (auto dataLayoutString = + mlirModule->template getAttrOfType<mlir::StringAttr>( + mlir::LLVM::LLVMDialect::getDataLayoutAttrName())) { llvm::DataLayout llvmDataLayout(dataLayoutString); fir::support::setMLIRDataLayout(mlirModule, llvmDataLayout); return; @@ -46,15 +50,48 @@ void fir::support::setMLIRDataLayoutFromAttributes(mlir::ModuleOp mlirModule, fir::support::setMLIRDataLayout(mlirModule, llvmDataLayout); } -std::optional<mlir::DataLayout> -fir::support::getOrSetDataLayout(mlir::ModuleOp mlirModule, - bool allowDefaultLayout) { - if (!mlirModule.getDataLayoutSpec()) { +template <typename ModOpTy> +static std::optional<mlir::DataLayout> +getOrSetDataLayout(ModOpTy mlirModule, bool allowDefaultLayout) { + if (!mlirModule.getDataLayoutSpec()) fir::support::setMLIRDataLayoutFromAttributes(mlirModule, allowDefaultLayout); - if (!mlirModule.getDataLayoutSpec()) { - return std::nullopt; - } - } + if (!mlirModule.getDataLayoutSpec() && + !mlir::isa<mlir::gpu::GPUModuleOp>(mlirModule)) + return std::nullopt; return mlir::DataLayout(mlirModule); } + +} // namespace + +void fir::support::setMLIRDataLayout(mlir::ModuleOp mlirModule, + const llvm::DataLayout &dl) { + setDataLayout(mlirModule, dl); +} + +void fir::support::setMLIRDataLayout(mlir::gpu::GPUModuleOp mlirModule, + const llvm::DataLayout &dl) { + setDataLayout(mlirModule, dl); +} + +void fir::support::setMLIRDataLayoutFromAttributes(mlir::ModuleOp mlirModule, + bool allowDefaultLayout) { + setDataLayoutFromAttributes(mlirModule, allowDefaultLayout); +} + +void fir::support::setMLIRDataLayoutFromAttributes( + mlir::gpu::GPUModuleOp mlirModule, bool allowDefaultLayout) { + setDataLayoutFromAttributes(mlirModule, allowDefaultLayout); +} + +std::optional<mlir::DataLayout> +fir::support::getOrSetMLIRDataLayout(mlir::ModuleOp mlirModule, + bool allowDefaultLayout) { + return getOrSetDataLayout(mlirModule, allowDefaultLayout); +} + 
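Stepping back to the RESHAPE inlining in SimplifyHLFIRIntrinsics.cpp above: computeLinearIndex and delinearizeIndex are ordinary mixed-radix arithmetic over column-major extents. A self-contained sketch of the same math, with std::size_t standing in for the index-typed SSA values the pass actually builds:

#include <cassert>
#include <cstddef>
#include <vector>

// Zero-based linear index from one-based column-major indices:
// (...((i_n-1)*e_(n-1) + (i_(n-1)-1))*e_(n-2) + ...)*e_0 + (i_0-1)
std::size_t linearize(const std::vector<std::size_t> &extents,
                      const std::vector<std::size_t> &indices) {
  std::size_t linear = 0;
  for (std::size_t k = indices.size(); k-- > 0;) {
    linear += indices[k] - 1;
    if (k > 0)
      linear *= extents[k - 1];
  }
  return linear;
}

// One-based indices from a zero-based linear index; with wrapAround the
// last dimension is taken modulo its extent (used for PAD repetition).
std::vector<std::size_t> delinearize(const std::vector<std::size_t> &extents,
                                     std::size_t linear, bool wrapAround) {
  std::vector<std::size_t> indices;
  for (std::size_t dim = 0; dim < extents.size(); ++dim) {
    std::size_t current = linear;
    if (dim + 1 < extents.size() || wrapAround)
      current = linear % extents[dim];
    linear /= extents[dim];
    indices.push_back(current + 1);
  }
  return indices;
}

int main() {
  std::vector<std::size_t> extents{2, 3, 4};
  std::vector<std::size_t> indices{2, 1, 3};
  std::size_t linear = linearize(extents, indices);
  assert(linear == 13);
  assert(delinearize(extents, linear, /*wrapAround=*/false) == indices);
  return 0;
}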
+std::optional<mlir::DataLayout> +fir::support::getOrSetMLIRDataLayout(mlir::gpu::GPUModuleOp mlirModule, + bool allowDefaultLayout) { + return getOrSetDataLayout(mlirModule, allowDefaultLayout); +} diff --git flang/lib/Optimizer/Transforms/AbstractResult.cpp flang/lib/Optimizer/Transforms/AbstractResult.cpp index b0327cc10e9d..f8badfa639f9 100644 --- flang/lib/Optimizer/Transforms/AbstractResult.cpp +++ flang/lib/Optimizer/Transforms/AbstractResult.cpp @@ -147,6 +147,7 @@ public: newResultTypes.emplace_back(getVoidPtrType(result.getContext())); Op newOp; + // TODO: propagate argument and result attributes (need to be shifted). // fir::CallOp specific handling. if constexpr (std::is_same_v<Op, fir::CallOp>) { if (op.getCallee()) { @@ -189,9 +190,11 @@ public: if (op.getPassArgPos()) passArgPos = rewriter.getI32IntegerAttr(*op.getPassArgPos() + passArgShift); + // TODO: propagate argument and result attributes (need to be shifted). newOp = rewriter.create<fir::DispatchOp>( loc, newResultTypes, rewriter.getStringAttr(op.getMethod()), op.getOperands()[0], newOperands, passArgPos, + /*arg_attrs=*/nullptr, /*res_attrs=*/nullptr, op.getProcedureAttrsAttr()); } diff --git flang/lib/Optimizer/Transforms/AddDebugInfo.cpp flang/lib/Optimizer/Transforms/AddDebugInfo.cpp index da0f11399d4f..3a5d7479bbf3 100644 --- flang/lib/Optimizer/Transforms/AddDebugInfo.cpp +++ flang/lib/Optimizer/Transforms/AddDebugInfo.cpp @@ -523,7 +523,7 @@ void AddDebugInfoPass::runOnOperation() { llvm::StringRef fileName; std::string filePath; std::optional<mlir::DataLayout> dl = - fir::support::getOrSetDataLayout(module, /*allowDefaultLayout=*/true); + fir::support::getOrSetMLIRDataLayout(module, /*allowDefaultLayout=*/true); if (!dl) { mlir::emitError(module.getLoc(), "Missing data layout attribute in module"); signalPassFailure(); diff --git flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp index 97551595db03..43ef6822de30 100644 --- flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp +++ flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp @@ -57,7 +57,7 @@ struct CUFAddConstructor auto funcTy = mlir::LLVM::LLVMFunctionType::get(voidTy, {}, /*isVarArg=*/false); std::optional<mlir::DataLayout> dl = - fir::support::getOrSetDataLayout(mod, /*allowDefaultLayout=*/false); + fir::support::getOrSetMLIRDataLayout(mod, /*allowDefaultLayout=*/false); if (!dl) { mlir::emitError(mod.getLoc(), "data layout attribute is required to perform " + diff --git flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp index 9a46323c1111..2a95e41944f3 100644 --- flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp +++ flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp @@ -188,8 +188,8 @@ public: if (!module) return signalPassFailure(); - std::optional<mlir::DataLayout> dl = - fir::support::getOrSetDataLayout(module, /*allowDefaultLayout=*/false); + std::optional<mlir::DataLayout> dl = fir::support::getOrSetMLIRDataLayout( + module, /*allowDefaultLayout=*/false); fir::LLVMTypeConverter typeConverter(module, /*applyTBAA=*/false, /*forceUnifiedTBAATree=*/false, *dl); cuf::populateCUFGPUToLLVMConversionPatterns(typeConverter, patterns); diff --git flang/lib/Optimizer/Transforms/CUFOpConversion.cpp flang/lib/Optimizer/Transforms/CUFOpConversion.cpp index 7d84d2f7e2e5..1f0576aa82f8 100644 --- flang/lib/Optimizer/Transforms/CUFOpConversion.cpp +++ flang/lib/Optimizer/Transforms/CUFOpConversion.cpp @@ -103,7 +103,7 @@ static 
mlir::LogicalResult convertOpToCall(OpTy op, mlir::Value sourceLine; if constexpr (std::is_same_v<OpTy, cuf::AllocateOp>) sourceLine = fir::factory::locationToLineNo( - builder, loc, op.getSource() ? fTy.getInput(6) : fTy.getInput(5)); + builder, loc, op.getSource() ? fTy.getInput(7) : fTy.getInput(6)); else sourceLine = fir::factory::locationToLineNo(builder, loc, fTy.getInput(4)); @@ -119,22 +119,28 @@ static mlir::LogicalResult convertOpToCall(OpTy op, } llvm::SmallVector<mlir::Value> args; if constexpr (std::is_same_v<OpTy, cuf::AllocateOp>) { + mlir::Value pinned = + op.getPinned() + ? op.getPinned() + : builder.createNullConstant( + loc, fir::ReferenceType::get( + mlir::IntegerType::get(op.getContext(), 1))); if (op.getSource()) { mlir::Value stream = op.getStream() ? op.getStream() : builder.createIntegerConstant(loc, fTy.getInput(2), -1); - args = fir::runtime::createArguments(builder, loc, fTy, op.getBox(), - op.getSource(), stream, hasStat, - errmsg, sourceFile, sourceLine); + args = fir::runtime::createArguments( + builder, loc, fTy, op.getBox(), op.getSource(), stream, pinned, + hasStat, errmsg, sourceFile, sourceLine); } else { mlir::Value stream = op.getStream() ? op.getStream() : builder.createIntegerConstant(loc, fTy.getInput(1), -1); args = fir::runtime::createArguments(builder, loc, fTy, op.getBox(), - stream, hasStat, errmsg, sourceFile, - sourceLine); + stream, pinned, hasStat, errmsg, + sourceFile, sourceLine); } } else { args = @@ -153,11 +159,6 @@ struct CUFAllocateOpConversion mlir::LogicalResult matchAndRewrite(cuf::AllocateOp op, mlir::PatternRewriter &rewriter) const override { - // TODO: Pinned is a reference to a logical value that can be set to true - // when pinned allocation succeed. This will require a new entry point. 
- if (op.getPinned()) - return mlir::failure(); - auto mod = op->getParentOfType<mlir::ModuleOp>(); fir::FirOpBuilder builder(rewriter, mod); mlir::Location loc = op.getLoc(); @@ -919,8 +920,8 @@ public: return signalPassFailure(); mlir::SymbolTable symtab(module); - std::optional<mlir::DataLayout> dl = - fir::support::getOrSetDataLayout(module, /*allowDefaultLayout=*/false); + std::optional<mlir::DataLayout> dl = fir::support::getOrSetMLIRDataLayout( + module, /*allowDefaultLayout=*/false); fir::LLVMTypeConverter typeConverter(module, /*applyTBAA=*/false, /*forceUnifiedTBAATree=*/false, *dl); target.addLegalDialect<fir::FIROpsDialect, mlir::arith::ArithDialect, diff --git flang/lib/Optimizer/Transforms/LoopVersioning.cpp flang/lib/Optimizer/Transforms/LoopVersioning.cpp index 343cc6ff2895..1f3495569c9d 100644 --- flang/lib/Optimizer/Transforms/LoopVersioning.cpp +++ flang/lib/Optimizer/Transforms/LoopVersioning.cpp @@ -312,8 +312,8 @@ void LoopVersioningPass::runOnOperation() { mlir::ModuleOp module = func->getParentOfType<mlir::ModuleOp>(); fir::KindMapping kindMap = fir::getKindMapping(module); mlir::SmallVector<ArgInfo, 4> argsOfInterest; - std::optional<mlir::DataLayout> dl = - fir::support::getOrSetDataLayout(module, /*allowDefaultLayout=*/false); + std::optional<mlir::DataLayout> dl = fir::support::getOrSetMLIRDataLayout( + module, /*allowDefaultLayout=*/false); if (!dl) mlir::emitError(module.getLoc(), "data layout attribute is required to perform " DEBUG_TYPE diff --git flang/lib/Optimizer/Transforms/PolymorphicOpConversion.cpp flang/lib/Optimizer/Transforms/PolymorphicOpConversion.cpp index 070889a284f4..0c78a878cdc5 100644 --- flang/lib/Optimizer/Transforms/PolymorphicOpConversion.cpp +++ flang/lib/Optimizer/Transforms/PolymorphicOpConversion.cpp @@ -205,8 +205,9 @@ struct DispatchOpConv : public OpConversionPattern<fir::DispatchOp> { // Make the call. llvm::SmallVector<mlir::Value> args{funcPtr}; args.append(dispatch.getArgs().begin(), dispatch.getArgs().end()); - rewriter.replaceOpWithNewOp<fir::CallOp>(dispatch, resTypes, nullptr, args, - dispatch.getProcedureAttrsAttr()); + rewriter.replaceOpWithNewOp<fir::CallOp>( + dispatch, resTypes, nullptr, args, dispatch.getArgAttrsAttr(), + dispatch.getResAttrsAttr(), dispatch.getProcedureAttrsAttr()); return mlir::success(); } diff --git flang/lib/Parser/openmp-parsers.cpp flang/lib/Parser/openmp-parsers.cpp index 43b4e9df97db..2b6c77c08cc5 100644 --- flang/lib/Parser/openmp-parsers.cpp +++ flang/lib/Parser/openmp-parsers.cpp @@ -68,6 +68,8 @@ void OmpDirectiveNameParser::initTokens(NameWithId *table) const { [](auto &a, auto &b) { return a.first.size() > b.first.size(); }); } +// --- Modifier helpers ----------------------------------------------- + template <typename Clause, typename Separator> struct ModifierList { constexpr ModifierList(Separator sep) : sep_(sep) {} constexpr ModifierList(const ModifierList &) = default; @@ -118,10 +120,8 @@ struct SpecificModifierParser { } }; -// OpenMP Clauses +// --- Iterator helpers ----------------------------------------------- -// [5.0] 2.1.6 iterator-specifier -> type-declaration-stmt = subscript-triple | -// identifier = subscript-triple // [5.0:47:17-18] In an iterator-specifier, if the iterator-type is not // specified then the type of that iterator is default integer. // [5.0:49:14] The iterator-type must be an integer type. 
@@ -153,8 +153,30 @@ static TypeDeclarationStmt makeIterSpecDecl(std::list<ObjectName> &&names) { makeEntityList(std::move(names))); } -TYPE_PARSER(sourced(construct<OmpDirectiveSpecification>( - OmpDirectiveNameParser{}, maybe(indirect(Parser<OmpClauseList>{}))))) +// --- Parsers for arguments ------------------------------------------ + +// At the moment these are only directive arguments. This is needed for +// parsing directive-specification. + +TYPE_PARSER( // + construct<OmpLocator>(Parser<OmpObject>{}) || + construct<OmpLocator>(Parser<FunctionReference>{})) + +TYPE_PARSER(sourced( // + construct<OmpArgument>(Parser<OmpMapperSpecifier>{}) || + construct<OmpArgument>(Parser<OmpReductionSpecifier>{}) || + construct<OmpArgument>(Parser<OmpLocator>{}))) + +TYPE_PARSER(construct<OmpLocatorList>(nonemptyList(Parser<OmpLocator>{}))) + +TYPE_PARSER( // + construct<OmpTypeSpecifier>(Parser<TypeSpec>{}) || + construct<OmpTypeSpecifier>(Parser<DeclarationTypeSpec>{})) + +TYPE_PARSER(construct<OmpReductionSpecifier>( // + Parser<OmpReductionIdentifier>{}, + ":"_tok >> nonemptyList(Parser<OmpTypeSpecifier>{}), + maybe(":"_tok >> Parser<OmpReductionCombiner>{}))) // --- Parsers for context traits ------------------------------------- @@ -213,15 +235,11 @@ static constexpr auto propertyListParser(PropParser... pp) { // the entire list in each of the alternative property parsers. Otherwise, // the name parser could stop after "foo" in "(foo, bar(1))", without // allowing the next parser to give the list a try. - auto listOf{[](auto parser) { // - return nonemptySeparated(parser, ","); - }}; - using P = OmpTraitProperty; return maybe("(" >> // construct<OmpTraitSelector::Properties>( maybe(Parser<OmpTraitScore>{} / ":"), - (attempt(listOf(sourced(construct<P>(pp))) / ")") || ...))); + (attempt(nonemptyList(sourced(construct<P>(pp))) / ")") || ...))); } // Parser for OmpTraitSelector @@ -309,7 +327,7 @@ TYPE_PARSER(sourced(construct<OmpTraitSetSelector>( // TYPE_PARSER(sourced(construct<OmpContextSelectorSpecification>( nonemptySeparated(Parser<OmpTraitSetSelector>{}, ",")))) -// Parser<OmpContextSelector> == Parser<traits::OmpContextSelectorSpecification> +// Note: OmpContextSelector is a type alias. 
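The new OmpReductionSpecifier parser above follows the `reduction-identifier : type-list [ : combiner ]` shape. A sketch of a source form it accepts (illustrative only; the INITIALIZER part is an ordinary clause that follows the specifier, not a piece of it):

    subroutine red_demo(x, n, total)
      integer :: n, i
      integer :: x(n), total
      !$omp declare reduction(addup : integer : omp_out = omp_out + omp_in) &
      !$omp&  initializer(omp_priv = 0)
      total = 0
      !$omp parallel do reduction(addup : total)
      do i = 1, n
        total = total + x(i)
      end do
    end subroutine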
// --- Parsers for clause modifiers ----------------------------------- @@ -543,7 +561,7 @@ TYPE_PARSER(construct<OmpDefaultClause::DataSharingAttribute>( TYPE_PARSER(construct<OmpDefaultClause>( construct<OmpDefaultClause>( Parser<OmpDefaultClause::DataSharingAttribute>{}) || - construct<OmpDefaultClause>(Parser<OmpDirectiveSpecification>{}))) + construct<OmpDefaultClause>(indirect(Parser<OmpDirectiveSpecification>{})))) // 2.5 PROC_BIND (MASTER | CLOSE | PRIMARY | SPREAD) TYPE_PARSER(construct<OmpProcBindClause>( @@ -713,11 +731,11 @@ TYPE_PARSER(construct<OmpMatchClause>( Parser<traits::OmpContextSelectorSpecification>{})) TYPE_PARSER(construct<OmpOtherwiseClause>( - maybe(sourced(Parser<OmpDirectiveSpecification>{})))) + maybe(indirect(sourced(Parser<OmpDirectiveSpecification>{}))))) TYPE_PARSER(construct<OmpWhenClause>( maybe(nonemptyList(Parser<OmpWhenClause::Modifier>{}) / ":"), - maybe(sourced(Parser<OmpDirectiveSpecification>{})))) + maybe(indirect(sourced(Parser<OmpDirectiveSpecification>{}))))) // OMP 5.2 12.6.1 grainsize([ prescriptiveness :] scalar-integer-expression) TYPE_PARSER(construct<OmpGrainsizeClause>( @@ -933,6 +951,13 @@ TYPE_PARSER(construct<OmpObjectList>(nonemptyList(Parser<OmpObject>{}))) TYPE_PARSER(sourced(construct<OmpErrorDirective>( verbatim("ERROR"_tok), Parser<OmpClauseList>{}))) +// --- Parsers for directives and constructs -------------------------- + +TYPE_PARSER(sourced(construct<OmpDirectiveSpecification>( // + OmpDirectiveNameParser{}, + maybe(parenthesized(nonemptyList(Parser<OmpArgument>{}))), + maybe(Parser<OmpClauseList>{})))) + TYPE_PARSER(sourced(construct<OmpNothingDirective>("NOTHING" >> ok))) TYPE_PARSER(sourced(construct<OpenMPUtilityConstruct>( @@ -1145,20 +1170,17 @@ TYPE_PARSER( TYPE_PARSER(sourced(construct<OpenMPDeclareTargetConstruct>( verbatim("DECLARE TARGET"_tok), Parser<OmpDeclareTargetSpecifier>{}))) -// declare-mapper-specifier -TYPE_PARSER(construct<OmpDeclareMapperSpecifier>( +// mapper-specifier +TYPE_PARSER(construct<OmpMapperSpecifier>( maybe(name / ":" / !":"_tok), typeSpec / "::", name)) // OpenMP 5.2: 5.8.8 Declare Mapper Construct -TYPE_PARSER(sourced(construct<OpenMPDeclareMapperConstruct>( - verbatim("DECLARE MAPPER"_tok), - "(" >> Parser<OmpDeclareMapperSpecifier>{} / ")", Parser<OmpClauseList>{}))) +TYPE_PARSER(sourced( + construct<OpenMPDeclareMapperConstruct>(verbatim("DECLARE MAPPER"_tok), + parenthesized(Parser<OmpMapperSpecifier>{}), Parser<OmpClauseList>{}))) TYPE_PARSER(construct<OmpReductionCombiner>(Parser<AssignmentStmt>{}) || - construct<OmpReductionCombiner>( - construct<OmpReductionCombiner::FunctionCombiner>( - construct<Call>(Parser<ProcedureDesignator>{}, - parenthesized(optionalList(actualArgSpec)))))) + construct<OmpReductionCombiner>(Parser<FunctionReference>{})) // 2.17.7 atomic -> ATOMIC [clause [,]] atomic-clause [[,] clause] | // ATOMIC [clause] @@ -1297,7 +1319,9 @@ TYPE_PARSER(startOmpLine >> construct<OpenMPDeclarativeConstruct>( Parser<OpenMPThreadprivate>{}) || construct<OpenMPDeclarativeConstruct>( - Parser<OpenMPUtilityConstruct>{})) / + Parser<OpenMPUtilityConstruct>{}) || + construct<OpenMPDeclarativeConstruct>( + Parser<OmpMetadirectiveDirective>{})) / endOmpLine)) // Block Construct diff --git flang/lib/Parser/parse-tree.cpp flang/lib/Parser/parse-tree.cpp index a414f226058e..251b6919cf52 100644 --- flang/lib/Parser/parse-tree.cpp +++ flang/lib/Parser/parse-tree.cpp @@ -281,6 +281,26 @@ OmpTaskDependenceType::Value OmpDependClause::TaskDep::GetTaskDepType() const { } } +std::string 
OmpTraitSelectorName::ToString() const { + return common::visit( // + common::visitors{ + [&](Value v) { // + return std::string(EnumToString(v)); + }, + [&](llvm::omp::Directive d) { + return llvm::omp::getOpenMPDirectiveName(d).str(); + }, + [&](const std::string &s) { // + return s; + }, + }, + u); +} + +std::string OmpTraitSetSelectorName::ToString() const { + return std::string(EnumToString(v)); +} + } // namespace Fortran::parser template <typename C> static llvm::omp::Clause getClauseIdForClass(C &&) { diff --git flang/lib/Parser/unparse.cpp flang/lib/Parser/unparse.cpp index 51df9d48d1d4..cd91fbe4ea5e 100644 --- flang/lib/Parser/unparse.cpp +++ flang/lib/Parser/unparse.cpp @@ -2071,12 +2071,33 @@ public: } // OpenMP Clauses & Directives + void Unparse(const OmpTypeNameList &x) { // + Walk(x.v, ","); + } + void Unparse(const OmpMapperSpecifier &x) { + Walk(std::get<std::optional<Name>>(x.t), ":"); + Walk(std::get<TypeSpec>(x.t)); + Put("::"); + Walk(std::get<Name>(x.t)); + } + void Unparse(const OmpReductionSpecifier &x) { + Walk(std::get<OmpReductionIdentifier>(x.t)); + Put(":"); + Walk(std::get<OmpTypeNameList>(x.t)); + Walk(":", std::get<std::optional<OmpReductionCombiner>>(x.t)); + } void Unparse(const llvm::omp::Directive &x) { Word(llvm::omp::getOpenMPDirectiveName(x).str()); } void Unparse(const OmpDirectiveSpecification &x) { + using ArgList = std::list<parser::OmpArgument>; Walk(std::get<llvm::omp::Directive>(x.t)); - Walk(std::get<std::optional<common::Indirection<OmpClauseList>>>(x.t)); + if (auto &args{std::get<std::optional<ArgList>>(x.t)}) { + Put("("); + Walk(*args); + Put(")"); + } + Walk(std::get<std::optional<OmpClauseList>>(x.t)); } void Unparse(const OmpTraitScore &x) { Word("SCORE("); @@ -2301,8 +2322,9 @@ public: } void Unparse(const OmpWhenClause &x) { using Modifier = OmpWhenClause::Modifier; + using Directive = common::Indirection<OmpDirectiveSpecification>; Walk(std::get<std::optional<std::list<Modifier>>>(x.t), ": "); - Walk(std::get<std::optional<OmpDirectiveSpecification>>(x.t)); + Walk(std::get<std::optional<Directive>>(x.t)); } #define GEN_FLANG_CLAUSE_UNPARSE #include "llvm/Frontend/OpenMP/OMP.inc" @@ -2664,18 +2686,6 @@ public: Walk(x.v); Put(")"); } - void Unparse(const OmpReductionCombiner::FunctionCombiner &x) { - const auto &pd = std::get<ProcedureDesignator>(x.v.t); - const auto &args = std::get<std::list<ActualArgSpec>>(x.v.t); - Walk(pd); - if (args.empty()) { - if (std::holds_alternative<ProcComponentRef>(pd.u)) { - Put("()"); - } - } else { - Walk("(", args, ", ", ")"); - } - } void Unparse(const OpenMPDeclareReductionConstruct &x) { BeginOpenMP(); Word("!$OMP DECLARE REDUCTION "); @@ -2691,7 +2701,7 @@ public: void Unparse(const OpenMPDeclareMapperConstruct &z) { BeginOpenMP(); Word("!$OMP DECLARE MAPPER ("); - const auto &spec{std::get<OmpDeclareMapperSpecifier>(z.t)}; + const auto &spec{std::get<OmpMapperSpecifier>(z.t)}; if (auto mapname{std::get<std::optional<Name>>(spec.t)}) { Walk(mapname); Put(":"); diff --git flang/lib/Semantics/assignment.cpp flang/lib/Semantics/assignment.cpp index 0b57197fb8db..2b562571a679 100644 --- flang/lib/Semantics/assignment.cpp +++ flang/lib/Semantics/assignment.cpp @@ -90,6 +90,17 @@ void AssignmentContext::Analyze(const parser::AssignmentStmt &stmt) { if (whereDepth_ > 0) { CheckShape(lhsLoc, &lhs); } + if (context_.foldingContext().languageFeatures().IsEnabled( + common::LanguageFeature::CUDA)) { + const auto &scope{context_.FindScope(lhsLoc)}; + const Scope &progUnit{GetProgramUnitContaining(scope)}; 
+ if (!IsCUDADeviceContext(&progUnit)) { + if (Fortran::evaluate::HasCUDADeviceAttrs(lhs) && + Fortran::evaluate::HasCUDAImplicitTransfer(rhs)) { + context_.Say(lhsLoc, "Unsupported CUDA data transfer"_err_en_US); + } + } + } } } diff --git flang/lib/Semantics/check-io.cpp flang/lib/Semantics/check-io.cpp index 3c99163c1f13..42c3b9e11efc 100644 --- flang/lib/Semantics/check-io.cpp +++ flang/lib/Semantics/check-io.cpp @@ -920,7 +920,7 @@ void IoChecker::CheckStringValue(IoSpecKind specKind, const std::string &value, {IoSpecKind::Decimal, {"COMMA", "POINT"}}, {IoSpecKind::Delim, {"APOSTROPHE", "NONE", "QUOTE"}}, {IoSpecKind::Encoding, {"DEFAULT", "UTF-8"}}, - {IoSpecKind::Form, {"FORMATTED", "UNFORMATTED"}}, + {IoSpecKind::Form, {"FORMATTED", "UNFORMATTED", "BINARY"}}, {IoSpecKind::Pad, {"NO", "YES"}}, {IoSpecKind::Position, {"APPEND", "ASIS", "REWIND"}}, {IoSpecKind::Round, diff --git flang/lib/Semantics/check-omp-structure.cpp flang/lib/Semantics/check-omp-structure.cpp index 035064ecf3a4..6306251ca15d 100644 --- flang/lib/Semantics/check-omp-structure.cpp +++ flang/lib/Semantics/check-omp-structure.cpp @@ -9,6 +9,8 @@ #include "check-omp-structure.h" #include "definable.h" #include "flang/Evaluate/check-expression.h" +#include "flang/Evaluate/expression.h" +#include "flang/Evaluate/type.h" #include "flang/Parser/parse-tree.h" #include "flang/Semantics/expression.h" #include "flang/Semantics/openmp-modifiers.h" @@ -1638,7 +1640,7 @@ void OmpStructureChecker::Enter(const parser::OpenMPDeclareMapperConstruct &x) { const auto &dir{std::get<parser::Verbatim>(x.t)}; PushContextAndClauseSets( dir.source, llvm::omp::Directive::OMPD_declare_mapper); - const auto &spec{std::get<parser::OmpDeclareMapperSpecifier>(x.t)}; + const auto &spec{std::get<parser::OmpMapperSpecifier>(x.t)}; const auto &type = std::get<parser::TypeSpec>(spec.t); if (!std::get_if<parser::DerivedTypeSpec>(&type.u)) { context_.Say(dir.source, "Type is not a derived type"_err_en_US); @@ -2985,7 +2987,6 @@ CHECK_SIMPLE_CLAUSE(Severity, OMPC_severity) CHECK_SIMPLE_CLAUSE(Message, OMPC_message) CHECK_SIMPLE_CLAUSE(Filter, OMPC_filter) CHECK_SIMPLE_CLAUSE(Otherwise, OMPC_otherwise) -CHECK_SIMPLE_CLAUSE(When, OMPC_when) CHECK_SIMPLE_CLAUSE(AdjustArgs, OMPC_adjust_args) CHECK_SIMPLE_CLAUSE(AppendArgs, OMPC_append_args) CHECK_SIMPLE_CLAUSE(MemoryOrder, OMPC_memory_order) @@ -4537,14 +4538,518 @@ void OmpStructureChecker::Enter(const parser::OmpClause::OmpxBare &x) { } } -void OmpStructureChecker::Enter(const parser::OmpContextSelector &ctxSel) { +void OmpStructureChecker::Enter(const parser::OmpClause::When &x) { + CheckAllowedClause(llvm::omp::Clause::OMPC_when); + OmpVerifyModifiers( + x.v, llvm::omp::OMPC_when, GetContext().clauseSource, context_); +} + +void OmpStructureChecker::Enter(const parser::OmpContextSelector &ctx) { EnterDirectiveNest(ContextSelectorNest); + + using SetName = parser::OmpTraitSetSelectorName; + std::map<SetName::Value, const SetName *> visited; + + for (const parser::OmpTraitSetSelector &traitSet : ctx.v) { + auto &name{std::get<SetName>(traitSet.t)}; + auto [prev, unique]{visited.insert(std::make_pair(name.v, &name))}; + if (!unique) { + std::string showName{parser::ToUpperCaseLetters(name.ToString())}; + parser::MessageFormattedText txt( + "Repeated trait set name %s in a context specifier"_err_en_US, + showName); + parser::Message message(name.source, txt); + message.Attach(prev->second->source, + "Previous trait set %s provided here"_en_US, showName); + context_.Say(std::move(message)); + } + 
CheckTraitSetSelector(traitSet); + } } void OmpStructureChecker::Leave(const parser::OmpContextSelector &) { ExitDirectiveNest(ContextSelectorNest); } +std::optional<evaluate::DynamicType> OmpStructureChecker::GetDynamicType( + const common::Indirection<parser::Expr> &parserExpr) { + // Indirection<parser::Expr> parserExpr + // `- parser::Expr ^.value() + const parser::TypedExpr &typedExpr{parserExpr.value().typedExpr}; + // ForwardOwningPointer typedExpr + // `- GenericExprWrapper ^.get() + // `- std::optional<Expr> ^->v + if (auto maybeExpr{typedExpr.get()->v}) { + return maybeExpr->GetType(); + } else { + return std::nullopt; + } +} + +const std::list<parser::OmpTraitProperty> & +OmpStructureChecker::GetTraitPropertyList( + const parser::OmpTraitSelector &trait) { + static const std::list<parser::OmpTraitProperty> empty{}; + auto &[_, maybeProps]{trait.t}; + if (maybeProps) { + using PropertyList = std::list<parser::OmpTraitProperty>; + return std::get<PropertyList>(maybeProps->t); + } else { + return empty; + } +} + +std::optional<llvm::omp::Clause> OmpStructureChecker::GetClauseFromProperty( + const parser::OmpTraitProperty &property) { + using MaybeClause = std::optional<llvm::omp::Clause>; + + // The parser for OmpClause will only succeed if the clause was + // given with all required arguments. + // If this is a string or complex extension with a clause name, + // treat it as a clause and let the trait checker deal with it. + + auto getClauseFromString{[&](const std::string &s) -> MaybeClause { + auto id{llvm::omp::getOpenMPClauseKind(parser::ToLowerCaseLetters(s))}; + if (id != llvm::omp::Clause::OMPC_unknown) { + return id; + } else { + return std::nullopt; + } + }}; + + return common::visit( // + common::visitors{ + [&](const parser::OmpTraitPropertyName &x) -> MaybeClause { + return getClauseFromString(x.v); + }, + [&](const common::Indirection<parser::OmpClause> &x) -> MaybeClause { + return x.value().Id(); + }, + [&](const parser::ScalarExpr &x) -> MaybeClause { + return std::nullopt; + }, + [&](const parser::OmpTraitPropertyExtension &x) -> MaybeClause { + using ExtProperty = parser::OmpTraitPropertyExtension; + if (auto *name{std::get_if<parser::OmpTraitPropertyName>(&x.u)}) { + return getClauseFromString(name->v); + } else if (auto *cpx{std::get_if<ExtProperty::Complex>(&x.u)}) { + return getClauseFromString( + std::get<parser::OmpTraitPropertyName>(cpx->t).v); + } + return std::nullopt; + }, + }, + property.u); +} + +void OmpStructureChecker::CheckTraitSelectorList( + const std::list<parser::OmpTraitSelector> &traits) { + // [6.0:322:20] + // Each trait-selector-name may only be specified once in a trait selector + // set. + + // Cannot store OmpTraitSelectorName directly, because it's not copyable. 
+ using TraitName = parser::OmpTraitSelectorName; + using BareName = decltype(TraitName::u); + std::map<BareName, const TraitName *> visited; + + for (const parser::OmpTraitSelector &trait : traits) { + auto &name{std::get<TraitName>(trait.t)}; + + auto [prev, unique]{visited.insert(std::make_pair(name.u, &name))}; + if (!unique) { + std::string showName{parser::ToUpperCaseLetters(name.ToString())}; + parser::MessageFormattedText txt( + "Repeated trait name %s in a trait set"_err_en_US, showName); + parser::Message message(name.source, txt); + message.Attach(prev->second->source, + "Previous trait %s provided here"_en_US, showName); + context_.Say(std::move(message)); + } + } +} + +void OmpStructureChecker::CheckTraitSetSelector( + const parser::OmpTraitSetSelector &traitSet) { + + // Trait Set | Allowed traits | D-traits | X-traits | Score | + // + // Construct | Simd, directive-name | Yes | No | No | + // Device | Arch, Isa, Kind | No | Yes | No | + // Implementation | Atomic_Default_Mem_Order | No | Yes | Yes | + // | Extension, Requires | | | | + // | Vendor | | | | + // Target_Device | Arch, Device_Num, Isa | No | Yes | No | + // | Kind, Uid | | | | + // User | Condition | No | No | Yes | + + struct TraitSetConfig { + std::set<parser::OmpTraitSelectorName::Value> allowed; + bool allowsDirectiveTraits; + bool allowsExtensionTraits; + bool allowsScore; + }; + + using SName = parser::OmpTraitSetSelectorName::Value; + using TName = parser::OmpTraitSelectorName::Value; + + static const std::map<SName, TraitSetConfig> configs{ + {SName::Construct, // + {{TName::Simd}, true, false, false}}, + {SName::Device, // + {{TName::Arch, TName::Isa, TName::Kind}, false, true, false}}, + {SName::Implementation, // + {{TName::Atomic_Default_Mem_Order, TName::Extension, TName::Requires, + TName::Vendor}, + false, true, true}}, + {SName::Target_Device, // + {{TName::Arch, TName::Device_Num, TName::Isa, TName::Kind, + TName::Uid}, + false, true, false}}, + {SName::User, // + {{TName::Condition}, false, false, true}}, + }; + + auto checkTraitSet{[&](const TraitSetConfig &config) { + auto &[setName, traits]{traitSet.t}; + auto usn{parser::ToUpperCaseLetters(setName.ToString())}; + + // Check if there are any duplicate traits. + CheckTraitSelectorList(traits); + + for (const parser::OmpTraitSelector &trait : traits) { + // Don't use structured bindings here, because they cannot be captured + // before C++20. 
+      auto &traitName = std::get<parser::OmpTraitSelectorName>(trait.t);
+      auto &maybeProps =
+          std::get<std::optional<parser::OmpTraitSelector::Properties>>(
+              trait.t);
+
+      // Check allowed traits
+      common::visit( //
+          common::visitors{
+              [&](parser::OmpTraitSelectorName::Value v) {
+                if (!config.allowed.count(v)) {
+                  context_.Say(traitName.source,
+                      "%s is not a valid trait for %s trait set"_err_en_US,
+                      parser::ToUpperCaseLetters(traitName.ToString()), usn);
+                }
+              },
+              [&](llvm::omp::Directive) {
+                if (!config.allowsDirectiveTraits) {
+                  context_.Say(traitName.source,
+                      "Directive name is not a valid trait for %s trait set"_err_en_US,
+                      usn);
+                }
+              },
+              [&](const std::string &) {
+                if (!config.allowsExtensionTraits) {
+                  context_.Say(traitName.source,
+                      "Extension traits are not valid for %s trait set"_err_en_US,
+                      usn);
+                }
+              },
+          },
+          traitName.u);
+
+      // Check score
+      if (maybeProps) {
+        auto &[maybeScore, _]{maybeProps->t};
+        if (maybeScore) {
+          CheckTraitScore(*maybeScore);
+        }
+      }
+
+      // Check the properties of the individual traits
+      CheckTraitSelector(traitSet, trait);
+    }
+  }};
+
+  checkTraitSet(
+      configs.at(std::get<parser::OmpTraitSetSelectorName>(traitSet.t).v));
+}
+
+void OmpStructureChecker::CheckTraitScore(const parser::OmpTraitScore &score) {
+  // [6.0:322:23]
+  // A score-expression must be a non-negative constant integer expression.
+  if (auto value{GetIntValue(score)}; !value || value < 0) {
+    context_.Say(score.source,
+        "SCORE expression must be a non-negative constant integer expression"_err_en_US);
+  }
+}
+
+bool OmpStructureChecker::VerifyTraitPropertyLists(
+    const parser::OmpTraitSetSelector &traitSet,
+    const parser::OmpTraitSelector &trait) {
+  using TraitName = parser::OmpTraitSelectorName;
+  using PropertyList = std::list<parser::OmpTraitProperty>;
+  auto &[traitName, maybeProps]{trait.t};
+
+  auto checkPropertyList{[&](const PropertyList &properties, auto isValid,
+                             const std::string &message) {
+    bool foundInvalid{false};
+    for (const parser::OmpTraitProperty &prop : properties) {
+      if (!isValid(prop)) {
+        if (foundInvalid) {
+          context_.Say(
+              prop.source, "More invalid properties are present"_err_en_US);
+          break;
+        }
+        context_.Say(prop.source, "%s"_err_en_US, message);
+        foundInvalid = true;
+      }
+    }
+    return !foundInvalid;
+  }};
+
+  bool invalid{false};
+
+  if (std::holds_alternative<llvm::omp::Directive>(traitName.u)) {
+    // Directive-name traits don't have properties.
+    if (maybeProps) {
+      context_.Say(trait.source,
+          "Directive-name traits cannot have properties"_err_en_US);
+      invalid = true;
+    }
+  }
+  // Ignore properties on extension traits.
+
+  // See `TraitSelectorParser` in openmp-parsers.cpp
+  if (auto *v{std::get_if<TraitName::Value>(&traitName.u)}) {
+    switch (*v) {
+    // name-list properties
+    case parser::OmpTraitSelectorName::Value::Arch:
+    case parser::OmpTraitSelectorName::Value::Extension:
+    case parser::OmpTraitSelectorName::Value::Isa:
+    case parser::OmpTraitSelectorName::Value::Kind:
+    case parser::OmpTraitSelectorName::Value::Uid:
+    case parser::OmpTraitSelectorName::Value::Vendor:
+      if (maybeProps) {
+        auto isName{[](const parser::OmpTraitProperty &prop) {
+          return std::holds_alternative<parser::OmpTraitPropertyName>(prop.u);
+        }};
+        invalid = !checkPropertyList(std::get<PropertyList>(maybeProps->t),
+            isName, "Trait property should be a name");
+      }
+      break;
+    // clause-list
+    case parser::OmpTraitSelectorName::Value::Atomic_Default_Mem_Order:
+    case parser::OmpTraitSelectorName::Value::Requires:
+    case parser::OmpTraitSelectorName::Value::Simd:
+      if (maybeProps) {
+        auto isClause{[&](const parser::OmpTraitProperty &prop) {
+          return GetClauseFromProperty(prop).has_value();
+        }};
+        invalid = !checkPropertyList(std::get<PropertyList>(maybeProps->t),
+            isClause, "Trait property should be a clause");
+      }
+      break;
+    // expr-list
+    case parser::OmpTraitSelectorName::Value::Condition:
+    case parser::OmpTraitSelectorName::Value::Device_Num:
+      if (maybeProps) {
+        auto isExpr{[](const parser::OmpTraitProperty &prop) {
+          return std::holds_alternative<parser::ScalarExpr>(prop.u);
+        }};
+        invalid = !checkPropertyList(std::get<PropertyList>(maybeProps->t),
+            isExpr, "Trait property should be a scalar expression");
+      }
+      break;
+    } // switch
+  }
+
+  return !invalid;
+}
+
+void OmpStructureChecker::CheckTraitSelector(
+    const parser::OmpTraitSetSelector &traitSet,
+    const parser::OmpTraitSelector &trait) {
+  using TraitName = parser::OmpTraitSelectorName;
+  auto &[traitName, maybeProps]{trait.t};
+
+  // Only do the detailed checks if the property lists are valid.
+  if (VerifyTraitPropertyLists(traitSet, trait)) {
+    if (std::holds_alternative<llvm::omp::Directive>(traitName.u) ||
+        std::holds_alternative<std::string>(traitName.u)) {
+      // No properties here: directives don't have properties, and
+      // we don't implement any extension traits now.
+      return;
+    }
+
+    // Specific traits we want to check.
+    // Limitations:
+    // (1) The properties for these traits are defined in "Additional
+    // Definitions for the OpenMP API Specification". It's not clear how
+    // to define them in a portable way, and how to verify their validity,
+    // especially if they get replaced by their integer values (in case
+    // they are defined as enums).
+    // (2) These are entirely implementation-defined, and at the moment
+    // there is no known schema to validate these values.
+ auto v{std::get<TraitName::Value>(traitName.u)}; + switch (v) { + case TraitName::Value::Arch: + // Unchecked, TBD(1) + break; + case TraitName::Value::Atomic_Default_Mem_Order: + CheckTraitADMO(traitSet, trait); + break; + case TraitName::Value::Condition: + CheckTraitCondition(traitSet, trait); + break; + case TraitName::Value::Device_Num: + CheckTraitDeviceNum(traitSet, trait); + break; + case TraitName::Value::Extension: + // Ignore + break; + case TraitName::Value::Isa: + // Unchecked, TBD(1) + break; + case TraitName::Value::Kind: + // Unchecked, TBD(1) + break; + case TraitName::Value::Requires: + CheckTraitRequires(traitSet, trait); + break; + case TraitName::Value::Simd: + CheckTraitSimd(traitSet, trait); + break; + case TraitName::Value::Uid: + // Unchecked, TBD(2) + break; + case TraitName::Value::Vendor: + // Unchecked, TBD(1) + break; + } + } +} + +void OmpStructureChecker::CheckTraitADMO( + const parser::OmpTraitSetSelector &traitSet, + const parser::OmpTraitSelector &trait) { + auto &traitName{std::get<parser::OmpTraitSelectorName>(trait.t)}; + auto &properties{GetTraitPropertyList(trait)}; + + if (properties.size() != 1) { + context_.Say(trait.source, + "%s trait requires a single clause property"_err_en_US, + parser::ToUpperCaseLetters(traitName.ToString())); + } else { + const parser::OmpTraitProperty &property{properties.front()}; + auto clauseId{*GetClauseFromProperty(property)}; + // Check that the clause belongs to the memory-order clause-set. + // Clause sets will hopefully be autogenerated at some point. + switch (clauseId) { + case llvm::omp::Clause::OMPC_acq_rel: + case llvm::omp::Clause::OMPC_acquire: + case llvm::omp::Clause::OMPC_relaxed: + case llvm::omp::Clause::OMPC_release: + case llvm::omp::Clause::OMPC_seq_cst: + break; + default: + context_.Say(property.source, + "%s trait requires a clause from the memory-order clause set"_err_en_US, + parser::ToUpperCaseLetters(traitName.ToString())); + } + + using ClauseProperty = common::Indirection<parser::OmpClause>; + if (!std::holds_alternative<ClauseProperty>(property.u)) { + context_.Say(property.source, + "Invalid clause specification for %s"_err_en_US, + parser::ToUpperCaseLetters(getClauseName(clauseId))); + } + } +} + +void OmpStructureChecker::CheckTraitCondition( + const parser::OmpTraitSetSelector &traitSet, + const parser::OmpTraitSelector &trait) { + auto &traitName{std::get<parser::OmpTraitSelectorName>(trait.t)}; + auto &properties{GetTraitPropertyList(trait)}; + + if (properties.size() != 1) { + context_.Say(trait.source, + "%s trait requires a single expression property"_err_en_US, + parser::ToUpperCaseLetters(traitName.ToString())); + } else { + const parser::OmpTraitProperty &property{properties.front()}; + auto &scalarExpr{std::get<parser::ScalarExpr>(property.u)}; + + auto maybeType{GetDynamicType(scalarExpr.thing)}; + if (!maybeType || maybeType->category() != TypeCategory::Logical) { + context_.Say(property.source, + "%s trait requires a single LOGICAL expression"_err_en_US, + parser::ToUpperCaseLetters(traitName.ToString())); + } + } +} + +void OmpStructureChecker::CheckTraitDeviceNum( + const parser::OmpTraitSetSelector &traitSet, + const parser::OmpTraitSelector &trait) { + auto &traitName{std::get<parser::OmpTraitSelectorName>(trait.t)}; + auto &properties{GetTraitPropertyList(trait)}; + + if (properties.size() != 1) { + context_.Say(trait.source, + "%s trait requires a single expression property"_err_en_US, + parser::ToUpperCaseLetters(traitName.ToString())); + } + // No other checks 
at the moment. +} + +void OmpStructureChecker::CheckTraitRequires( + const parser::OmpTraitSetSelector &traitSet, + const parser::OmpTraitSelector &trait) { + unsigned version{context_.langOptions().OpenMPVersion}; + auto &traitName{std::get<parser::OmpTraitSelectorName>(trait.t)}; + auto &properties{GetTraitPropertyList(trait)}; + + for (const parser::OmpTraitProperty &property : properties) { + auto clauseId{*GetClauseFromProperty(property)}; + if (!llvm::omp::isAllowedClauseForDirective( + llvm::omp::OMPD_requires, clauseId, version)) { + context_.Say(property.source, + "%s trait requires a clause from the requirement clause set"_err_en_US, + parser::ToUpperCaseLetters(traitName.ToString())); + } + + using ClauseProperty = common::Indirection<parser::OmpClause>; + if (!std::holds_alternative<ClauseProperty>(property.u)) { + context_.Say(property.source, + "Invalid clause specification for %s"_err_en_US, + parser::ToUpperCaseLetters(getClauseName(clauseId))); + } + } +} + +void OmpStructureChecker::CheckTraitSimd( + const parser::OmpTraitSetSelector &traitSet, + const parser::OmpTraitSelector &trait) { + unsigned version{context_.langOptions().OpenMPVersion}; + auto &traitName{std::get<parser::OmpTraitSelectorName>(trait.t)}; + auto &properties{GetTraitPropertyList(trait)}; + + for (const parser::OmpTraitProperty &property : properties) { + auto clauseId{*GetClauseFromProperty(property)}; + if (!llvm::omp::isAllowedClauseForDirective( + llvm::omp::OMPD_declare_simd, clauseId, version)) { + context_.Say(property.source, + "%s trait requires a clause that is allowed on the %s directive"_err_en_US, + parser::ToUpperCaseLetters(traitName.ToString()), + parser::ToUpperCaseLetters( + getDirectiveName(llvm::omp::OMPD_declare_simd))); + } + + using ClauseProperty = common::Indirection<parser::OmpClause>; + if (!std::holds_alternative<ClauseProperty>(property.u)) { + context_.Say(property.source, + "Invalid clause specification for %s"_err_en_US, + parser::ToUpperCaseLetters(getClauseName(clauseId))); + } + } +} + llvm::StringRef OmpStructureChecker::getClauseName(llvm::omp::Clause clause) { return llvm::omp::getOpenMPClauseName(clause); } diff --git flang/lib/Semantics/check-omp-structure.h flang/lib/Semantics/check-omp-structure.h index 7412a2071d49..a9ac93a9149d 100644 --- flang/lib/Semantics/check-omp-structure.h +++ flang/lib/Semantics/check-omp-structure.h @@ -184,6 +184,32 @@ private: // specific clause related void CheckAllowedMapTypes(const parser::OmpMapType::Value &, const std::list<parser::OmpMapType::Value> &); + + std::optional<evaluate::DynamicType> GetDynamicType( + const common::Indirection<parser::Expr> &); + const std::list<parser::OmpTraitProperty> &GetTraitPropertyList( + const parser::OmpTraitSelector &); + std::optional<llvm::omp::Clause> GetClauseFromProperty( + const parser::OmpTraitProperty &); + + void CheckTraitSelectorList(const std::list<parser::OmpTraitSelector> &); + void CheckTraitSetSelector(const parser::OmpTraitSetSelector &); + void CheckTraitScore(const parser::OmpTraitScore &); + bool VerifyTraitPropertyLists( + const parser::OmpTraitSetSelector &, const parser::OmpTraitSelector &); + void CheckTraitSelector( + const parser::OmpTraitSetSelector &, const parser::OmpTraitSelector &); + void CheckTraitADMO( + const parser::OmpTraitSetSelector &, const parser::OmpTraitSelector &); + void CheckTraitCondition( + const parser::OmpTraitSetSelector &, const parser::OmpTraitSelector &); + void CheckTraitDeviceNum( + const parser::OmpTraitSetSelector &, const 
parser::OmpTraitSelector &); + void CheckTraitRequires( + const parser::OmpTraitSetSelector &, const parser::OmpTraitSelector &); + void CheckTraitSimd( + const parser::OmpTraitSetSelector &, const parser::OmpTraitSelector &); + llvm::StringRef getClauseName(llvm::omp::Clause clause) override; llvm::StringRef getDirectiveName(llvm::omp::Directive directive) override; diff --git flang/lib/Semantics/expression.cpp flang/lib/Semantics/expression.cpp index 1dd603412f42..6949e5693d08 100644 --- flang/lib/Semantics/expression.cpp +++ flang/lib/Semantics/expression.cpp @@ -2834,13 +2834,12 @@ std::pair<const Symbol *, bool> ExpressionAnalyzer::ResolveGeneric( // Check for generic or explicit INTRINSIC of the same name in outer scopes. // See 15.5.5.2 for details. if (!symbol.owner().IsGlobal() && !symbol.owner().IsDerivedType()) { - for (const std::string &n : GetAllNames(context_, symbol.name())) { - if (const Symbol *outer{symbol.owner().parent().FindSymbol(n)}) { - auto pair{ResolveGeneric(*outer, actuals, adjustActuals, isSubroutine, - mightBeStructureConstructor)}; - if (pair.first) { - return pair; - } + if (const Symbol * + outer{symbol.owner().parent().FindSymbol(symbol.name())}) { + auto pair{ResolveGeneric(*outer, actuals, adjustActuals, isSubroutine, + mightBeStructureConstructor)}; + if (pair.first) { + return pair; } } } @@ -3635,13 +3634,13 @@ MaybeExpr ExpressionAnalyzer::Analyze(const parser::Expr::Concat &x) { // The Name represents a user-defined intrinsic operator. // If the actuals match one of the specific procedures, return a function ref. // Otherwise report the error in messages. -MaybeExpr ExpressionAnalyzer::AnalyzeDefinedOp( - const parser::Name &name, ActualArguments &&actuals) { +MaybeExpr ExpressionAnalyzer::AnalyzeDefinedOp(const parser::Name &name, + ActualArguments &&actuals, const Symbol *&symbol) { if (auto callee{GetCalleeAndArguments(name, std::move(actuals))}) { - CHECK(std::holds_alternative<ProcedureDesignator>(callee->u)); - return MakeFunctionRef(name.source, - std::move(std::get<ProcedureDesignator>(callee->u)), - std::move(callee->arguments)); + auto &proc{std::get<evaluate::ProcedureDesignator>(callee->u)}; + symbol = proc.GetSymbol(); + return MakeFunctionRef( + name.source, std::move(proc), std::move(callee->arguments)); } else { return std::nullopt; } @@ -4079,8 +4078,7 @@ bool ExpressionAnalyzer::CheckIntrinsicKind( return true; } else if (foldingContext_.targetCharacteristics().CanSupportType( category, kind)) { - Warn(common::UsageWarning::BadTypeForTarget, - "%s(KIND=%jd) is not an enabled type for this target"_warn_en_US, + Say("%s(KIND=%jd) is not an enabled type for this target"_err_en_US, ToUpperCase(EnumToString(category)), kind); return true; } else { @@ -4102,20 +4100,7 @@ bool ExpressionAnalyzer::CheckIntrinsicSize( return false; } } - if (foldingContext_.targetCharacteristics().IsTypeEnabled( - category, kind)) { // C712, C714, C715, C727 - return true; - } else if (foldingContext_.targetCharacteristics().CanSupportType( - category, kind)) { - Warn(common::UsageWarning::BadTypeForTarget, - "%s*%jd is not an enabled type for this target"_warn_en_US, - ToUpperCase(EnumToString(category)), size); - return true; - } else { - Say("%s*%jd is not a supported type"_err_en_US, - ToUpperCase(EnumToString(category)), size); - return false; - } + return CheckIntrinsicKind(category, kind); } bool ExpressionAnalyzer::AddImpliedDo(parser::CharBlock name, int kind) { @@ -4453,38 +4438,45 @@ MaybeExpr ArgumentAnalyzer::TryDefinedOp( parser::Messages 
buffer; auto restorer{context_.GetContextualMessages().SetMessages(buffer)}; const auto &scope{context_.context().FindScope(source_)}; - if (Symbol *symbol{scope.FindSymbol(oprName)}) { + + auto FoundOne{[&](MaybeExpr &&thisResult, const Symbol &generic, + const Symbol *resolution) { anyPossibilities = true; - parser::Name name{symbol->name(), symbol}; - if (!fatalErrors_) { - result = context_.AnalyzeDefinedOp(name, GetActuals()); - } - if (result) { - inaccessible = CheckAccessibleSymbol(scope, *symbol); - if (inaccessible) { - result.reset(); + if (thisResult) { + if (auto thisInaccessible{CheckAccessibleSymbol(scope, generic)}) { + inaccessible = thisInaccessible; } else { - hit.push_back(symbol); - hitBuffer = std::move(buffer); + bool isElemental{IsElementalProcedure(DEREF(resolution))}; + bool hitsAreNonElemental{ + !hit.empty() && !IsElementalProcedure(DEREF(hit[0]))}; + if (isElemental && hitsAreNonElemental) { + // ignore elemental resolutions in favor of a non-elemental one + } else { + if (!isElemental && !hitsAreNonElemental) { + hit.clear(); + } + result = std::move(thisResult); + hit.push_back(resolution); + hitBuffer = std::move(buffer); + } } } + }}; + + if (Symbol * generic{scope.FindSymbol(oprName)}; generic && !fatalErrors_) { + parser::Name name{generic->name(), generic}; + const Symbol *resultSymbol{nullptr}; + MaybeExpr possibleResult{context_.AnalyzeDefinedOp( + name, ActualArguments{actuals_}, resultSymbol)}; + FoundOne(std::move(possibleResult), *generic, resultSymbol); } for (std::size_t passIndex{0}; passIndex < actuals_.size(); ++passIndex) { buffer.clear(); const Symbol *generic{nullptr}; - if (const Symbol *binding{ - FindBoundOp(oprName, passIndex, generic, false)}) { - anyPossibilities = true; - if (MaybeExpr thisResult{TryBoundOp(*binding, passIndex)}) { - if (auto thisInaccessible{ - CheckAccessibleSymbol(scope, DEREF(generic))}) { - inaccessible = thisInaccessible; - } else { - result = std::move(thisResult); - hit.push_back(binding); - hitBuffer = std::move(buffer); - } - } + if (const Symbol * + binding{FindBoundOp( + oprName, passIndex, generic, /*isSubroutine=*/false)}) { + FoundOne(TryBoundOp(*binding, passIndex), DEREF(generic), binding); } } } @@ -4655,7 +4647,8 @@ std::optional<ProcedureRef> ArgumentAnalyzer::GetDefinedAssignmentProc() { } for (std::size_t i{0}; !proc && i < actuals_.size(); ++i) { const Symbol *generic{nullptr}; - if (const Symbol *binding{FindBoundOp(oprName, i, generic, true)}) { + if (const Symbol * + binding{FindBoundOp(oprName, i, generic, /*isSubroutine=*/true)}) { if (CheckAccessibleSymbol(scope, DEREF(generic))) { // ignore inaccessible type-bound ASSIGNMENT(=) generic } else if (const Symbol * diff --git flang/lib/Semantics/mod-file.cpp flang/lib/Semantics/mod-file.cpp index b45f1c060da2..bef934beaacf 100644 --- flang/lib/Semantics/mod-file.cpp +++ flang/lib/Semantics/mod-file.cpp @@ -306,8 +306,10 @@ void ModFileWriter::PrepareRenamings(const Scope &scope) { // to their names in this scope, creating those new names when needed. auto &renamings{context_.moduleFileOutputRenamings()}; for (SymbolRef s : symbolsNeeded) { - if (s->owner().kind() == Scope::Kind::DerivedType) { - continue; // component or binding: ok + if (s->owner().kind() != Scope::Kind::Module) { + // Not a USE'able name from a module's top scope; + // component, binding, dummy argument, &c. 
+ continue; } const Scope *sMod{FindModuleContaining(s->owner())}; if (!sMod || sMod == &scope) { diff --git flang/lib/Semantics/openmp-modifiers.cpp flang/lib/Semantics/openmp-modifiers.cpp index 9f2896229bb7..73ad7751ee51 100644 --- flang/lib/Semantics/openmp-modifiers.cpp +++ flang/lib/Semantics/openmp-modifiers.cpp @@ -156,6 +156,23 @@ const OmpModifierDescriptor &OmpGetDescriptor<parser::OmpChunkModifier>() { return desc; } +template <> +const OmpModifierDescriptor &OmpGetDescriptor<parser::OmpContextSelector>() { + static const OmpModifierDescriptor desc{ + /*name=*/"context-selector", + /*props=*/ + { + {50, {OmpProperty::Required, OmpProperty::Unique}}, + }, + /*clauses=*/ + { + // The MATCH clause takes a selector as an argument, not modifier. + {50, {Clause::OMPC_when}}, + }, + }; + return desc; +} + template <> const OmpModifierDescriptor &OmpGetDescriptor<parser::OmpDependenceType>() { static const OmpModifierDescriptor desc{ diff --git flang/lib/Semantics/resolve-names-utils.cpp flang/lib/Semantics/resolve-names-utils.cpp index c91c522b7c73..742bb748b7ff 100644 --- flang/lib/Semantics/resolve-names-utils.cpp +++ flang/lib/Semantics/resolve-names-utils.cpp @@ -762,7 +762,11 @@ void SymbolMapper::MapSymbolExprs(Symbol &symbol) { proc.set_procInterfaces( *mappedSymbol, BypassGeneric(mappedSymbol->GetUltimate())); } else if (const DeclTypeSpec * mappedType{MapType(proc.type())}) { - proc.set_type(*mappedType); + if (proc.type()) { + CHECK(*proc.type() == *mappedType); + } else { + proc.set_type(*mappedType); + } } if (proc.init()) { if (const Symbol * mapped{MapSymbol(*proc.init())}) { diff --git flang/lib/Semantics/resolve-names.cpp flang/lib/Semantics/resolve-names.cpp index b971c9434468..66f59726d935 100644 --- flang/lib/Semantics/resolve-names.cpp +++ flang/lib/Semantics/resolve-names.cpp @@ -1474,7 +1474,12 @@ public: return true; } - bool Pre(const parser::OpenMPDeclareMapperConstruct &); + bool Pre(const parser::OpenMPDeclareMapperConstruct &x) { + AddOmpSourceRange(x.source); + ProcessMapperSpecifier(std::get<parser::OmpMapperSpecifier>(x.t), + std::get<parser::OmpClauseList>(x.t)); + return false; + } bool Pre(const parser::OmpMapClause &); @@ -1616,6 +1621,21 @@ public: PopScope(); } } + bool Pre(const parser::OmpDirectiveSpecification &x); + + bool Pre(const parser::OmpTypeSpecifier &x) { + BeginDeclTypeSpec(); + return true; + } + void Post(const parser::OmpTypeSpecifier &x) { // + EndDeclTypeSpec(); + } + +private: + void ProcessMapperSpecifier(const parser::OmpMapperSpecifier &spec, + const parser::OmpClauseList &clauses); + void ProcessReductionSpecifier(const parser::OmpReductionSpecifier &spec, + const parser::OmpClauseList &clauses); }; bool OmpVisitor::NeedsScope(const parser::OpenMPBlockConstruct &x) { @@ -1655,37 +1675,6 @@ void OmpVisitor::Post(const parser::OpenMPBlockConstruct &x) { } } -// This "manually" walks the tree of the construct, because we need -// to resolve the type before the map clauses are processed - when -// just following the natural flow, the map clauses gets processed before -// the type has been fully processed. 
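The hand-rolled visitor removed below is replaced by the ProcessMapperSpecifier helper introduced further down. For orientation, a sketch of the declare-mapper source form being resolved, matching the `[mapper-name :] type-spec :: var` specifier parsed earlier (an example only, not taken from this patch):

    module mapper_demo
      type :: t
        integer, allocatable :: buf(:)
      end type
      !$omp declare mapper(mymap : type(t) :: v) map(v, v%buf)
    end module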
-bool OmpVisitor::Pre(const parser::OpenMPDeclareMapperConstruct &x) {
-  AddOmpSourceRange(x.source);
-  BeginDeclTypeSpec();
-  const auto &spec{std::get<parser::OmpDeclareMapperSpecifier>(x.t)};
-  Symbol *mapperSym{nullptr};
-  if (const auto &mapperName{std::get<std::optional<parser::Name>>(spec.t)}) {
-    mapperSym =
-        &MakeSymbol(*mapperName, MiscDetails{MiscDetails::Kind::ConstructName});
-    mapperName->symbol = mapperSym;
-  } else {
-    const parser::CharBlock defaultName{"default", 7};
-    mapperSym = &MakeSymbol(
-        defaultName, Attrs{}, MiscDetails{MiscDetails::Kind::ConstructName});
-  }
-
-  PushScope(Scope::Kind::OtherConstruct, nullptr);
-  Walk(std::get<parser::TypeSpec>(spec.t));
-  const auto &varName{std::get<parser::ObjectName>(spec.t)};
-  DeclareObjectEntity(varName);
-
-  Walk(std::get<parser::OmpClauseList>(x.t));
-
-  EndDeclTypeSpec();
-  PopScope();
-  return false;
-}
-
 bool OmpVisitor::Pre(const parser::OmpMapClause &x) {
   auto &mods{OmpGetModifiers(x)};
   if (auto *mapper{OmpGetUniqueModifier<parser::OmpMapper>(mods)}) {
@@ -1713,6 +1702,83 @@ bool OmpVisitor::Pre(const parser::OmpMapClause &x) {
   return true;
 }
 
+void OmpVisitor::ProcessMapperSpecifier(const parser::OmpMapperSpecifier &spec,
+    const parser::OmpClauseList &clauses) {
+  // This "manually" walks the tree of the construct, because we need
+  // to resolve the type before the map clauses are processed - when
+  // just following the natural flow, the map clauses get processed before
+  // the type has been fully processed.
+  BeginDeclTypeSpec();
+  if (auto &mapperName{std::get<std::optional<parser::Name>>(spec.t)}) {
+    mapperName->symbol =
+        &MakeSymbol(*mapperName, MiscDetails{MiscDetails::Kind::ConstructName});
+  } else {
+    const parser::CharBlock defaultName{"default", 7};
+    MakeSymbol(
+        defaultName, Attrs{}, MiscDetails{MiscDetails::Kind::ConstructName});
+  }
+
+  PushScope(Scope::Kind::OtherConstruct, nullptr);
+  Walk(std::get<parser::TypeSpec>(spec.t));
+  auto &varName{std::get<parser::Name>(spec.t)};
+  DeclareObjectEntity(varName);
+
+  Walk(clauses);
+  EndDeclTypeSpec();
+  PopScope();
+}
+
+void OmpVisitor::ProcessReductionSpecifier(
+    const parser::OmpReductionSpecifier &spec,
+    const parser::OmpClauseList &clauses) {
+  // Creating a new scope in case the combiner expression (or clauses) uses
+  // reserved identifiers, like "omp_in". This is a temporary solution until
+  // we deal with these in a more thorough way.
+  PushScope(Scope::Kind::OtherConstruct, nullptr);
+  Walk(std::get<parser::OmpReductionIdentifier>(spec.t));
+  Walk(std::get<parser::OmpTypeNameList>(spec.t));
+  Walk(std::get<std::optional<parser::OmpReductionCombiner>>(spec.t));
+  Walk(clauses);
+  PopScope();
+}
+
+bool OmpVisitor::Pre(const parser::OmpDirectiveSpecification &x) {
+  // OmpDirectiveSpecification is only used in METADIRECTIVE at the moment.
+  // Since it contains directives and clauses, some semantic checks may
+  // not be applicable.
+  // For now, disable semantic analysis for it, to allow the compiler to
+  // parse METADIRECTIVE without flagging errors.
+ AddOmpSourceRange(x.source); + auto dirId{std::get<llvm::omp::Directive>(x.t)}; + auto &maybeArgs{std::get<std::optional<std::list<parser::OmpArgument>>>(x.t)}; + auto &maybeClauses{std::get<std::optional<parser::OmpClauseList>>(x.t)}; + + switch (dirId) { + case llvm::omp::Directive::OMPD_declare_mapper: + if (maybeArgs && maybeClauses) { + const parser::OmpArgument &first{maybeArgs->front()}; + if (auto *spec{std::get_if<parser::OmpMapperSpecifier>(&first.u)}) { + ProcessMapperSpecifier(*spec, *maybeClauses); + } + } + break; + case llvm::omp::Directive::OMPD_declare_reduction: + if (maybeArgs && maybeClauses) { + const parser::OmpArgument &first{maybeArgs->front()}; + if (auto *spec{std::get_if<parser::OmpReductionSpecifier>(&first.u)}) { + ProcessReductionSpecifier(*spec, *maybeClauses); + } + } + break; + default: + // Default processing. + Walk(maybeArgs); + Walk(maybeClauses); + break; + } + return false; +} + // Walk the parse tree and resolve names to symbols. class ResolveNamesVisitor : public virtual ScopeHandler, public ModuleVisitor, @@ -3354,6 +3420,15 @@ void ModuleVisitor::DoAddUse(SourceName location, SourceName localName, // use-associating the same symbol again -- ok return; } + if (useUltimate.owner().IsModule() && localUltimate.owner().IsSubmodule() && + DoesScopeContain(&useUltimate.owner(), localUltimate)) { + // Within a submodule, USE'ing a symbol that comes indirectly + // from the ancestor module, e.g. foo in: + // MODULE m1; INTERFACE; MODULE SUBROUTINE foo; END INTERFACE; END + // MODULE m2; USE m1; END + // SUBMODULE m1(sm); USE m2; CONTAINS; MODULE PROCEDURE foo; END; END + return; // ok, ignore it + } if (localUltimate.name() == useUltimate.name() && localUltimate.owner().IsModule() && useUltimate.owner().IsModule() && @@ -9737,7 +9812,7 @@ void ResolveNamesVisitor::ResolveSpecificationParts(ProgramTree &node) { }, node.stmt()); Walk(node.spec()); - bool inDeviceSubprogram = false; + bool inDeviceSubprogram{false}; // If this is a function, convert result to an object. This is to prevent the // result from being converted later to a function symbol if it is called // inside the function. 
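Spelling out the compressed example from the DoAddUse comment above, this is the now-accepted pattern in which a submodule re-imports, through another module, a symbol that already reaches it from its ancestor module (a sketch, not a test from this patch):

    module m1
      interface
        module subroutine foo
        end subroutine
      end interface
    end module

    module m2
      use m1
    end module

    submodule (m1) sm
      use m2   ! foo arrives again, indirectly from m1; now ignored as benign
    contains
      module procedure foo
      end procedure
    end submodule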
diff --git flang/lib/Semantics/tools.cpp flang/lib/Semantics/tools.cpp index 7dc2defedfba..7544731a682e 100644 --- flang/lib/Semantics/tools.cpp +++ flang/lib/Semantics/tools.cpp @@ -137,12 +137,6 @@ Tristate IsDefinedAssignment( if (!lhsType || !rhsType) { return Tristate::No; // error or rhs is untyped } - if (lhsType->IsUnlimitedPolymorphic()) { - return Tristate::No; - } - if (rhsType->IsUnlimitedPolymorphic()) { - return Tristate::Maybe; - } TypeCategory lhsCat{lhsType->category()}; TypeCategory rhsCat{rhsType->category()}; if (rhsRank > 0 && lhsRank != rhsRank) { diff --git flang/lib/Support/Fortran-features.cpp flang/lib/Support/Fortran-features.cpp index be84072ae6a5..be07c26e3c9e 100644 --- flang/lib/Support/Fortran-features.cpp +++ flang/lib/Support/Fortran-features.cpp @@ -79,7 +79,6 @@ LanguageFeatureControl::LanguageFeatureControl() { warnUsage_.set(UsageWarning::ImplicitShared); warnUsage_.set(UsageWarning::IndexVarRedefinition); warnUsage_.set(UsageWarning::IncompatibleImplicitInterfaces); - warnUsage_.set(UsageWarning::BadTypeForTarget); warnUsage_.set(UsageWarning::VectorSubscriptFinalization); warnUsage_.set(UsageWarning::UndefinedFunctionResult); warnUsage_.set(UsageWarning::UselessIomsg); diff --git flang/module/cudadevice.f90 flang/module/cudadevice.f90 index af516a1866fa..00e8b3db73ad 100644 --- flang/module/cudadevice.f90 +++ flang/module/cudadevice.f90 @@ -77,6 +77,22 @@ implicit none ! Math API + interface __fadd_rn + attributes(device) real function __fadd_rn(a,b) bind(c, name='__nv_fadd_rn') + !dir$ ignore_tkr (d) a, (d) b + real, value :: a, b + end function + end interface + public :: __fadd_rn + + interface __fadd_rz + attributes(device) real function __fadd_rz(a,b) bind(c, name='__nv_fadd_rz') + !dir$ ignore_tkr (d) a, (d) b + real, value :: a, b + end function + end interface + public :: __fadd_rz + interface attributes(device) function __fadd_rd(x, y) bind(c, name='__nv_fadd_rd') real, intent(in), value :: x, y @@ -93,6 +109,343 @@ implicit none end interface public :: __fadd_ru + interface __fmul_rn + attributes(device) real function __fmul_rn(a,b) bind(c, name='__nv_fmul_rn') + !dir$ ignore_tkr (d) a, (d) b + real, value :: a, b + end function + end interface + public :: __fmul_rn + + interface __fmul_rz + attributes(device) real function __fmul_rz(a,b) bind(c, name='__nv_fmul_rz') + !dir$ ignore_tkr (d) a, (d) b + real, value :: a, b + end function + end interface + public :: __fmul_rz + + interface __fmul_ru + attributes(device) real function __fmul_ru(a,b) bind(c, name='__nv_fmul_ru') + !dir$ ignore_tkr (d) a, (d) b + real, value :: a, b + end function + end interface + public :: __fmul_ru + + interface __fmul_rd + attributes(device) real function __fmul_rd(a,b) bind(c, name='__nv_fmul_rd') + !dir$ ignore_tkr (d) a, (d) b + real, value :: a, b + end function + end interface + public :: __fmul_rd + + interface __fmaf_rn + attributes(device) real function __fmaf_rn(a,b,c) bind(c, name='__nv_fmaf_rn') + !dir$ ignore_tkr (d) a, (d) b, (d) c + real, value :: a, b, c + end function + end interface + public :: __fmaf_rn + + interface __fmaf_rz + attributes(device) real function __fmaf_rz(a,b,c) bind(c, name='__nv_fmaf_rz') + !dir$ ignore_tkr (d) a, (d) b, (d) c + real, value :: a, b, c + end function + end interface + public :: __fmaf_rz + + interface __fmaf_ru + attributes(device) real function __fmaf_ru(a,b,c) bind(c, name='__nv_fmaf_ru') + !dir$ ignore_tkr (d) a, (d) b, (d) c + real, value :: a, b, c + end function + end interface + public :: __fmaf_ru + + 
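These wrappers expose the libdevice explicitly rounded operations to CUDA Fortran device code; the `!dir$ ignore_tkr (d)` directives let actual arguments match regardless of the device attribute. A hypothetical device-side use (assuming the usual `use cudadevice` access; not from this patch):

    attributes(device) subroutine avg(a, b, r)
      use cudadevice
      real, value :: a, b
      real, intent(out) :: r
      ! Add with round-toward-zero, then halve with round-to-nearest.
      r = __fmul_rn(__fadd_rz(a, b), 0.5)
    end subroutine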
interface __fmaf_rd + attributes(device) real function __fmaf_rd(a,b,c) bind(c, name='__nv_fmaf_rd') + !dir$ ignore_tkr (d) a, (d) b, (d) c + real, value :: a, b, c + end function + end interface + public :: __fmaf_rd + + interface __frcp_rn + attributes(device) real function __frcp_rn(a) bind(c, name='__nv_frcp_rn') + !dir$ ignore_tkr (d) a + real, value :: a + end function + end interface + public :: __frcp_rn + + interface __frcp_rz + attributes(device) real function __frcp_rz(a) bind(c, name='__nv_frcp_rz') + !dir$ ignore_tkr (d) a + real, value :: a + end function + end interface + public :: __frcp_rz + + interface __frcp_ru + attributes(device) real function __frcp_ru(a) bind(c, name='__nv_frcp_ru') + !dir$ ignore_tkr (d) a + real, value :: a + end function + end interface + public :: __frcp_ru + + interface __frcp_rd + attributes(device) real function __frcp_rd(a) bind(c, name='__nv_frcp_rd') + !dir$ ignore_tkr (d) a + real, value :: a + end function + end interface + public :: __frcp_rd + + interface __fsqrt_rn + attributes(device) real function __fsqrt_rn(a) bind(c, name='__nv_fsqrt_rn') + !dir$ ignore_tkr (d) a + real, value :: a + end function + end interface + public :: __fsqrt_rn + + interface __fsqrt_rz + attributes(device) real function __fsqrt_rz(a) bind(c, name='__nv_fsqrt_rz') + !dir$ ignore_tkr (d) a + real, value :: a + end function + end interface + public :: __fsqrt_rz + + interface __fsqrt_ru + attributes(device) real function __fsqrt_ru(a) bind(c, name='__nv_fsqrt_ru') + !dir$ ignore_tkr (d) a + real, value :: a + end function + end interface + public :: __fsqrt_ru + + interface __fsqrt_rd + attributes(device) real function __fsqrt_rd(a) bind(c, name='__nv_fsqrt_rd') + !dir$ ignore_tkr (d) a + real, value :: a + end function + end interface + public :: __fsqrt_rd + + interface __fdiv_rn + attributes(device) real function __fdiv_rn(a,b) bind(c, name='__nv_fdiv_rn') + !dir$ ignore_tkr (d) a, (d) b + real, value :: a, b + end function + end interface + public :: __fdiv_rn + + interface __fdiv_rz + attributes(device) real function __fdiv_rz(a,b) bind(c, name='__nv_fdiv_rz') + !dir$ ignore_tkr (d) a, (d) b + real, value :: a, b + end function + end interface + public :: __fdiv_rz + + interface __fdiv_ru + attributes(device) real function __fdiv_ru(a,b) bind(c, name='__nv_fdiv_ru') + !dir$ ignore_tkr (d) a, (d) b + real, value :: a, b + end function + end interface + public :: __fdiv_ru + + interface __fdiv_rd + attributes(device) real function __fdiv_rd(a,b) bind(c, name='__nv_fdiv_rd') + !dir$ ignore_tkr (d) a, (d) b + real, value :: a, b + end function + end interface + public :: __fdiv_rd + + interface __dadd_rn + attributes(device) real(8) function __dadd_rn(a,b) bind(c, name='__nv_dadd_rn') + !dir$ ignore_tkr (d) a, (d) b + real(8), value :: a, b + end function + end interface + public :: __dadd_rn + + interface __dadd_rz + attributes(device) real(8) function __dadd_rz(a,b) bind(c, name='__nv_dadd_rz') + !dir$ ignore_tkr (d) a, (d) b + real(8), value :: a, b + end function + end interface + public :: __dadd_rz + + interface __dadd_ru + attributes(device) real(8) function __dadd_ru(a,b) bind(c, name='__nv_dadd_ru') + !dir$ ignore_tkr (d) a, (d) b + real(8), value :: a, b + end function + end interface + public :: __dadd_ru + + interface __dadd_rd + attributes(device) real(8) function __dadd_rd(a,b) bind(c, name='__nv_dadd_rd') + !dir$ ignore_tkr (d) a, (d) b + real(8), value :: a, b + end function + end interface + public :: __dadd_rd + + interface __dmul_rn + 
attributes(device) real(8) function __dmul_rn(a,b) bind(c, name='__nv_dmul_rn') + !dir$ ignore_tkr (d) a, (d) b + real(8), value :: a, b + end function + end interface + public :: __dmul_rn + + interface __dmul_rz + attributes(device) real(8) function __dmul_rz(a,b) bind(c, name='__nv_dmul_rz') + !dir$ ignore_tkr (d) a, (d) b + real(8), value :: a, b + end function + end interface + public :: __dmul_rz + + interface __dmul_ru + attributes(device) real(8) function __dmul_ru(a,b) bind(c, name='__nv_dmul_ru') + !dir$ ignore_tkr (d) a, (d) b + real(8), value :: a, b + end function + end interface + public :: __dmul_ru + + interface __dmul_rd + attributes(device) real(8) function __dmul_rd(a,b) bind(c, name='__nv_dmul_rd') + !dir$ ignore_tkr (d) a, (d) b + real(8), value :: a, b + end function + end interface + public :: __dmul_rd + + interface __fma_rn + attributes(device) real(8) function __fma_rn(a,b,c) bind(c, name='__nv_fma_rn') + !dir$ ignore_tkr (d) a, (d) b + real(8), value :: a, b, c + end function + end interface + public :: __fma_rn + + interface __fma_rz + attributes(device) real(8) function __fma_rz(a,b,c) bind(c, name='__nv_fma_rz') + !dir$ ignore_tkr (d) a, (d) b + real(8), value :: a, b, c + end function + end interface + public :: __fma_rz + + interface __fma_ru + attributes(device) real(8) function __fma_ru(a,b,c) bind(c, name='__nv_fma_ru') + !dir$ ignore_tkr (d) a, (d) b + real(8), value :: a, b, c + end function + end interface + public :: __fma_ru + + interface __fma_rd + attributes(device) real(8) function __fma_rd(a,b,c) bind(c, name='__nv_fma_rd') + !dir$ ignore_tkr (d) a, (d) b + real(8), value :: a, b, c + end function + end interface + public :: __fma_rd + + interface rsqrt + attributes(device) real(4) function rsqrtf(x) bind(c,name='__nv_rsqrtf') + real(4), value :: x + end function + attributes(device) real(8) function rsqrt(x) bind(c,name='__nv_rsqrt') + real(8), value :: x + end function + end interface + public :: rsqrt + + interface signbit + attributes(device) integer(4) function signbitf(x) bind(c,name='__nv_signbitf') + real(4), value :: x + end function + attributes(device) integer(4) function signbit(x) bind(c,name='__nv_signbitd') + real(8), value :: x + end function + end interface + public :: signbit + + interface sincos + attributes(device) subroutine sincosf(x, y, z) bind(c,name='__nv_sincosf') + real(4), value :: x + real(4), device :: y + real(4), device :: z + end subroutine + attributes(device) subroutine sincos(x, y, z) bind(c,name='__nv_sincos') + real(8), value :: x + real(8), device :: y + real(8), device :: z + end subroutine + end interface + public :: sincos + + interface sincospi + attributes(device) subroutine sincospif(x, y, z) bind(c,name='__nv_sincospif') + real(4), value :: x + real(4), device :: y + real(4), device :: z + end subroutine + attributes(device) subroutine sincospi(x, y, z) bind(c,name='__nv_sincospi') + real(8), value :: x + real(8), device :: y + real(8), device :: z + end subroutine + end interface + public :: sincospi + + interface mulhi + attributes(device) integer function __mulhi(i,j) bind(c,name='__nv_mulhi') + !dir$ ignore_tkr (d) i, (d) j + integer, value :: i,j + end function + end interface + public :: mulhi + + interface umulhi + attributes(device) integer function __umulhi(i,j) bind(c,name='__nv_umulhi') + !dir$ ignore_tkr (d) i, (d) j + integer, value :: i,j + end function + end interface + public :: umulhi + + interface mul64hi + attributes(device) integer(8) function __mul64hi(i,j) 
bind(c,name='__nv_mul64hi') + !dir$ ignore_tkr (d) i, (d) j + integer(8), value :: i,j + end function + end interface + public :: mul64hi + + interface umul64hi + attributes(device) integer(8) function __umul64hi(i,j) bind(c,name='__nv_umul64hi') + !dir$ ignore_tkr (d) i, (d) j + integer(8), value :: i,j + end function + end interface + public :: umul64hi + + ! Atomic Operations interface atomicadd diff --git flang/runtime/CMakeLists.txt flang/runtime/CMakeLists.txt index 04e120cc5d37..974488ba8c39 100644 --- flang/runtime/CMakeLists.txt +++ flang/runtime/CMakeLists.txt @@ -83,10 +83,15 @@ endfunction () # Runtime includes are in Flang-RT's source dir. include_directories(BEFORE "${FLANG_RT_SOURCE_DIR}/include") +set(linked_libraries "") + # function checks find_package(Backtrace) set(HAVE_BACKTRACE ${Backtrace_FOUND}) set(BACKTRACE_HEADER ${Backtrace_HEADER}) +if(HAVE_BACKTRACE) + list(APPEND linked_libraries ${Backtrace_LIBRARY}) +endif() include(CheckCXXSymbolExists) include(CheckCXXSourceCompiles) @@ -299,12 +304,16 @@ runtime_source_files(sources SUBDIR "runtime") if (NOT DEFINED MSVC) add_flang_library(flang_rt.runtime ${sources} + LINK_LIBS + ${linked_libraries} INSTALL_WITH_TOOLCHAIN ) else() add_flang_library(flang_rt.runtime ${sources} + LINK_LIBS + ${linked_libraries} ) set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreaded) add_flang_library(flang_rt.runtime.static ${sources} diff --git flang/test/Analysis/AliasAnalysis/alias-analysis-omp-private-allocatable.mlir flang/test/Analysis/AliasAnalysis/alias-analysis-omp-private-allocatable.mlir index 5116622364fa..e19885c71a9f 100644 --- flang/test/Analysis/AliasAnalysis/alias-analysis-omp-private-allocatable.mlir +++ flang/test/Analysis/AliasAnalysis/alias-analysis-omp-private-allocatable.mlir @@ -20,15 +20,7 @@ // CHECK: ar2#0 <-> ar1#1: NoAlias // CHECK: ar2#1 <-> ar1#1: NoAlias -omp.private {type = private} @_QFmysubEar1_private_ref_box_heap_Uxf64 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>> alloc { -^bb0(%arg0: !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>): - %0 = fir.alloca !fir.box<!fir.heap<!fir.array<?xf64>>> {bindc_name = "ar1", pinned, uniq_name = "_QFmysubEar1"} - %5:2 = hlfir.declare %0 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFmysubEar1"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>) - omp.yield(%5#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>) -} dealloc { -^bb0(%arg0: !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>): - omp.yield -} +omp.private {type = private} @_QFmysubEar1_private_ref_box_heap_Uxf64 : !fir.box<!fir.heap<!fir.array<?xf64>>> func.func @testPrivateAllocatable(%arg0: !fir.ref<i32> {fir.bindc_name = "ns"}, %arg1: !fir.ref<i32> {fir.bindc_name = "ne"}) { %0 = fir.dummy_scope : !fir.dscope %1 = fir.alloca !fir.box<!fir.heap<!fir.array<?xf64>>> {bindc_name = "ar1", uniq_name = "_QFmysubEar1"} diff --git flang/test/Analysis/AliasAnalysis/alias-analysis-omp-teams-distribute-private-ptr.mlir flang/test/Analysis/AliasAnalysis/alias-analysis-omp-teams-distribute-private-ptr.mlir index 78207d21c45b..b60fbe4152fc 100644 --- flang/test/Analysis/AliasAnalysis/alias-analysis-omp-teams-distribute-private-ptr.mlir +++ flang/test/Analysis/AliasAnalysis/alias-analysis-omp-teams-distribute-private-ptr.mlir @@ -17,18 +17,8 @@ // CHECK-LABEL: Testing : "_QQmain" // CHECK-DAG: ptrA#0 <-> ArrayA#0: MayAlias -omp.private {type = private} @_QFEi_private_ref_i32 : !fir.ref<i32> alloc { 
-^bb0(%arg0: !fir.ref<i32>): - %0 = fir.alloca i32 {bindc_name = "i", pinned, uniq_name = "_QFEi"} - %1:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) - omp.yield(%1#0 : !fir.ref<i32>) -} -omp.private {type = firstprivate} @_QFEptra_firstprivate_ref_box_ptr_Uxi32 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>> alloc { -^bb0(%arg0: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>): - %0 = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>> {bindc_name = "ptra", pinned, uniq_name = "_QFEptra"} - %1:2 = hlfir.declare %0 {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFEptra"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) - omp.yield(%1#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) -} copy { +omp.private {type = private} @_QFEi_private_ref_i32 : i32 +omp.private {type = firstprivate} @_QFEptra_firstprivate_ref_box_ptr_Uxi32 : !fir.box<!fir.ptr<!fir.array<?xi32>>> copy { ^bb0(%arg0: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, %arg1: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>): %0 = fir.load %arg0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>> fir.store %0 to %arg1 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>> diff --git flang/test/Analysis/AliasAnalysis/alias-analysis-omp-teams-distribute-private.mlir flang/test/Analysis/AliasAnalysis/alias-analysis-omp-teams-distribute-private.mlir index 4668b2c215c8..7f60a6fa0803 100644 --- flang/test/Analysis/AliasAnalysis/alias-analysis-omp-teams-distribute-private.mlir +++ flang/test/Analysis/AliasAnalysis/alias-analysis-omp-teams-distribute-private.mlir @@ -21,26 +21,10 @@ // CHECK-DAG: tmp_private_array#0 <-> unnamed_array#0: NoAlias // CHECK-DAG: tmp_private_array#1 <-> unnamed_array#0: NoAlias -omp.private {type = private} @_QFEi_private_ref_i32 : !fir.ref<i32> alloc { -^bb0(%arg0: !fir.ref<i32>): - %0 = fir.alloca i32 {bindc_name = "i", pinned, uniq_name = "_QFEi"} - %1:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) - omp.yield(%1#0 : !fir.ref<i32>) -} -omp.private {type = private} @_QFEj_private_ref_i32 : !fir.ref<i32> alloc { -^bb0(%arg0: !fir.ref<i32>): - %0 = fir.alloca i32 {bindc_name = "j", pinned, uniq_name = "_QFEj"} - %1:2 = hlfir.declare %0 {uniq_name = "_QFEj"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) - omp.yield(%1#0 : !fir.ref<i32>) -} -omp.private {type = private} @_QFEtmp_private_ref_2xi32 : !fir.ref<!fir.array<2xi32>> alloc { -^bb0(%arg0: !fir.ref<!fir.array<2xi32>>): - %c2 = arith.constant 2 : index - %0 = fir.alloca !fir.array<2xi32> {bindc_name = "tmp", pinned, uniq_name = "_QFEtmp"} - %1 = fir.shape %c2 : (index) -> !fir.shape<1> - %2:2 = hlfir.declare %0(%1) {uniq_name = "_QFEtmp"} : (!fir.ref<!fir.array<2xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<2xi32>>, !fir.ref<!fir.array<2xi32>>) - omp.yield(%2#0 : !fir.ref<!fir.array<2xi32>>) -} +omp.private {type = private} @_QFEi_private_ref_i32 : i32 +omp.private {type = private} @_QFEj_private_ref_i32 : i32 +omp.private {type = private} @_QFEtmp_private_ref_2xi32 : !fir.array<2xi32> + func.func @_QQmain() attributes {fir.bindc_name = "main"} { %0 = fir.address_of(@_QFEarraya) : !fir.ref<!fir.array<10x10xi32>> %c10 = arith.constant 10 : index diff --git flang/test/Evaluate/bug124618.f90 flang/test/Evaluate/bug124618.f90 new file mode 100644 index 000000000000..939985e588af --- /dev/null +++ flang/test/Evaluate/bug124618.f90 @@ -0,0 +1,5 @@ +! 
RUN: %flang_fc1 -fsyntax-only %s 2>&1 | FileCheck --allow-empty %s +!CHECK-NOT: error: +real x +print *, char(48, kind=size([x])) ! folds down to 1 +end diff --git flang/test/Evaluate/rewrite-out_of_range.F90 flang/test/Evaluate/rewrite-out_of_range.F90 index b5df610ff2fb..9196bba591e6 100644 --- flang/test/Evaluate/rewrite-out_of_range.F90 +++ flang/test/Evaluate/rewrite-out_of_range.F90 @@ -1,5 +1,7 @@ ! Tests rewriting of OUT_OF_RANGE() -! RUN: %flang_fc1 -fdebug-unparse -cpp %s 2>&1 | FileCheck %s --check-prefixes=CHECK%if target=x86_64{{.*}} %{,CHECK-X86-64%} +! REQUIRES: target=x86_64{{.*}} +! REQUIRES: system-linux +! RUN: %flang_fc1 -fdebug-unparse -cpp %s 2>&1 | FileCheck %s --check-prefixes=CHECK%if target=x86_64{{.*}} %{%if system-linux %{,CHECK-X86-64%}%} logical round diff --git flang/test/Fir/CUDA/cuda-allocate.fir flang/test/Fir/CUDA/cuda-allocate.fir index 08573110821c..095ad92d5deb 100644 --- flang/test/Fir/CUDA/cuda-allocate.fir +++ flang/test/Fir/CUDA/cuda-allocate.fir @@ -19,7 +19,7 @@ func.func @_QPsub1() { // CHECK: %[[DESC:.*]] = fir.convert %[[DESC_RT_CALL]] : (!fir.ref<!fir.box<none>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> // CHECK: %[[DECL_DESC:.*]]:2 = hlfir.declare %[[DESC]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) // CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>> -// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32 +// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32 // CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>> // CHECK: %{{.*}} = fir.call @_FortranAAllocatableDeallocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32 @@ -47,7 +47,7 @@ func.func @_QPsub3() { // CHECK: %[[A:.*]]:2 = hlfir.declare %[[A_ADDR]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMmod1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) // CHECK: %[[A_BOX:.*]] = fir.convert %[[A]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>> -// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32 +// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32 // CHECK: %[[A_BOX:.*]] = fir.convert %[[A]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>> // CHECK: fir.call @_FortranACUFAllocatableDeallocate(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1,
!fir.box<none>, !fir.ref<i8>, i32) -> i32 @@ -87,7 +87,7 @@ func.func @_QPsub5() { } // CHECK-LABEL: func.func @_QPsub5() -// CHECK: fir.call @_FortranACUFAllocatableAllocate({{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32 +// CHECK: fir.call @_FortranACUFAllocatableAllocate({{.*}}) : (!fir.ref<!fir.box<none>>, i64, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32 // CHECK: fir.call @_FortranAAllocatableDeallocate({{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32 @@ -118,7 +118,7 @@ func.func @_QQsub6() attributes {fir.bindc_name = "test"} { // CHECK: %[[B:.*]]:2 = hlfir.declare %[[B_ADDR]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMdataEb"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) // CHECK: _FortranAAllocatableSetBounds // CHECK: %[[B_BOX:.*]] = fir.convert %[[B]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<none>> -// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[B_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32 +// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[B_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32 func.func @_QPallocate_source() { @@ -142,7 +142,7 @@ func.func @_QPallocate_source() { // CHECK: %[[SOURCE:.*]] = fir.load %[[DECL_HOST]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>> // CHECK: %[[DEV_CONV:.*]] = fir.convert %[[DECL_DEV]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<none>> // CHECK: %[[SOURCE_CONV:.*]] = fir.convert %[[SOURCE]] : (!fir.box<!fir.heap<!fir.array<?x?xf32>>>) -> !fir.box<none> -// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%[[DEV_CONV]], %[[SOURCE_CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.box<none>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32 +// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%[[DEV_CONV]], %[[SOURCE_CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.box<none>, i64, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32 fir.global @_QMmod1Ea_d {data_attr = #cuf.cuda<device>} : !fir.box<!fir.heap<!fir.array<?x?xf32>>> { @@ -179,7 +179,7 @@ func.func @_QQallocate_stream() { // CHECK: %[[STREAM_ALLOCA:.*]] = fir.alloca i64 {bindc_name = "stream1", uniq_name = "_QFEstream1"} // CHECK: %[[STREAM:.*]] = fir.declare %[[STREAM_ALLOCA]] {uniq_name = "_QFEstream1"} : (!fir.ref<i64>) -> !fir.ref<i64> // CHECK: %[[STREAM_LOAD:.*]] = fir.load %[[STREAM]] : !fir.ref<i64> -// CHECK: fir.call @_FortranACUFAllocatableAllocate(%{{.*}}, %[[STREAM_LOAD]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32 +// CHECK: fir.call @_FortranACUFAllocatableAllocate(%{{.*}}, %[[STREAM_LOAD]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32 func.func @_QPp_alloc() { @@ -255,4 +255,19 @@ func.func @_QMmod1Ppointer_source_global() { // CHECK-LABEL: func.func @_QMmod1Ppointer_source_global() // CHECK: fir.call @_FortranACUFPointerAllocateSourceSync 
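The new _QQpinned test added below exercises the extra !fir.ref<i1> "pinned" argument that this patch threads through the _FortranACUFAllocatableAllocate* entry points. As a rough, hedged sketch (variable names and extents are illustrative assumptions, not taken from the patch), the CUDA Fortran source pattern that lowers to such IR would look like:

program testasync
  real, allocatable, pinned :: a(:)   ! host-pinned allocatable
  logical :: pinnedflag
  integer :: istat
  ! pinned= receives whether host-pinning succeeded; stat= produces the
  ! hasStat form seen on the cuf.allocate op in the test below
  allocate(a(100), pinned=pinnedflag, stat=istat)
end program

The logical pinned flag is converted to !fir.ref<i1> and handed to the runtime allocation call, which is why every _FortranACUFAllocatableAllocate* signature in the CHECK lines above gained one argument.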
+func.func @_QQpinned() attributes {fir.bindc_name = "testasync"} { + %0 = cuf.alloc !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", data_attr = #cuf.cuda<pinned>, uniq_name = "_QFEa"} -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> + %4 = fir.declare %0 {data_attr = #cuf.cuda<pinned>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> + %13 = fir.alloca !fir.logical<4> {bindc_name = "pinnedflag", uniq_name = "_QFEpinnedflag"} + %14 = fir.declare %13 {uniq_name = "_QFEpinnedflag"} : (!fir.ref<!fir.logical<4>>) -> !fir.ref<!fir.logical<4>> + %18 = cuf.allocate %4 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> pinned(%14 : !fir.ref<!fir.logical<4>>) {data_attr = #cuf.cuda<pinned>, hasStat} -> i32 + return +} + +// CHECK-LABEL: func.func @_QQpinned() attributes {fir.bindc_name = "testasync"} { +// CHECK: %[[PINNED:.*]] = fir.alloca !fir.logical<4> {bindc_name = "pinnedflag", uniq_name = "_QFEpinnedflag"} +// CHECK: %[[DECL_PINNED:.*]] = fir.declare %[[PINNED]] {uniq_name = "_QFEpinnedflag"} : (!fir.ref<!fir.logical<4>>) -> !fir.ref<!fir.logical<4>> +// CHECK: %[[CONV_PINNED:.*]] = fir.convert %[[DECL_PINNED]] : (!fir.ref<!fir.logical<4>>) -> !fir.ref<i1> +// CHECK: fir.call @_FortranACUFAllocatableAllocate(%{{.*}}, %{{.*}}, %[[CONV_PINNED]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32 + } // end of module diff --git flang/test/Fir/boxproc-openmp.fir flang/test/Fir/boxproc-openmp.fir index 9db053ad93c6..4f62b0a4a42b 100644 --- flang/test/Fir/boxproc-openmp.fir +++ flang/test/Fir/boxproc-openmp.fir @@ -3,26 +3,13 @@ // Check minimally, only arguments, yields and the private types. 
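For orientation, a hedged Fortran sketch (assumed, not part of the patch) of the kind of source these boxproc omp.private declarations are reduced from: a derived type with a procedure-pointer component, privatized on an OpenMP parallel region.

subroutine sub1
  type t
    procedure(), pointer, nopass :: p1   ! lowers to !fir.boxproc<() -> ()>
  end type t
  type(t) :: t1
  !$omp parallel private(t1)             ! yields an omp.private op like the one checked below
  !$omp end parallel
end subroutine sub1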
// Test a private declaration with one region (alloc) -//CHECK: omp.private {type = private} @_QFsub1Et1_private_ref_rec__QFsub1Tt : !fir.ref<!fir.type<_QFsub1TtUnboxProc{p1:() -> ()}>> alloc { -omp.private {type = private} @_QFsub1Et1_private_ref_rec__QFsub1Tt : !fir.ref<!fir.type<_QFsub1Tt{p1:!fir.boxproc<() -> ()>}>> alloc { -//CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.type<_QFsub1TtUnboxProc{p1:() -> ()}>>): -^bb0(%arg0: !fir.ref<!fir.type<_QFsub1Tt{p1:!fir.boxproc<() -> ()>}>>): - %c1_i32 = arith.constant 1 : i32 - %0 = fir.alloca !fir.type<_QFsub1Tt{p1:!fir.boxproc<() -> ()>}> {bindc_name = "t1", pinned, uniq_name = "_QFsub1Et1"} - %1 = fir.declare %0 {uniq_name = "_QFsub1Et1"} : (!fir.ref<!fir.type<_QFsub1Tt{p1:!fir.boxproc<() -> ()>}>>) -> !fir.ref<!fir.type<_QFsub1Tt{p1:!fir.boxproc<() -> ()>}>> - %2 = fir.embox %1 : (!fir.ref<!fir.type<_QFsub1Tt{p1:!fir.boxproc<() -> ()>}>>) -> !fir.box<!fir.type<_QFsub1Tt{p1:!fir.boxproc<() -> ()>}>> - %3 = fir.address_of(@_QQclXea6256ba131ddd9c2210e68030a0edd3) : !fir.ref<!fir.char<1,49>> - %4 = fir.convert %2 : (!fir.box<!fir.type<_QFsub1Tt{p1:!fir.boxproc<() -> ()>}>>) -> !fir.box<none> - %5 = fir.convert %3 : (!fir.ref<!fir.char<1,49>>) -> !fir.ref<i8> - fir.call @_FortranAInitialize(%4, %5, %c1_i32) fastmath<contract> : (!fir.box<none>, !fir.ref<i8>, i32) -> () -//CHECK: omp.yield(%{{.*}} : !fir.ref<!fir.type<_QFsub1TtUnboxProc{p1:() -> ()}>>) - omp.yield(%1 : !fir.ref<!fir.type<_QFsub1Tt{p1:!fir.boxproc<() -> ()>}>>) -} +//CHECK: omp.private {type = private} @_QFsub1Et1_private_rec__QFsub1Tt : !fir.type<_QFsub1TtUnboxProc{p1:() -> ()}>{{$}} +omp.private {type = private} @_QFsub1Et1_private_rec__QFsub1Tt : !fir.type<_QFsub1Tt{p1:!fir.boxproc<() -> ()>}> func.func @_QPsub1() { %0 = fir.alloca !fir.type<_QFsub1Tt{p1:!fir.boxproc<() -> ()>}> {bindc_name = "t1", uniq_name = "_QFsub1Et1"} %1 = fir.declare %0 {uniq_name = "_QFsub1Et1"} : (!fir.ref<!fir.type<_QFsub1Tt{p1:!fir.boxproc<() -> ()>}>>) -> !fir.ref<!fir.type<_QFsub1Tt{p1:!fir.boxproc<() -> ()>}>> -//CHECK: omp.parallel private(@_QFsub1Et1_private_ref_rec__QFsub1Tt %{{.*}} -> %{{.*}} : !fir.ref<!fir.type<_QFsub1TtUnboxProc{p1:() -> ()}>>) { - omp.parallel private(@_QFsub1Et1_private_ref_rec__QFsub1Tt %1 -> %arg0 : !fir.ref<!fir.type<_QFsub1Tt{p1:!fir.boxproc<() -> ()>}>>) { +//CHECK: omp.parallel private(@_QFsub1Et1_private_rec__QFsub1Tt %{{.*}} -> %{{.*}} : !fir.ref<!fir.type<_QFsub1TtUnboxProc{p1:() -> ()}>>) { + omp.parallel private(@_QFsub1Et1_private_rec__QFsub1Tt %1 -> %arg0 : !fir.ref<!fir.type<_QFsub1Tt{p1:!fir.boxproc<() -> ()>}>>) { %2 = fir.declare %arg0 {uniq_name = "_QFsub1Et1"} : (!fir.ref<!fir.type<_QFsub1Tt{p1:!fir.boxproc<() -> ()>}>>) -> !fir.ref<!fir.type<_QFsub1Tt{p1:!fir.boxproc<() -> ()>}>> omp.terminator } @@ -31,11 +18,11 @@ func.func @_QPsub1() { // Test a private declaration with all regions (alloc, copy, dealloc) -//CHECK: omp.private {type = firstprivate} @_QFsub2Et1_firstprivate_ref_box_heap_rec__QFsub2Tt : -//CHECK-SAME: !fir.ref<!fir.box<!fir.heap<!fir.type<_QFsub2TtUnboxProc{p1:() -> ()}>>>> alloc { -omp.private {type = firstprivate} @_QFsub2Et1_firstprivate_ref_box_heap_rec__QFsub2Tt : !fir.ref<!fir.box<!fir.heap<!fir.type<_QFsub2Tt{p1:!fir.boxproc<() -> ()>}>>>> alloc { -//CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.box<!fir.heap<!fir.type<_QFsub2TtUnboxProc{p1:() -> ()}>>>>): -^bb0(%arg0: !fir.ref<!fir.box<!fir.heap<!fir.type<_QFsub2Tt{p1:!fir.boxproc<() -> ()>}>>>>): +//CHECK: omp.private {type = firstprivate} @_QFsub2Et1_firstprivate_box_heap_rec__QFsub2Tt : 
+//CHECK-SAME: [[TYPE:!fir.box<!fir.heap<!fir.type<_QFsub2TtUnboxProc\{p1:\(\) -> \(\)\}>>>]] init { +omp.private {type = firstprivate} @_QFsub2Et1_firstprivate_box_heap_rec__QFsub2Tt : !fir.box<!fir.heap<!fir.type<_QFsub2Tt{p1:!fir.boxproc<() -> ()>}>>> init { +//CHECK: ^bb0(%{{.*}}: !fir.ref<[[TYPE]]>, %{{.*}}: !fir.ref<[[TYPE]]>): +^bb0(%arg0: !fir.ref<!fir.box<!fir.heap<!fir.type<_QFsub2Tt{p1:!fir.boxproc<() -> ()>}>>>>, %arg1:!fir.ref<!fir.box<!fir.heap<!fir.type<_QFsub2Tt{p1:!fir.boxproc<() -> ()>}>>>>): %0 = fir.alloca !fir.box<!fir.heap<!fir.type<_QFsub2Tt{p1:!fir.boxproc<() -> ()>}>>> {bindc_name = "t1", pinned, uniq_name = "_QFsub2Et1"} %1 = fir.declare %0 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub2Et1"} : (!fir.ref<!fir.box<!fir.heap<!fir.type<_QFsub2Tt{p1:!fir.boxproc<() -> ()>}>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.type<_QFsub2Tt{p1:!fir.boxproc<() -> ()>}>>>> //CHECK: omp.yield(%{{.*}} : !fir.ref<!fir.box<!fir.heap<!fir.type<_QFsub2TtUnboxProc{p1:() -> ()}>>>>) @@ -70,9 +57,9 @@ omp.private {type = firstprivate} @_QFsub2Et1_firstprivate_ref_box_heap_rec__QFs func.func @_QPsub2() { %0 = fir.alloca !fir.box<!fir.heap<!fir.type<_QFsub2Tt{p1:!fir.boxproc<() -> ()>}>>> {bindc_name = "t1", uniq_name = "_QFsub2Et1"} %1 = fir.declare %0 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub2Et1"} : (!fir.ref<!fir.box<!fir.heap<!fir.type<_QFsub2Tt{p1:!fir.boxproc<() -> ()>}>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.type<_QFsub2Tt{p1:!fir.boxproc<() -> ()>}>>>> -//CHECK: omp.parallel private(@_QFsub2Et1_firstprivate_ref_box_heap_rec__QFsub2Tt %{{.*}} -> %{{.*}} : +//CHECK: omp.parallel private(@_QFsub2Et1_firstprivate_box_heap_rec__QFsub2Tt %{{.*}} -> %{{.*}} : //CHECK-SAME: !fir.ref<!fir.box<!fir.heap<!fir.type<_QFsub2TtUnboxProc{p1:() -> ()}>>>>) { - omp.parallel private(@_QFsub2Et1_firstprivate_ref_box_heap_rec__QFsub2Tt %1 -> %arg0 : !fir.ref<!fir.box<!fir.heap<!fir.type<_QFsub2Tt{p1:!fir.boxproc<() -> ()>}>>>>) { + omp.parallel private(@_QFsub2Et1_firstprivate_box_heap_rec__QFsub2Tt %1 -> %arg0 : !fir.ref<!fir.box<!fir.heap<!fir.type<_QFsub2Tt{p1:!fir.boxproc<() -> ()>}>>>>) { %2 = fir.declare %arg0 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub2Et1"} : (!fir.ref<!fir.box<!fir.heap<!fir.type<_QFsub2Tt{p1:!fir.boxproc<() -> ()>}>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.type<_QFsub2Tt{p1:!fir.boxproc<() -> ()>}>>>> omp.terminator } diff --git flang/test/HLFIR/assign-codegen.fir flang/test/HLFIR/assign-codegen.fir index 581d1ab0e773..7e03aa0bd464 100644 --- flang/test/HLFIR/assign-codegen.fir +++ flang/test/HLFIR/assign-codegen.fir @@ -427,3 +427,57 @@ func.func @test_upoly_expr_assignment(%arg0: !fir.class<!fir.array<?xnone>> {fir // CHECK: } // CHECK: return // CHECK: } + +func.func @test_scalar_box(%arg0: f32, %arg1: !fir.box<!fir.ptr<f32>>) { + %x = fir.declare %arg1 {uniq_name = "x"} : (!fir.box<!fir.ptr<f32>>) -> !fir.box<!fir.ptr<f32>> + hlfir.assign %arg0 to %x : f32, !fir.box<!fir.ptr<f32>> + return +} +// CHECK-LABEL: func.func @test_scalar_box( +// CHECK-SAME: %[[VAL_0:.*]]: f32, +// CHECK-SAME: %[[VAL_1:.*]]: !fir.box<!fir.ptr<f32>>) { +// CHECK: %[[VAL_2:.*]] = fir.declare %[[VAL_1]] {uniq_name = "x"} : (!fir.box<!fir.ptr<f32>>) -> !fir.box<!fir.ptr<f32>> +// CHECK: %[[VAL_3:.*]] = fir.box_addr %[[VAL_2]] : (!fir.box<!fir.ptr<f32>>) -> !fir.ptr<f32> +// CHECK: fir.store %[[VAL_0]] to %[[VAL_3]] : !fir.ptr<f32> + +func.func @test_scalar_opt_box(%arg0: f32, %arg1: !fir.box<!fir.ptr<f32>>) { + %x = fir.declare %arg1 
{fortran_attrs = #fir.var_attrs<optional>, uniq_name = "x"} : (!fir.box<!fir.ptr<f32>>) -> !fir.box<!fir.ptr<f32>> + hlfir.assign %arg0 to %x : f32, !fir.box<!fir.ptr<f32>> + return +} +// CHECK-LABEL: func.func @test_scalar_opt_box( +// CHECK-SAME: %[[VAL_0:.*]]: f32, +// CHECK-SAME: %[[VAL_1:.*]]: !fir.box<!fir.ptr<f32>>) { +// CHECK: %[[VAL_2:.*]] = fir.declare %[[VAL_1]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "x"} : (!fir.box<!fir.ptr<f32>>) -> !fir.box<!fir.ptr<f32>> +// CHECK: %[[VAL_3:.*]] = fir.is_present %[[VAL_2]] : (!fir.box<!fir.ptr<f32>>) -> i1 +// CHECK: %[[VAL_4:.*]] = fir.if %[[VAL_3]] -> (!fir.ptr<f32>) { +// CHECK: %[[VAL_5:.*]] = fir.box_addr %[[VAL_2]] : (!fir.box<!fir.ptr<f32>>) -> !fir.ptr<f32> +// CHECK: fir.result %[[VAL_5]] : !fir.ptr<f32> +// CHECK: } else { +// CHECK: %[[VAL_6:.*]] = fir.absent !fir.ptr<f32> +// CHECK: fir.result %[[VAL_6]] : !fir.ptr<f32> +// CHECK: } +// CHECK: fir.store %[[VAL_0]] to %[[VAL_4]] : !fir.ptr<f32> + +func.func @test_scalar_opt_char_box(%arg0: !fir.ref<!fir.char<1,10>>, %arg1: !fir.box<!fir.char<1,?>>) { + %x = fir.declare %arg1 {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "x"} : (!fir.box<!fir.char<1,?>>) -> !fir.box<!fir.char<1,?>> + hlfir.assign %arg0 to %x : !fir.ref<!fir.char<1,10>>, !fir.box<!fir.char<1,?>> + return +} +// CHECK-LABEL: func.func @test_scalar_opt_char_box( +// CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.char<1,10>>, +// CHECK-SAME: %[[VAL_1:.*]]: !fir.box<!fir.char<1,?>>) { +// CHECK: %[[VAL_2:.*]] = fir.declare %[[VAL_1]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "x"} : (!fir.box<!fir.char<1,?>>) -> !fir.box<!fir.char<1,?>> +// CHECK: %[[VAL_3:.*]] = arith.constant 10 : index +// CHECK: %[[VAL_4:.*]] = fir.is_present %[[VAL_2]] : (!fir.box<!fir.char<1,?>>) -> i1 +// CHECK: %[[VAL_5:.*]]:2 = fir.if %[[VAL_4]] -> (!fir.ref<!fir.char<1,?>>, index) { +// CHECK: %[[VAL_6:.*]] = fir.box_addr %[[VAL_2]] : (!fir.box<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,?>> +// CHECK: %[[VAL_7:.*]] = fir.box_elesize %[[VAL_2]] : (!fir.box<!fir.char<1,?>>) -> index +// CHECK: fir.result %[[VAL_6]], %[[VAL_7]] : !fir.ref<!fir.char<1,?>>, index +// CHECK: } else { +// CHECK: %[[VAL_8:.*]] = fir.absent !fir.ref<!fir.char<1,?>> +// CHECK: %[[VAL_9:.*]] = fir.zero_bits index +// CHECK: fir.result %[[VAL_8]], %[[VAL_9]] : !fir.ref<!fir.char<1,?>>, index +// CHECK: } +// ... 
+// CHECK: fir.call @llvm.memmove.p0.p0.i64( diff --git flang/test/HLFIR/maxval-lowering.fir flang/test/HLFIR/maxval-lowering.fir index 7e025c41c6ae..fbf75d905054 100644 --- flang/test/HLFIR/maxval-lowering.fir +++ flang/test/HLFIR/maxval-lowering.fir @@ -216,3 +216,25 @@ func.func @_QPmaxval6(%arg0: !fir.box<!fir.array<?x!fir.char<1,?>>> {fir.bindc_n // CHECK: hlfir.destroy %[[ASEXPR]] // CHECK-NEXT: return // CHECK-NEXT: } + +func.func @_QPmaxval_opt_mask(%arg0: !fir.box<!fir.array<?x?xf32>> {fir.bindc_name = "input"}, %arg1: !fir.ref<!fir.logical<4>> {fir.bindc_name = "mask", fir.optional}) -> f32 { + %0 = fir.dummy_scope : !fir.dscope + %1:2 = hlfir.declare %arg0 dummy_scope %0 {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFmaxval_opt_maskEinput"} : (!fir.box<!fir.array<?x?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>) + %2:2 = hlfir.declare %arg1 dummy_scope %0 {fortran_attrs = #fir.var_attrs<intent_in, optional>, uniq_name = "_QFmaxval_opt_maskEmask"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) + %3 = fir.alloca f32 {bindc_name = "maxval_1", uniq_name = "_QFmaxval_opt_maskEmaxval_1"} + %4:2 = hlfir.declare %3 {uniq_name = "_QFmaxval_opt_maskEmaxval_1"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>) + %5 = fir.is_present %2#0 : (!fir.ref<!fir.logical<4>>) -> i1 + %6 = fir.embox %2#1 : (!fir.ref<!fir.logical<4>>) -> !fir.box<!fir.logical<4>> + %7 = fir.absent !fir.box<!fir.logical<4>> + %8 = arith.select %5, %6, %7 : !fir.box<!fir.logical<4>> + %9 = hlfir.maxval %1#0 mask %8 : (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.logical<4>>) -> f32 + hlfir.assign %9 to %4#0 : f32, !fir.ref<f32> + %10 = fir.load %4#1 : !fir.ref<f32> + return %10 : f32 +} +// CHECK-LABEL: func.func @_QPmaxval_opt_mask( +// CHECK: %[[VAL_10:.*]] = fir.embox %{{.*}} : (!fir.ref<!fir.logical<4>>) -> !fir.box<!fir.logical<4>> +// CHECK: %[[VAL_11:.*]] = fir.absent !fir.box<!fir.logical<4>> +// CHECK: %[[VAL_12:.*]] = arith.select %{{.*}}, %[[VAL_10]], %[[VAL_11]] : !fir.box<!fir.logical<4>> +// CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_12]] : (!fir.box<!fir.logical<4>>) -> !fir.box<none> +// CHECK: %[[VAL_18:.*]] = fir.call @_FortranAMaxvalReal4(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> f32 diff --git flang/test/HLFIR/opt-variable-assign-omp.fir flang/test/HLFIR/opt-variable-assign-omp.fir index 10cb2b4408fb..f3ba53283c74 100755 --- flang/test/HLFIR/opt-variable-assign-omp.fir +++ flang/test/HLFIR/opt-variable-assign-omp.fir @@ -7,8 +7,8 @@ // TODO: we can't currently optimize this assign because alias analysis doesn't // know that the block arguments of the copy region cannot alias. 
-omp.private {type = firstprivate} @_QFFbEl_firstprivate_box_Uxi32 : !fir.ref<!fir.box<!fir.array<?xi32>>> alloc { -^bb0(%arg0: !fir.ref<!fir.box<!fir.array<?xi32>>>): +omp.private {type = firstprivate} @_QFFbEl_firstprivate_box_Uxi32 : !fir.box<!fir.array<?xi32>> init { +^bb0(%arg0: !fir.ref<!fir.box<!fir.array<?xi32>>>, %arg1: !fir.ref<!fir.box<!fir.array<?xi32>>>): %0 = fir.load %arg0 : !fir.ref<!fir.box<!fir.array<?xi32>>> %c0 = arith.constant 0 : index %1:3 = fir.box_dims %0, %c0 : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index) @@ -20,9 +20,8 @@ omp.private {type = firstprivate} @_QFFbEl_firstprivate_box_Uxi32 : !fir.ref<!fi %5:3 = fir.box_dims %0, %c0_0 : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index) %6 = fir.shape_shift %5#0, %5#1 : (index, index) -> !fir.shapeshift<1> %7 = fir.rebox %4#0(%6) : (!fir.box<!fir.array<?xi32>>, !fir.shapeshift<1>) -> !fir.box<!fir.array<?xi32>> - %8 = fir.alloca !fir.box<!fir.array<?xi32>> - fir.store %7 to %8 : !fir.ref<!fir.box<!fir.array<?xi32>>> - omp.yield(%8 : !fir.ref<!fir.box<!fir.array<?xi32>>>) + fir.store %7 to %arg1 : !fir.ref<!fir.box<!fir.array<?xi32>>> + omp.yield(%arg1 : !fir.ref<!fir.box<!fir.array<?xi32>>>) } copy { ^bb0(%arg0: !fir.ref<!fir.box<!fir.array<?xi32>>>, %arg1 : !fir.ref<!fir.box<!fir.array<?xi32>>>): %0 = fir.load %arg0 {test.ptr = "load_from_block_arg"} : !fir.ref<!fir.box<!fir.array<?xi32>>> diff --git flang/test/HLFIR/simplify-hlfir-intrinsics-reshape.fir flang/test/HLFIR/simplify-hlfir-intrinsics-reshape.fir new file mode 100644 index 000000000000..afbd3bcd6d98 --- /dev/null +++ flang/test/HLFIR/simplify-hlfir-intrinsics-reshape.fir @@ -0,0 +1,228 @@ +// Test hlfir.reshape simplification to hlfir.elemental: +// RUN: fir-opt --simplify-hlfir-intrinsics %s | FileCheck %s + +func.func @reshape_simple(%arg0: !fir.box<!fir.array<?xf32>>, %arg1: !fir.ref<!fir.array<1xi32>>) -> !hlfir.expr<?xf32> { + %res = hlfir.reshape %arg0 %arg1 : (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<1xi32>>) -> !hlfir.expr<?xf32> + return %res : !hlfir.expr<?xf32> +} +// CHECK-LABEL: func.func @reshape_simple( +// CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xf32>>, +// CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.array<1xi32>>) -> !hlfir.expr<?xf32> { +// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_3:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_4:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_2]]) : (!fir.ref<!fir.array<1xi32>>, index) -> !fir.ref<i32> +// CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_4]] : !fir.ref<i32> +// CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (i32) -> !fir.shape<1> +// CHECK: %[[VAL_7:.*]] = hlfir.elemental %[[VAL_6]] unordered : (!fir.shape<1>) -> !hlfir.expr<?xf32> { +// CHECK: ^bb0(%[[VAL_8:.*]]: index): +// CHECK: %[[VAL_9:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index) +// CHECK: %[[VAL_10:.*]] = arith.subi %[[VAL_9]]#0, %[[VAL_2]] overflow<nuw> : index +// CHECK: %[[VAL_11:.*]] = arith.addi %[[VAL_8]], %[[VAL_10]] overflow<nuw> : index +// CHECK: %[[VAL_12:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_11]]) : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32> +// CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_12]] : !fir.ref<f32> +// CHECK: hlfir.yield_element %[[VAL_13]] : f32 +// CHECK: } +// CHECK: return %[[VAL_7]] : !hlfir.expr<?xf32> +// CHECK: } + +func.func @reshape_with_pad(%arg0: !fir.box<!fir.array<?x?x?xf32>>, %arg1: !fir.ref<!fir.array<2xi32>>, %arg2: !fir.box<!fir.array<?x?x?xf32>>) -> 
!hlfir.expr<?x?xf32> { + %res = hlfir.reshape %arg0 %arg1 pad %arg2 : (!fir.box<!fir.array<?x?x?xf32>>, !fir.ref<!fir.array<2xi32>>, !fir.box<!fir.array<?x?x?xf32>>) -> !hlfir.expr<?x?xf32> + return %res : !hlfir.expr<?x?xf32> +} +// CHECK-LABEL: func.func @reshape_with_pad( +// CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?x?x?xf32>>, +// CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.array<2xi32>>, +// CHECK-SAME: %[[VAL_2:.*]]: !fir.box<!fir.array<?x?x?xf32>>) -> !hlfir.expr<?x?xf32> { +// CHECK: %[[VAL_3:.*]] = arith.constant 2 : index +// CHECK: %[[VAL_4:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_5:.*]] = arith.constant 0 : index +// CHECK: %[[ARRAY_DIM0:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_5]] : (!fir.box<!fir.array<?x?x?xf32>>, index) -> (index, index, index) +// CHECK: %[[ARRAY_DIM1:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_4]] : (!fir.box<!fir.array<?x?x?xf32>>, index) -> (index, index, index) +// CHECK: %[[ARRAY_DIM2:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box<!fir.array<?x?x?xf32>>, index) -> (index, index, index) +// CHECK: %[[VAL_9:.*]] = arith.muli %[[ARRAY_DIM0]]#1, %[[ARRAY_DIM1]]#1 overflow<nuw> : index +// CHECK: %[[ARRAY_SIZE:.*]] = arith.muli %[[VAL_9]], %[[ARRAY_DIM2]]#1 overflow<nuw> : index +// CHECK: %[[VAL_16:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_4]]) : (!fir.ref<!fir.array<2xi32>>, index) -> !fir.ref<i32> +// CHECK: %[[VAL_17:.*]] = fir.load %[[VAL_16]] : !fir.ref<i32> +// CHECK: %[[VAL_18:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_3]]) : (!fir.ref<!fir.array<2xi32>>, index) -> !fir.ref<i32> +// CHECK: %[[VAL_19:.*]] = fir.load %[[VAL_18]] : !fir.ref<i32> +// CHECK: %[[VAL_20:.*]] = fir.shape %[[VAL_17]], %[[VAL_19]] : (i32, i32) -> !fir.shape<2> +// CHECK: %[[VAL_21:.*]] = hlfir.elemental %[[VAL_20]] unordered : (!fir.shape<2>) -> !hlfir.expr<?x?xf32> { +// CHECK: ^bb0(%[[VAL_22:.*]]: index, %[[VAL_23:.*]]: index): +// CHECK: %[[VAL_24:.*]] = arith.subi %[[VAL_23]], %[[VAL_4]] overflow<nuw> : index +// CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_17]] : (i32) -> index +// CHECK: %[[VAL_26:.*]] = arith.muli %[[VAL_24]], %[[VAL_25]] overflow<nuw> : index +// CHECK: %[[VAL_27:.*]] = arith.subi %[[VAL_22]], %[[VAL_4]] overflow<nuw> : index +// CHECK: %[[LINEAR_INDEX:.*]] = arith.addi %[[VAL_26]], %[[VAL_27]] overflow<nuw> : index +// CHECK: %[[IS_WITHIN_ARRAY:.*]] = arith.cmpi ult, %[[LINEAR_INDEX]], %[[ARRAY_SIZE]] : index +// CHECK: %[[VAL_30:.*]] = fir.if %[[IS_WITHIN_ARRAY]] -> (f32) { +// CHECK: %[[VAL_31:.*]] = arith.remui %[[LINEAR_INDEX]], %[[ARRAY_DIM0]]#1 : index +// CHECK: %[[VAL_32:.*]] = arith.divui %[[LINEAR_INDEX]], %[[ARRAY_DIM0]]#1 : index +// CHECK: %[[ARRAY_IDX0:.*]] = arith.addi %[[VAL_31]], %[[VAL_4]] overflow<nuw> : index +// CHECK: %[[VAL_34:.*]] = arith.remui %[[VAL_32]], %[[ARRAY_DIM1]]#1 : index +// CHECK: %[[VAL_35:.*]] = arith.divui %[[VAL_32]], %[[ARRAY_DIM1]]#1 : index +// CHECK: %[[ARRAY_IDX1:.*]] = arith.addi %[[VAL_34]], %[[VAL_4]] overflow<nuw> : index +// CHECK: %[[ARRAY_IDX2:.*]] = arith.addi %[[VAL_35]], %[[VAL_4]] overflow<nuw> : index +// CHECK: %[[VAL_38:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_5]] : (!fir.box<!fir.array<?x?x?xf32>>, index) -> (index, index, index) +// CHECK: %[[VAL_39:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_4]] : (!fir.box<!fir.array<?x?x?xf32>>, index) -> (index, index, index) +// CHECK: %[[VAL_40:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box<!fir.array<?x?x?xf32>>, index) -> (index, index, index) +// CHECK: %[[VAL_41:.*]] = arith.subi %[[VAL_38]]#0, %[[VAL_4]] overflow<nuw> : 
index +// CHECK: %[[VAL_42:.*]] = arith.addi %[[ARRAY_IDX0]], %[[VAL_41]] overflow<nuw> : index +// CHECK: %[[VAL_43:.*]] = arith.subi %[[VAL_39]]#0, %[[VAL_4]] overflow<nuw> : index +// CHECK: %[[VAL_44:.*]] = arith.addi %[[ARRAY_IDX1]], %[[VAL_43]] overflow<nuw> : index +// CHECK: %[[VAL_45:.*]] = arith.subi %[[VAL_40]]#0, %[[VAL_4]] overflow<nuw> : index +// CHECK: %[[VAL_46:.*]] = arith.addi %[[ARRAY_IDX2]], %[[VAL_45]] overflow<nuw> : index +// CHECK: %[[VAL_47:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_42]], %[[VAL_44]], %[[VAL_46]]) : (!fir.box<!fir.array<?x?x?xf32>>, index, index, index) -> !fir.ref<f32> +// CHECK: %[[VAL_48:.*]] = fir.load %[[VAL_47]] : !fir.ref<f32> +// CHECK: fir.result %[[VAL_48]] : f32 +// CHECK: } else { +// CHECK: %[[PAD_DIM0:.*]]:3 = fir.box_dims %[[VAL_2]], %[[VAL_5]] : (!fir.box<!fir.array<?x?x?xf32>>, index) -> (index, index, index) +// CHECK: %[[PAD_DIM1:.*]]:3 = fir.box_dims %[[VAL_2]], %[[VAL_4]] : (!fir.box<!fir.array<?x?x?xf32>>, index) -> (index, index, index) +// CHECK: %[[PAD_DIM2:.*]]:3 = fir.box_dims %[[VAL_2]], %[[VAL_3]] : (!fir.box<!fir.array<?x?x?xf32>>, index) -> (index, index, index) +// CHECK: %[[PAD_LINEAR_INDEX:.*]] = arith.subi %[[LINEAR_INDEX]], %[[ARRAY_SIZE]] overflow<nuw> : index +// CHECK: %[[VAL_51:.*]] = arith.remui %[[PAD_LINEAR_INDEX]], %[[PAD_DIM0]]#1 : index +// CHECK: %[[VAL_52:.*]] = arith.divui %[[PAD_LINEAR_INDEX]], %[[PAD_DIM0]]#1 : index +// CHECK: %[[PAD_IDX0:.*]] = arith.addi %[[VAL_51]], %[[VAL_4]] overflow<nuw> : index +// CHECK: %[[VAL_54:.*]] = arith.remui %[[VAL_52]], %[[PAD_DIM1]]#1 : index +// CHECK: %[[VAL_55:.*]] = arith.divui %[[VAL_52]], %[[PAD_DIM1]]#1 : index +// CHECK: %[[PAD_IDX1:.*]] = arith.addi %[[VAL_54]], %[[VAL_4]] overflow<nuw> : index +// CHECK: %[[VAL_56:.*]] = arith.remui %[[VAL_55]], %[[PAD_DIM2]]#1 : index +// CHECK: %[[PAD_IDX2:.*]] = arith.addi %[[VAL_56]], %[[VAL_4]] overflow<nuw> : index +// CHECK: %[[VAL_58:.*]]:3 = fir.box_dims %[[VAL_2]], %[[VAL_5]] : (!fir.box<!fir.array<?x?x?xf32>>, index) -> (index, index, index) +// CHECK: %[[VAL_59:.*]]:3 = fir.box_dims %[[VAL_2]], %[[VAL_4]] : (!fir.box<!fir.array<?x?x?xf32>>, index) -> (index, index, index) +// CHECK: %[[VAL_60:.*]]:3 = fir.box_dims %[[VAL_2]], %[[VAL_3]] : (!fir.box<!fir.array<?x?x?xf32>>, index) -> (index, index, index) +// CHECK: %[[VAL_61:.*]] = arith.subi %[[VAL_58]]#0, %[[VAL_4]] overflow<nuw> : index +// CHECK: %[[VAL_62:.*]] = arith.addi %[[PAD_IDX0]], %[[VAL_61]] overflow<nuw> : index +// CHECK: %[[VAL_63:.*]] = arith.subi %[[VAL_59]]#0, %[[VAL_4]] overflow<nuw> : index +// CHECK: %[[VAL_64:.*]] = arith.addi %[[PAD_IDX1]], %[[VAL_63]] overflow<nuw> : index +// CHECK: %[[VAL_65:.*]] = arith.subi %[[VAL_60]]#0, %[[VAL_4]] overflow<nuw> : index +// CHECK: %[[VAL_66:.*]] = arith.addi %[[PAD_IDX2]], %[[VAL_65]] overflow<nuw> : index +// CHECK: %[[VAL_67:.*]] = hlfir.designate %[[VAL_2]] (%[[VAL_62]], %[[VAL_64]], %[[VAL_66]]) : (!fir.box<!fir.array<?x?x?xf32>>, index, index, index) -> !fir.ref<f32> +// CHECK: %[[VAL_68:.*]] = fir.load %[[VAL_67]] : !fir.ref<f32> +// CHECK: fir.result %[[VAL_68]] : f32 +// CHECK: } +// CHECK: hlfir.yield_element %[[VAL_30]] : f32 +// CHECK: } +// CHECK: return %[[VAL_21]] : !hlfir.expr<?x?xf32> +// CHECK: } + +func.func @reshape_derived_obj(%arg0: !fir.ref<!fir.array<10x!fir.type<whatever>>>, %arg1: !fir.ref<!fir.array<1xi32>>) -> !hlfir.expr<?x!fir.type<whatever>> { + %res = hlfir.reshape %arg0 %arg1 : (!fir.ref<!fir.array<10x!fir.type<whatever>>>, !fir.ref<!fir.array<1xi32>>) -> 
!hlfir.expr<?x!fir.type<whatever>> + return %res : !hlfir.expr<?x!fir.type<whatever>> +} +// CHECK-LABEL: func.func @reshape_derived_obj( +// CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.array<10x!fir.type<whatever>>>, +// CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.array<1xi32>>) -> !hlfir.expr<?x!fir.type<whatever>> { +// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_3:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_2]]) : (!fir.ref<!fir.array<1xi32>>, index) -> !fir.ref<i32> +// CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_3]] : !fir.ref<i32> +// CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (i32) -> !fir.shape<1> +// CHECK: %[[VAL_6:.*]] = hlfir.elemental %[[VAL_5]] unordered : (!fir.shape<1>) -> !hlfir.expr<?x!fir.type<whatever>> { +// CHECK: ^bb0(%[[VAL_7:.*]]: index): +// CHECK: %[[VAL_8:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_7]]) : (!fir.ref<!fir.array<10x!fir.type<whatever>>>, index) -> !fir.ref<!fir.type<whatever>> +// CHECK: hlfir.yield_element %[[VAL_8]] : !fir.ref<!fir.type<whatever>> +// CHECK: } +// CHECK: return %[[VAL_6]] : !hlfir.expr<?x!fir.type<whatever>> +// CHECK: } + +func.func @reshape_derived_expr(%arg0: !hlfir.expr<?x!fir.type<whatever>>, %arg1: !fir.ref<!fir.array<1xi32>>) -> !hlfir.expr<?x!fir.type<whatever>> { + %res = hlfir.reshape %arg0 %arg1 : (!hlfir.expr<?x!fir.type<whatever>>, !fir.ref<!fir.array<1xi32>>) -> !hlfir.expr<?x!fir.type<whatever>> + return %res : !hlfir.expr<?x!fir.type<whatever>> +} +// CHECK-LABEL: func.func @reshape_derived_expr( +// CHECK-SAME: %[[VAL_0:.*]]: !hlfir.expr<?x!fir.type<whatever>>, +// CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.array<1xi32>>) -> !hlfir.expr<?x!fir.type<whatever>> { +// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_3:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_2]]) : (!fir.ref<!fir.array<1xi32>>, index) -> !fir.ref<i32> +// CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_3]] : !fir.ref<i32> +// CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (i32) -> !fir.shape<1> +// CHECK: %[[VAL_6:.*]] = hlfir.elemental %[[VAL_5]] unordered : (!fir.shape<1>) -> !hlfir.expr<?x!fir.type<whatever>> { +// CHECK: ^bb0(%[[VAL_7:.*]]: index): +// CHECK: %[[VAL_8:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_7]] : (!hlfir.expr<?x!fir.type<whatever>>, index) -> !hlfir.expr<!fir.type<whatever>> +// CHECK: hlfir.yield_element %[[VAL_8]] : !hlfir.expr<!fir.type<whatever>> +// CHECK: } +// CHECK: return %[[VAL_6]] : !hlfir.expr<?x!fir.type<whatever>> +// CHECK: } + +func.func @reshape_poly_obj(%arg0: !fir.class<!fir.array<?x!fir.type<whatever>>>, %arg1: !fir.ref<!fir.array<1xi32>>) -> !hlfir.expr<?x!fir.type<whatever>?> { + %res = hlfir.reshape %arg0 %arg1 : (!fir.class<!fir.array<?x!fir.type<whatever>>>, !fir.ref<!fir.array<1xi32>>) -> !hlfir.expr<?x!fir.type<whatever>?> + return %res : !hlfir.expr<?x!fir.type<whatever>?> +} +// CHECK-LABEL: func.func @reshape_poly_obj( +// CHECK-SAME: %[[VAL_0:.*]]: !fir.class<!fir.array<?x!fir.type<whatever>>>, +// CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.array<1xi32>>) -> !hlfir.expr<?x!fir.type<whatever>?> { +// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_3:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_4:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_2]]) : (!fir.ref<!fir.array<1xi32>>, index) -> !fir.ref<i32> +// CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_4]] : !fir.ref<i32> +// CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (i32) -> !fir.shape<1> +// CHECK: %[[VAL_7:.*]] = hlfir.elemental %[[VAL_6]] mold %[[VAL_0]] unordered : (!fir.shape<1>, 
!fir.class<!fir.array<?x!fir.type<whatever>>>) -> !hlfir.expr<?x!fir.type<whatever>?> { +// CHECK: ^bb0(%[[VAL_8:.*]]: index): +// CHECK: %[[VAL_9:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.class<!fir.array<?x!fir.type<whatever>>>, index) -> (index, index, index) +// CHECK: %[[VAL_10:.*]] = arith.subi %[[VAL_9]]#0, %[[VAL_2]] overflow<nuw> : index +// CHECK: %[[VAL_11:.*]] = arith.addi %[[VAL_8]], %[[VAL_10]] overflow<nuw> : index +// CHECK: %[[VAL_12:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_11]]) : (!fir.class<!fir.array<?x!fir.type<whatever>>>, index) -> !fir.class<!fir.type<whatever>> +// CHECK: hlfir.yield_element %[[VAL_12]] : !fir.class<!fir.type<whatever>> +// CHECK: } +// CHECK: return %[[VAL_7]] : !hlfir.expr<?x!fir.type<whatever>?> +// CHECK: } + +func.func @reshape_poly_expr(%arg0: !hlfir.expr<?x!fir.type<whatever>?>, %arg1: !fir.ref<!fir.array<1xi32>>) -> !hlfir.expr<?x!fir.type<whatever>?> { + %res = hlfir.reshape %arg0 %arg1 : (!hlfir.expr<?x!fir.type<whatever>?>, !fir.ref<!fir.array<1xi32>>) -> !hlfir.expr<?x!fir.type<whatever>?> + return %res : !hlfir.expr<?x!fir.type<whatever>?> +} +// CHECK-LABEL: func.func @reshape_poly_expr( +// CHECK-SAME: %[[VAL_0:.*]]: !hlfir.expr<?x!fir.type<whatever>?>, +// CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.array<1xi32>>) -> !hlfir.expr<?x!fir.type<whatever>?> { +// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_3:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_2]]) : (!fir.ref<!fir.array<1xi32>>, index) -> !fir.ref<i32> +// CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_3]] : !fir.ref<i32> +// CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (i32) -> !fir.shape<1> +// CHECK: %[[VAL_6:.*]] = hlfir.elemental %[[VAL_5]] mold %[[VAL_0]] unordered : (!fir.shape<1>, !hlfir.expr<?x!fir.type<whatever>?>) -> !hlfir.expr<?x!fir.type<whatever>?> { +// CHECK: ^bb0(%[[VAL_7:.*]]: index): +// CHECK: %[[VAL_8:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_7]] : (!hlfir.expr<?x!fir.type<whatever>?>, index) -> !hlfir.expr<!fir.type<whatever>?> +// CHECK: hlfir.yield_element %[[VAL_8]] : !hlfir.expr<!fir.type<whatever>?> +// CHECK: } +// CHECK: return %[[VAL_6]] : !hlfir.expr<?x!fir.type<whatever>?> +// CHECK: } + +func.func @reshape_char(%arg0: !fir.box<!fir.array<?x!fir.char<2,?>>>, %arg1: !fir.ref<!fir.array<1xi32>>) -> !hlfir.expr<?x!fir.char<2,?>> { + %res = hlfir.reshape %arg0 %arg1 : (!fir.box<!fir.array<?x!fir.char<2,?>>>, !fir.ref<!fir.array<1xi32>>) -> !hlfir.expr<?x!fir.char<2,?>> + return %res : !hlfir.expr<?x!fir.char<2,?>> +} +// CHECK-LABEL: func.func @reshape_char( +// CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?x!fir.char<2,?>>>, +// CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.array<1xi32>>) -> !hlfir.expr<?x!fir.char<2,?>> { +// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_3:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_4:.*]] = arith.constant 2 : index +// CHECK: %[[VAL_5:.*]] = fir.box_elesize %[[VAL_0]] : (!fir.box<!fir.array<?x!fir.char<2,?>>>) -> index +// CHECK: %[[VAL_6:.*]] = arith.divsi %[[VAL_5]], %[[VAL_4]] : index +// CHECK: %[[VAL_7:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_2]]) : (!fir.ref<!fir.array<1xi32>>, index) -> !fir.ref<i32> +// CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32> +// CHECK: %[[VAL_9:.*]] = fir.shape %[[VAL_8]] : (i32) -> !fir.shape<1> +// CHECK: %[[VAL_10:.*]] = hlfir.elemental %[[VAL_9]] typeparams %[[VAL_6]] unordered : (!fir.shape<1>, index) -> !hlfir.expr<?x!fir.char<2,?>> { +// CHECK: ^bb0(%[[VAL_11:.*]]: index): +// CHECK: %[[VAL_12:.*]] = fir.box_elesize 
%[[VAL_0]] : (!fir.box<!fir.array<?x!fir.char<2,?>>>) -> index +// CHECK: %[[VAL_13:.*]] = arith.divsi %[[VAL_12]], %[[VAL_4]] : index +// CHECK: %[[VAL_14:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box<!fir.array<?x!fir.char<2,?>>>, index) -> (index, index, index) +// CHECK: %[[VAL_15:.*]] = arith.subi %[[VAL_14]]#0, %[[VAL_2]] overflow<nuw> : index +// CHECK: %[[VAL_16:.*]] = arith.addi %[[VAL_11]], %[[VAL_15]] overflow<nuw> : index +// CHECK: %[[VAL_17:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_16]]) typeparams %[[VAL_13]] : (!fir.box<!fir.array<?x!fir.char<2,?>>>, index, index) -> !fir.boxchar<2> +// CHECK: hlfir.yield_element %[[VAL_17]] : !fir.boxchar<2> +// CHECK: } +// CHECK: return %[[VAL_10]] : !hlfir.expr<?x!fir.char<2,?>> +// CHECK: } + +func.func @reshape_negative_result_array_have_different_types(%arg0: !fir.box<!fir.array<?x!fir.char<2,1>>>, %arg1: !fir.ref<!fir.array<1xi32>>) -> !hlfir.expr<?x!fir.char<2,2>> { + %res = hlfir.reshape %arg0 %arg1 : (!fir.box<!fir.array<?x!fir.char<2,1>>>, !fir.ref<!fir.array<1xi32>>) -> !hlfir.expr<?x!fir.char<2,2>> + return %res : !hlfir.expr<?x!fir.char<2,2>> +} +// CHECK-LABEL: func.func @reshape_negative_result_array_have_different_types( +// CHECK: hlfir.reshape %{{.*}} %{{.*}} : (!fir.box<!fir.array<?x!fir.char<2>>>, !fir.ref<!fir.array<1xi32>>) -> !hlfir.expr<?x!fir.char<2,2>> + +func.func @reshape_negative_array_pad_have_different_types(%arg0: !fir.box<!fir.array<?x!fir.char<2,2>>>, %arg1: !fir.ref<!fir.array<1xi32>>, %arg2: !fir.box<!fir.array<?x!fir.char<2,1>>>) -> !hlfir.expr<?x!fir.char<2,2>> { + %res = hlfir.reshape %arg0 %arg1 pad %arg2 : (!fir.box<!fir.array<?x!fir.char<2,2>>>, !fir.ref<!fir.array<1xi32>>, !fir.box<!fir.array<?x!fir.char<2,1>>>) -> !hlfir.expr<?x!fir.char<2,2>> + return %res : !hlfir.expr<?x!fir.char<2,2>> +} +// CHECK-LABEL: func.func @reshape_negative_array_pad_have_different_types( +// CHECK: hlfir.reshape %{{.*}} %{{.*}} pad %{{.*}} : (!fir.box<!fir.array<?x!fir.char<2,2>>>, !fir.ref<!fir.array<1xi32>>, !fir.box<!fir.array<?x!fir.char<2>>>) -> !hlfir.expr<?x!fir.char<2,2>> diff --git flang/test/Integration/OpenMP/copyprivate.f90 flang/test/Integration/OpenMP/copyprivate.f90 index 79003ebc3461..3bae003ea8d8 100644 --- flang/test/Integration/OpenMP/copyprivate.f90 +++ flang/test/Integration/OpenMP/copyprivate.f90 @@ -9,17 +9,17 @@ !RUN: %flang_fc1 -emit-llvm -fopenmp %s -o - | FileCheck %s !CHECK-DAG: define internal void @_copy_box_Uxi32(ptr captures(none) %{{.*}}, ptr captures(none) %{{.*}}) -!CHECK-DAG: define internal void @_copy_10xi32(ptr captures(none) %{{.*}}, ptr captures(none) %{{.*}}) +!CHECK-DAG: define internal void @_copy_box_10xi32(ptr captures(none) %{{.*}}, ptr captures(none) %{{.*}}) !CHECK-DAG: define internal void @_copy_i64(ptr captures(none) %{{.*}}, ptr captures(none) %{{.*}}) !CHECK-DAG: define internal void @_copy_box_Uxi64(ptr captures(none) %{{.*}}, ptr captures(none) %{{.*}}) !CHECK-DAG: define internal void @_copy_f32(ptr captures(none) %{{.*}}, ptr captures(none) %{{.*}}) -!CHECK-DAG: define internal void @_copy_2x3xf32(ptr captures(none) %{{.*}}, ptr captures(none) %{{.*}}) +!CHECK-DAG: define internal void @_copy_box_2x3xf32(ptr captures(none) %{{.*}}, ptr captures(none) %{{.*}}) !CHECK-DAG: define internal void @_copy_z32(ptr captures(none) %{{.*}}, ptr captures(none) %{{.*}}) -!CHECK-DAG: define internal void @_copy_10xz32(ptr captures(none) %{{.*}}, ptr captures(none) %{{.*}}) +!CHECK-DAG: define internal void @_copy_box_10xz32(ptr captures(none) %{{.*}}, ptr 
captures(none) %{{.*}}) !CHECK-DAG: define internal void @_copy_l32(ptr captures(none) %{{.*}}, ptr captures(none) %{{.*}}) -!CHECK-DAG: define internal void @_copy_5xl32(ptr captures(none) %{{.*}}, ptr captures(none) %{{.*}}) +!CHECK-DAG: define internal void @_copy_box_5xl32(ptr captures(none) %{{.*}}, ptr captures(none) %{{.*}}) !CHECK-DAG: define internal void @_copy_c8x8(ptr captures(none) %{{.*}}, ptr captures(none) %{{.*}}) -!CHECK-DAG: define internal void @_copy_10xc8x8(ptr captures(none) %{{.*}}, ptr captures(none) %{{.*}}) +!CHECK-DAG: define internal void @_copy_box_10xc8x8(ptr captures(none) %{{.*}}, ptr captures(none) %{{.*}}) !CHECK-DAG: define internal void @_copy_c16x5(ptr captures(none) %{{.*}}, ptr captures(none) %{{.*}}) !CHECK-DAG: define internal void @_copy_rec__QFtest_typesTdt(ptr captures(none) %{{.*}}, ptr captures(none) %{{.*}}) !CHECK-DAG: define internal void @_copy_box_heap_Uxi32(ptr captures(none) %{{.*}}, ptr captures(none) %{{.*}}) @@ -33,8 +33,12 @@ !CHECK-NEXT: } !CHECK-LABEL: define internal void @test_scalar_..omp_par({{.*}}) -!CHECK: %[[I:.*]] = alloca i32, i64 1 -!CHECK: %[[J:.*]] = alloca i32, i64 1 +!CHECK-NEXT: omp.par.entry: +!CHECK: %[[TID_ADDR:.*]] = alloca i32, align 4 +!CHECK: %[[I:.*]] = alloca i32, align 4 +!CHECK: %[[J:.*]] = alloca i32, align 4 +!CHECK: br label %[[OMP_REDUCTION_INIT:.*]] + !CHECK: %[[DID_IT:.*]] = alloca i32 !CHECK: store i32 0, ptr %[[DID_IT]] !CHECK: %[[THREAD_NUM1:.*]] = call i32 @__kmpc_global_thread_num(ptr @[[LOC:.*]]) diff --git flang/test/Integration/OpenMP/parallel-private-reduction-worstcase.f90 flang/test/Integration/OpenMP/parallel-private-reduction-worstcase.f90 index 0173847b7323..6facce56123a 100644 --- flang/test/Integration/OpenMP/parallel-private-reduction-worstcase.f90 +++ flang/test/Integration/OpenMP/parallel-private-reduction-worstcase.f90 @@ -32,69 +32,69 @@ end subroutine ! CHECK-LABEL: define internal void @worst_case_..omp_par ! CHECK-NEXT: omp.par.entry: ! [reduction alloc regions inlined here] -! CHECK: br label %omp.private.latealloc +! CHECK: br label %omp.private.init -! CHECK: omp.private.latealloc: ; preds = %omp.par.entry -! CHECK-NEXT: br label %omp.private.alloc5 +! CHECK: omp.private.init: ; preds = %omp.par.entry +! CHECK-NEXT: br label %omp.private.init7 -! CHECK: omp.private.alloc5: ; preds = %omp.private.latealloc +! CHECK: omp.private.init7: ; preds = %omp.private.init ! [begin private alloc for first var] ! [read the length from the mold argument] ! [if it is non-zero...] -! CHECK: br i1 {{.*}}, label %omp.private.alloc6, label %omp.private.alloc7 +! CHECK: br i1 {{.*}}, label %omp.private.init8, label %omp.private.init9 -! CHECK: omp.private.alloc7: ; preds = %omp.private.alloc5 +! CHECK: omp.private.init9: ; preds = %omp.private.init7 ! [finish private alloc for first var with zero extent] -! CHECK: br label %omp.private.alloc8 +! CHECK: br label %omp.private.init10 -! CHECK: omp.private.alloc8: ; preds = %omp.private.alloc6, %omp.private.alloc7 -! CHECK-NEXT: br label %omp.region.cont4 +! CHECK: omp.private.init10: ; preds = %omp.private.init8, %omp.private.init9 +! CHECK-NEXT: br label %omp.region.cont6 -! CHECK: omp.region.cont4: ; preds = %omp.private.alloc8 +! CHECK: omp.region.cont6: ; preds = %omp.private.init10 ! CHECK-NEXT: %{{.*}} = phi ptr -! CHECK-NEXT: br label %omp.private.alloc +! CHECK-NEXT: br label %omp.private.init1 -! CHECK: omp.private.alloc: ; preds = %omp.region.cont4 +! CHECK: omp.private.init1: ; preds = %omp.region.cont6 ! 
[begin private alloc for first var] ! [read the length from the mold argument] ! [if it is non-zero...] -! CHECK: br i1 %{{.*}}, label %omp.private.alloc1, label %omp.private.alloc2 +! CHECK: br i1 %{{.*}}, label %omp.private.init2, label %omp.private.init3 -! CHECK: omp.private.alloc2: ; preds = %omp.private.alloc +! CHECK: omp.private.init3: ; preds = %omp.private.init1 ! [finish private alloc for second var with zero extent] -! CHECK: br label %omp.private.alloc3 +! CHECK: br label %omp.private.init4 -! CHECK: omp.private.alloc3: ; preds = %omp.private.alloc1, %omp.private.alloc2 +! CHECK: omp.private.init4: ; preds = %omp.private.init2, %omp.private.init3 ! CHECK-NEXT: br label %omp.region.cont -! CHECK: omp.region.cont: ; preds = %omp.private.alloc3 +! CHECK: omp.region.cont: ; preds = %omp.private.init4 ! CHECK-NEXT: %{{.*}} = phi ptr ! CHECK-NEXT: br label %omp.private.copy ! CHECK: omp.private.copy: ; preds = %omp.region.cont -! CHECK-NEXT: br label %omp.private.copy10 +! CHECK-NEXT: br label %omp.private.copy12 -! CHECK: omp.private.copy10: ; preds = %omp.private.copy +! CHECK: omp.private.copy12: ; preds = %omp.private.copy ! [begin firstprivate copy for first var] ! [read the length, is it non-zero?] -! CHECK: br i1 %{{.*}}, label %omp.private.copy11, label %omp.private.copy12 +! CHECK: br i1 %{{.*}}, label %omp.private.copy13, label %omp.private.copy14 -! CHECK: omp.private.copy12: ; preds = %omp.private.copy11, %omp.private.copy10 -! CHECK-NEXT: br label %omp.region.cont9 +! CHECK: omp.private.copy14: ; preds = %omp.private.copy13, %omp.private.copy12 +! CHECK-NEXT: br label %omp.region.cont11 -! CHECK: omp.region.cont9: ; preds = %omp.private.copy12 +! CHECK: omp.region.cont11: ; preds = %omp.private.copy14 ! CHECK-NEXT: %{{.*}} = phi ptr -! CHECK-NEXT: br label %omp.private.copy14 +! CHECK-NEXT: br label %omp.private.copy16 -! CHECK: omp.private.copy14: ; preds = %omp.region.cont9 +! CHECK: omp.private.copy16: ; preds = %omp.region.cont11 ! [begin firstprivate copy for second var] ! [read the length, is it non-zero?] -! CHECK: br i1 %{{.*}}, label %omp.private.copy15, label %omp.private.copy16 +! CHECK: br i1 %{{.*}}, label %omp.private.copy17, label %omp.private.copy18 -! CHECK: omp.private.copy16: ; preds = %omp.private.copy15, %omp.private.copy14 -! CHECK-NEXT: br label %omp.region.cont13 +! CHECK: omp.private.copy18: ; preds = %omp.private.copy17, %omp.private.copy16 +! CHECK-NEXT: br label %omp.region.cont15 -! CHECK: omp.region.cont13: ; preds = %omp.private.copy16 +! CHECK: omp.region.cont15: ; preds = %omp.private.copy18 ! CHECK-NEXT: %{{.*}} = phi ptr ! CHECK-NEXT: br label %omp.region.after_alloca @@ -111,44 +111,44 @@ end subroutine ! CHECK: omp.reduction.neutral: ; preds = %omp.reduction.init ! [start of reduction initialization region] ! [null check:] -! CHECK: br i1 %{{.*}}, label %omp.reduction.neutral18, label %omp.reduction.neutral19 +! CHECK: br i1 %{{.*}}, label %omp.reduction.neutral20, label %omp.reduction.neutral21 -! CHECK: omp.reduction.neutral19: ; preds = %omp.reduction.neutral +! CHECK: omp.reduction.neutral21: ; preds = %omp.reduction.neutral ! [malloc and assign the default value to the reduction variable] -! CHECK: br label %omp.reduction.neutral20 +! CHECK: br label %omp.reduction.neutral22 -! CHECK: omp.reduction.neutral20: ; preds = %omp.reduction.neutral18, %omp.reduction.neutral19 -! CHECK-NEXT: br label %omp.region.cont17 +! CHECK: omp.reduction.neutral22: ; preds = %omp.reduction.neutral20, %omp.reduction.neutral21 +! 
CHECK-NEXT: br label %omp.region.cont19 -! CHECK: omp.region.cont17: ; preds = %omp.reduction.neutral20 +! CHECK: omp.region.cont19: ; preds = %omp.reduction.neutral22 ! CHECK-NEXT: %{{.*}} = phi ptr -! CHECK-NEXT: br label %omp.reduction.neutral22 +! CHECK-NEXT: br label %omp.reduction.neutral24 -! CHECK: omp.reduction.neutral22: ; preds = %omp.region.cont17 +! CHECK: omp.reduction.neutral24: ; preds = %omp.region.cont19 ! [start of reduction initialization region] ! [null check:] -! CHECK: br i1 %{{.*}}, label %omp.reduction.neutral23, label %omp.reduction.neutral24 +! CHECK: br i1 %{{.*}}, label %omp.reduction.neutral25, label %omp.reduction.neutral26 -! CHECK: omp.reduction.neutral24: ; preds = %omp.reduction.neutral22 +! CHECK: omp.reduction.neutral26: ; preds = %omp.reduction.neutral24 ! [malloc and assign the default value to the reduction variable] -! CHECK: br label %omp.reduction.neutral25 +! CHECK: br label %omp.reduction.neutral27 -! CHECK: omp.reduction.neutral25: ; preds = %omp.reduction.neutral23, %omp.reduction.neutral24 -! CHECK-NEXT: br label %omp.region.cont21 +! CHECK: omp.reduction.neutral27: ; preds = %omp.reduction.neutral25, %omp.reduction.neutral26 +! CHECK-NEXT: br label %omp.region.cont23 -! CHECK: omp.region.cont21: ; preds = %omp.reduction.neutral25 +! CHECK: omp.region.cont23: ; preds = %omp.reduction.neutral27 ! CHECK-NEXT: %{{.*}} = phi ptr -! CHECK-NEXT: br label %omp.par.region27 +! CHECK-NEXT: br label %omp.par.region29 -! CHECK: omp.par.region27: ; preds = %omp.region.cont21 +! CHECK: omp.par.region29: ; preds = %omp.region.cont23 ! [call SUM runtime function] ! [if (sum(a) == 1)] -! CHECK: br i1 %{{.*}}, label %omp.par.region28, label %omp.par.region29 +! CHECK: br i1 %{{.*}}, label %omp.par.region30, label %omp.par.region31 -! CHECK: omp.par.region29: ; preds = %omp.par.region27 -! CHECK-NEXT: br label %omp.region.cont26 +! CHECK: omp.par.region31: ; preds = %omp.par.region29 +! CHECK-NEXT: br label %omp.region.cont28 -! CHECK: omp.region.cont26: ; preds = %omp.par.region28, %omp.par.region29 +! CHECK: omp.region.cont28: ; preds = %omp.par.region30, %omp.par.region31 ! [omp parallel region done, call into the runtime to complete reduction] ! CHECK: %[[VAL_233:.*]] = call i32 @__kmpc_reduce( ! CHECK: switch i32 %[[VAL_233]], label %reduce.finalize [ @@ -156,16 +156,16 @@ end subroutine ! CHECK-NEXT: i32 2, label %reduce.switch.atomic ! CHECK-NEXT: ] -! CHECK: reduce.switch.atomic: ; preds = %omp.region.cont26 +! CHECK: reduce.switch.atomic: ; preds = %omp.region.cont28 ! CHECK-NEXT: unreachable -! CHECK: reduce.switch.nonatomic: ; preds = %omp.region.cont26 +! CHECK: reduce.switch.nonatomic: ; preds = %omp.region.cont28 ! CHECK-NEXT: %[[red_private_value_0:.*]] = load ptr, ptr %{{.*}}, align 8 ! CHECK-NEXT: br label %omp.reduction.nonatomic.body ! [various blocks implementing the reduction] -! CHECK: omp.region.cont35: ; preds = +! CHECK: omp.region.cont37: ; preds = ! CHECK-NEXT: %{{.*}} = phi ptr ! CHECK-NEXT: call void @__kmpc_end_reduce( ! CHECK-NEXT: br label %reduce.finalize @@ -179,87 +179,45 @@ end subroutine ! CHECK: omp.reduction.cleanup: ; preds = %omp.par.pre_finalize ! [null check] -! CHECK: br i1 %{{.*}}, label %omp.reduction.cleanup41, label %omp.reduction.cleanup42 +! CHECK: br i1 %{{.*}}, label %omp.reduction.cleanup43, label %omp.reduction.cleanup44 -! CHECK: omp.reduction.cleanup42: ; preds = %omp.reduction.cleanup41, %omp.reduction.cleanup -! CHECK-NEXT: br label %omp.region.cont40 +! 
CHECK: omp.reduction.cleanup44: ; preds = %omp.reduction.cleanup43, %omp.reduction.cleanup +! CHECK-NEXT: br label %omp.region.cont42 -! CHECK: omp.region.cont40: ; preds = %omp.reduction.cleanup42 +! CHECK: omp.region.cont42: ; preds = %omp.reduction.cleanup44 ! CHECK-NEXT: %{{.*}} = load ptr, ptr -! CHECK-NEXT: br label %omp.reduction.cleanup44 - -! CHECK: omp.reduction.cleanup44: ; preds = %omp.region.cont40 -! [null check] -! CHECK: br i1 %{{.*}}, label %omp.reduction.cleanup45, label %omp.reduction.cleanup46 - -! CHECK: omp.reduction.cleanup46: ; preds = %omp.reduction.cleanup45, %omp.reduction.cleanup44 -! CHECK-NEXT: br label %omp.region.cont43 - -! CHECK: omp.region.cont43: ; preds = %omp.reduction.cleanup46 -! CHECK-NEXT: br label %omp.private.dealloc - -! CHECK: omp.private.dealloc: ; preds = %omp.region.cont43 -! [null check] -! CHECK: br i1 %{{.*}}, label %omp.private.dealloc48, label %omp.private.dealloc49 - -! CHECK: omp.private.dealloc49: ; preds = %omp.private.dealloc48, %omp.private.dealloc -! CHECK-NEXT: br label %omp.region.cont47 - -! CHECK: omp.region.cont47: ; preds = %omp.private.dealloc49 -! CHECK-NEXT: br label %omp.private.dealloc51 - -! CHECK: omp.private.dealloc51: ; preds = %omp.region.cont47 -! [null check] -! CHECK: br i1 %{{.*}}, label %omp.private.dealloc52, label %omp.private.dealloc53 - -! CHECK: omp.private.dealloc53: ; preds = %omp.private.dealloc52, %omp.private.dealloc51 -! CHECK-NEXT: br label %omp.region.cont50 - -! CHECK: omp.region.cont50: ; preds = %omp.private.dealloc53 -! CHECK-NEXT: br label %omp.par.outlined.exit.exitStub - -! CHECK: omp.private.dealloc52: ; preds = %omp.private.dealloc51 -! [dealloc memory] -! CHECK: br label %omp.private.dealloc53 - -! CHECK: omp.private.dealloc48: ; preds = %omp.private.dealloc -! [dealloc memory] -! CHECK: br label %omp.private.dealloc49 - -! CHECK: omp.reduction.cleanup45: ; preds = %omp.reduction.cleanup44 -! CHECK-NEXT: call void @free( ! CHECK-NEXT: br label %omp.reduction.cleanup46 -! CHECK: omp.reduction.cleanup41: ; preds = %omp.reduction.cleanup -! CHECK-NEXT: call void @free( -! CHECK-NEXT: br label %omp.reduction.cleanup42 +! CHECK: omp.reduction.cleanup46: ; preds = %omp.region.cont42 +! [null check] +! CHECK: br i1 %{{.*}}, label %omp.reduction.cleanup47, label %omp.reduction.cleanup48 -! CHECK: omp.par.region28: ; preds = %omp.par.region27 +! CHECK: omp.par.region30: ; preds = %omp.par.region29 ! CHECK-NEXT: call void @_FortranAStopStatement -! CHECK: omp.reduction.neutral23: ; preds = %omp.reduction.neutral22 +! CHECK: omp.reduction.neutral25: ; preds = %omp.reduction.neutral24 ! [source length was zero: finish initializing array] -! CHECK: br label %omp.reduction.neutral25 +! CHECK: br label %omp.reduction.neutral27 -! CHECK: omp.reduction.neutral18: ; preds = %omp.reduction.neutral +! CHECK: omp.reduction.neutral20: ; preds = %omp.reduction.neutral ! [source length was zero: finish initializing array] -! CHECK: br label %omp.reduction.neutral20 +! CHECK: br label %omp.reduction.neutral22 -! CHECK: omp.private.copy15: ; preds = %omp.private.copy14 +! CHECK: omp.private.copy17: ; preds = %omp.private.copy16 ! [source length was non-zero: call assign runtime] -! CHECK: br label %omp.private.copy16 +! CHECK: br label %omp.private.copy18 -! CHECK: omp.private.copy11: ; preds = %omp.private.copy10 +! CHECK: omp.private.copy13: ; preds = %omp.private.copy12 ! [source length was non-zero: call assign runtime] -! CHECK: br label %omp.private.copy12 +! CHECK: br label %omp.private.copy14 -! 
CHECK: omp.private.alloc1: ; preds = %omp.private.alloc +! CHECK: omp.private.init2: ; preds = %omp.private.init1 ! [var extent was non-zero: malloc a private array] -! CHECK: br label %omp.private.alloc3 +! CHECK: br label %omp.private.init4 -! CHECK: omp.private.alloc6: ; preds = %omp.private.alloc5 +! CHECK: omp.private.init8: ; preds = %omp.private.init7 ! [var extent was non-zero: malloc a private array] -! CHECK: br label %omp.private.alloc8 +! CHECK: br label %omp.private.init10 -! CHECK: omp.par.outlined.exit.exitStub: ; preds = %omp.region.cont50 +! CHECK: omp.par.outlined.exit.exitStub: ; preds = %omp.region.cont52 ! CHECK-NEXT: ret void diff --git flang/test/Integration/OpenMP/private-global.f90 flang/test/Integration/OpenMP/private-global.f90 index 07dbe86e5ec9..39d7e2274cff 100644 --- flang/test/Integration/OpenMP/private-global.f90 +++ flang/test/Integration/OpenMP/private-global.f90 @@ -21,20 +21,24 @@ End Program ! CHECK: %[[VAL_10:.*]] = load i32, ptr %[[VAL_11:.*]], align 4 ! CHECK: store i32 %[[VAL_10]], ptr %[[VAL_9]], align 4 ! CHECK: %[[VAL_12:.*]] = load i32, ptr %[[VAL_9]], align 4 -! CHECK: %[[PRIV_TABLE:.*]] = alloca [10 x i32], i64 1, align 4 +! CHECK: %[[PRIV_BOX_ALLOC:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, align 8 ! ... ! check that we use the private copy of table for the assignment ! CHECK: omp.par.region1: ! CHECK: %[[ELEMENTAL_TMP:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, align 8 ! CHECK: %[[TABLE_BOX_ADDR:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, align 8 ! CHECK: %[[BOXED_FIFTY:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8 +! CHECK: %[[FIFTY:.*]] = alloca i32, i64 1, align 4 +! CHECK: %[[INTERMEDIATE:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, align 8 ! CHECK: %[[TABLE_BOX_ADDR2:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, i64 1, align 8 -! CHECK: %[[TABLE_BOX_VAL:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } { ptr undef, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64), i32 20240719, i8 1, i8 9, i8 0, i8 0, [1 x [3 x i64]] {{\[\[}}3 x i64] [i64 1, i64 10, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64)]] }, ptr %[[PRIV_TABLE]], 0 -! CHECK: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[TABLE_BOX_VAL]], ptr %[[TABLE_BOX_ADDR]], align 8 -! CHECK : %[[TABLE_BOX_VAL2:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[TABLE_BOX_ADDR]], align 8 -! CHECK : store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[TABLE_BOX_VAL2]], ptr %[[TABLE_BOX_ADDR2]], align 8 -! CHECK: call void @llvm.memcpy.p0.p0.i32(ptr %[[TABLE_BOX_ADDR2]], ptr %[[TABLE_BOX_ADDR]], i32 48, i1 false) +! CHECK: call void @llvm.memcpy.p0.p0.i32(ptr %[[INTERMEDIATE]], ptr %[[PRIV_BOX_ALLOC]], i32 48, i1 false) +! CHECK: store i32 50, ptr %[[FIFTY]], align 4 +! CHECK: %[[FIFTY_BOX_VAL:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8 } { ptr undef, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64), i32 20240719, i8 0, i8 9, i8 0, i8 0 }, ptr %[[FIFTY]], 0 +! CHECK: store { ptr, i64, i32, i8, i8, i8, i8 } %[[FIFTY_BOX_VAL]], ptr %[[BOXED_FIFTY]], align 8 +! CHECK: call void @llvm.memcpy.p0.p0.i32(ptr %[[TABLE_BOX_ADDR2]], ptr %[[INTERMEDIATE]], i32 48, i1 false) ! CHECK: call void @_FortranAAssign(ptr %[[TABLE_BOX_ADDR2]], ptr %[[BOXED_FIFTY]], ptr @{{.*}}, i32 9) +! 
CHECK: call void @llvm.memcpy.p0.p0.i32(ptr %[[TABLE_BOX_ADDR]], ptr %[[PRIV_BOX_ALLOC]], i32 48, i1 false) +! CHECK: %[[PRIV_TABLE:.*]] = call ptr @malloc(i64 ptrtoint (ptr getelementptr ([10 x i32], ptr null, i32 1) to i64)) ! ... ! check that we use the private copy of table for table/=50 ! CHECK: omp.par.region3: @@ -43,5 +47,3 @@ End Program ! CHECK: %[[VAL_46:.*]] = mul nsw i64 %[[VAL_45]], 1 ! CHECK: %[[VAL_47:.*]] = add nsw i64 %[[VAL_46]], 0 ! CHECK: %[[VAL_48:.*]] = getelementptr i32, ptr %[[PRIV_TABLE]], i64 %[[VAL_47]] -! CHECK: %[[VAL_49:.*]] = load i32, ptr %[[VAL_48]], align 4 -! CHECK: %[[VAL_50:.*]] = icmp ne i32 %[[VAL_49]], 50 diff --git flang/test/Lower/HLFIR/convert-variable.f90 flang/test/Lower/HLFIR/convert-variable.f90 index 7acb1be578b9..07b91d0f34a0 100644 --- flang/test/Lower/HLFIR/convert-variable.f90 +++ flang/test/Lower/HLFIR/convert-variable.f90 @@ -1,5 +1,5 @@ ! Test lowering of variables to fir.declare -! RUN: bbc -emit-hlfir %s -o - | FileCheck %s +! RUN: bbc -emit-hlfir %s -o - | FileCheck %s --check-prefixes=CHECK,%if flang-supports-f128-math %{F128%} %else %{F64%} subroutine scalar_numeric(x) integer :: x @@ -68,13 +68,16 @@ end subroutine ! CHECK: %[[VAL_1:.*]] = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in, optional, target>, uniq_name = "_QFscalar_numeric_attributesEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) subroutine scalar_numeric_attributes_2(x) - real(16), value :: x(100) + integer, parameter :: rk = merge(16, 8, selected_real_kind(33, 4931)==16) + real(rk), value :: x(100) end subroutine ! CHECK-LABEL: func.func @_QPscalar_numeric_attributes_2( -! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.array<100xf128>> +! F128-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.array<100xf128>> +! F64-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.array<100xf64>> ! CHECK: %[[VAL_1:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_2:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_3:.*]] = hlfir.declare %[[VAL_0]](%[[VAL_2]]) dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFscalar_numeric_attributes_2Ex"} : (!fir.ref<!fir.array<100xf128>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xf128>>, !fir.ref<!fir.array<100xf128>>) +! F128: %[[VAL_3:.*]] = hlfir.declare %[[VAL_0]](%[[VAL_2]]) dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFscalar_numeric_attributes_2Ex"} : (!fir.ref<!fir.array<100xf128>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xf128>>, !fir.ref<!fir.array<100xf128>>) +! F64: %[[VAL_3:.*]] = hlfir.declare %[[VAL_0]](%[[VAL_2]]) dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFscalar_numeric_attributes_2Ex"} : (!fir.ref<!fir.array<100xf64>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xf64>>, !fir.ref<!fir.array<100xf64>>) subroutine scalar_numeric_attributes_3(x) real, intent(in) :: x diff --git flang/test/Lower/Intrinsics/abs.f90 flang/test/Lower/Intrinsics/abs.f90 index e5e4b79e9f79..7150cb2d352f 100644 --- flang/test/Lower/Intrinsics/abs.f90 +++ flang/test/Lower/Intrinsics/abs.f90 @@ -1,7 +1,7 @@ -! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s --check-prefixes="CHECK,CMPLX,CMPLX-PRECISE" +! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s --check-prefixes=CHECK,CMPLX,CMPLX-PRECISE,%if flang-supports-f128-math %{F128%} %else %{F64%} ! 
RUN: bbc -emit-fir -hlfir=false --math-runtime=precise %s -o - | FileCheck %s --check-prefixes="CMPLX,CMPLX-PRECISE" ! RUN: bbc --force-mlir-complex -emit-fir -hlfir=false %s -o - | FileCheck %s --check-prefixes="CMPLX,CMPLX-FAST" -! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir %s -o - | FileCheck %s --check-prefixes="CHECK,CMPLX,CMPLX-PRECISE" +! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir %s -o - | FileCheck %s --check-prefixes=CHECK,CMPLX,CMPLX-PRECISE,%if flang-supports-f128-math %{F128%} %else %{F64%} ! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -mllvm --math-runtime=precise %s -o - | FileCheck %s --check-prefixes="CMPLX,CMPLX-PRECISE" ! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -mllvm --force-mlir-complex %s -o - | FileCheck %s --check-prefixes="CMPLX,CMPLX-FAST" ! RUN: %flang_fc1 -fapprox-func -emit-fir -flang-deprecated-no-hlfir %s -o - | FileCheck %s --check-prefixes="CMPLX,CMPLX-APPROX" @@ -85,13 +85,18 @@ subroutine abs_testd(a, b) end subroutine ! CHECK-LABEL: func @_QPabs_testr16( -! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<f128>{{.*}}, %[[VAL_1:.*]]: !fir.ref<f128>{{.*}}) { +! F128-SAME: %[[VAL_0:.*]]: !fir.ref<f128>{{.*}}, %[[VAL_1:.*]]: !fir.ref<f128>{{.*}}) { +! F64-SAME: %[[VAL_0:.*]]: !fir.ref<f64>{{.*}}, %[[VAL_1:.*]]: !fir.ref<f64>{{.*}}) { subroutine abs_testr16(a, b) -! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref<f128> -! CHECK: %[[VAL_3:.*]] = math.absf %[[VAL_2]] {{.*}}: f128 -! CHECK: fir.store %[[VAL_3]] to %[[VAL_1]] : !fir.ref<f128> +! F128: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref<f128> +! F64: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref<f64> +! F128: %[[VAL_3:.*]] = math.absf %[[VAL_2]] {{.*}}: f128 +! F64: %[[VAL_3:.*]] = math.absf %[[VAL_2]] {{.*}}: f64 +! F128: fir.store %[[VAL_3]] to %[[VAL_1]] : !fir.ref<f128> +! F64: fir.store %[[VAL_3]] to %[[VAL_1]] : !fir.ref<f64> ! CHECK: return - real(kind=16) :: a, b + integer, parameter :: rk = merge(16, 8, selected_real_kind(33, 4931)==16) + real(kind=rk) :: a, b b = abs(a) end subroutine diff --git flang/test/Lower/Intrinsics/aint.f90 flang/test/Lower/Intrinsics/aint.f90 index fb459953a06c..b82b63abbf61 100644 --- flang/test/Lower/Intrinsics/aint.f90 +++ flang/test/Lower/Intrinsics/aint.f90 @@ -1,41 +1,31 @@ -! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s +! RUN: bbc -emit-hlfir %s -o - | FileCheck %s --check-prefixes=CHECK%if target=x86_64{{.*}} %{,CHECK-KIND10%} ! CHECK-LABEL: func @_QPaint_test( -! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<f32>{{.*}}, %[[VAL_1:.*]]: !fir.ref<f32>{{.*}}) { +! CHECK-SAME: %[[VAL_0_b:.*]]: !fir.ref<f32>{{.*}}, %[[VAL_1_b:.*]]: !fir.ref<f32>{{.*}}) { subroutine aint_test(a, b) -! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref<f32> +! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %[[VAL_0_b]] +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_1_b]] +! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]]#0 : !fir.ref<f32> ! CHECK: %[[VAL_3:.*]] = fir.call @llvm.trunc.f32(%[[VAL_2]]) {{.*}}: (f32) -> f32 -! CHECK: fir.store %[[VAL_3]] to %[[VAL_1]] : !fir.ref<f32> +! CHECK: hlfir.assign %[[VAL_3]] to %[[VAL_1]]#0 : f32, !fir.ref<f32> ! CHECK: return real :: a, b b = aint(a) end subroutine ! CHECK-LABEL: func.func @_QPaint_test_real8( -! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<f64> {fir.bindc_name = "a"}, -! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<f64> {fir.bindc_name = "b"}) { -! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref<f64> -! CHECK: %[[VAL_3:.*]] = fir.call @llvm.trunc.f64(%[[VAL_2]]) {{.*}}: (f64) -> f64 -! 
CHECK: fir.store %[[VAL_3]] to %[[VAL_1]] : !fir.ref<f64> -! CHECK: return -! CHECK: } +! CHECK: fir.call @llvm.trunc.f64({{.*}}) {{.*}}: (f64) -> f64 subroutine aint_test_real8(a, b) real(8) :: a, b b = aint(a) end subroutine -! CHECK-LABEL: func.func @_QPaint_test_real10( -! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<f80> {fir.bindc_name = "a"}, -! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<f80> {fir.bindc_name = "b"}) { -! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref<f80> -! CHECK: %[[VAL_3:.*]] = fir.call @llvm.trunc.f80(%[[VAL_2]]) {{.*}}: (f80) -> f80 -! CHECK: fir.store %[[VAL_3]] to %[[VAL_1]] : !fir.ref<f80> -! CHECK: return -! CHECK: } - +! CHECK-KIND10-LABEL: func.func @_QPaint_test_real10( +! CHECK-KIND10: fir.call @llvm.trunc.f80({{.*}}) {{.*}}: (f80) -> f80 subroutine aint_test_real10(a, b) - real(10) :: a, b + integer, parameter :: kind10 = merge(10, 4, selected_real_kind(p=18).eq.10) + real(kind10) :: a, b b = aint(a) end subroutine diff --git flang/test/Lower/Intrinsics/anint.f90 flang/test/Lower/Intrinsics/anint.f90 index 4148d18f15b3..a7b24648ca0b 100644 --- flang/test/Lower/Intrinsics/anint.f90 +++ flang/test/Lower/Intrinsics/anint.f90 @@ -1,11 +1,13 @@ -! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s +! RUN: bbc -emit-hlfir %s -o - | FileCheck %s --check-prefixes=CHECK%if target=x86_64{{.*}} %{,CHECK-KIND10%} ! CHECK-LABEL: func.func @_QPanint_test( -! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<f32> {fir.bindc_name = "a"}, -! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<f32> {fir.bindc_name = "b"}) { -! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref<f32> +! CHECK-SAME: %[[VAL_0_b:.*]]: !fir.ref<f32> {fir.bindc_name = "a"}, +! CHECK-SAME: %[[VAL_1_b:.*]]: !fir.ref<f32> {fir.bindc_name = "b"}) { +! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %[[VAL_0_b]] +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_1_b]] +! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]]#0 : !fir.ref<f32> ! CHECK: %[[VAL_3:.*]] = llvm.intr.round(%[[VAL_2]]) : (f32) -> f32 -! CHECK: fir.store %[[VAL_3]] to %[[VAL_1]] : !fir.ref<f32> +! CHECK: hlfir.assign %[[VAL_3]] to %[[VAL_1]]#0 : f32, !fir.ref<f32> ! CHECK: return ! CHECK: } @@ -15,30 +17,19 @@ subroutine anint_test(a, b) end subroutine ! CHECK-LABEL: func.func @_QPanint_test_real8( -! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<f64> {fir.bindc_name = "a"}, -! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<f64> {fir.bindc_name = "b"}) { -! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref<f64> -! CHECK: %[[VAL_3:.*]] = llvm.intr.round(%[[VAL_2]]) : (f64) -> f64 -! CHECK: fir.store %[[VAL_3]] to %[[VAL_1]] : !fir.ref<f64> -! CHECK: return -! CHECK: } +! CHECK: llvm.intr.round(%{{.*}}) : (f64) -> f64 subroutine anint_test_real8(a, b) real(8) :: a, b b = anint(a) end subroutine -! CHECK-LABEL: func.func @_QPanint_test_real10( -! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<f80> {fir.bindc_name = "a"}, -! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<f80> {fir.bindc_name = "b"}) { -! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref<f80> -! CHECK: %[[VAL_3:.*]] = llvm.intr.round(%[[VAL_2]]) : (f80) -> f80 -! CHECK: fir.store %[[VAL_3]] to %[[VAL_1]] : !fir.ref<f80> -! CHECK: return -! CHECK: } +! CHECK-KIND10-LABEL: func.func @_QPanint_test_real10( +! 
CHECK-KIND10: llvm.intr.round(%{{.*}}) : (f80) -> f80 subroutine anint_test_real10(a, b) - real(10) :: a, b + integer, parameter :: kind10 = merge(10, 4, selected_real_kind(p=18).eq.10) + real(kind10) :: a, b b = anint(a) end subroutine diff --git flang/test/Lower/Intrinsics/dot_product.f90 flang/test/Lower/Intrinsics/dot_product.f90 index 9a825c4b9acf..62694a70555d 100644 --- flang/test/Lower/Intrinsics/dot_product.f90 +++ flang/test/Lower/Intrinsics/dot_product.f90 @@ -1,14 +1,16 @@ -! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s -! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir %s -o - | FileCheck %s +! RUN: %flang_fc1 -emit-fir -O0 %s -o - | FileCheck %s --check-prefixes=CHECK%if target=x86_64{{.*}} %{,CHECK-KIND10%}%if flang-supports-f128-math %{,CHECK-KIND16%} ! DOT_PROD ! CHECK-LABEL: dot_prod_int_default -! CHECK-SAME: %[[x:arg0]]: !fir.box<!fir.array<?xi32>> -! CHECK-SAME: %[[y:arg1]]: !fir.box<!fir.array<?xi32>> -! CHECK-SAME: %[[z:arg2]]: !fir.box<!fir.array<?xi32>> subroutine dot_prod_int_default (x, y, z) integer, dimension(1:) :: x,y integer, dimension(1:) :: z + ! CHECK: %[[x1:.*]] = fir.declare{{.*}}x" + ! CHECK: %[[x:.*]] = fir.rebox %[[x1]]{{.*}} + ! CHECK: %[[y1:.*]] = fir.declare{{.*}}y" + ! CHECK: %[[y:.*]] = fir.rebox %[[y1]]{{.*}} + ! CHECK: %[[z1:.*]] = fir.declare{{.*}}z" + ! CHECK: %[[z:.*]] = fir.rebox %[[z1]]{{.*}} ! CHECK-DAG: %[[x_conv:.*]] = fir.convert %[[x]] : (!fir.box<!fir.array<?xi32>>) -> !fir.box<none> ! CHECK-DAG: %[[y_conv:.*]] = fir.convert %[[y]] : (!fir.box<!fir.array<?xi32>>) -> !fir.box<none> ! CHECK-DAG: %[[res:.*]] = fir.call @_FortranADotProductInteger4(%[[x_conv]], %[[y_conv]], %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.box<none>, !fir.box<none>, !fir.ref<i8>, i32) -> i32 @@ -16,268 +18,172 @@ subroutine dot_prod_int_default (x, y, z) end subroutine ! CHECK-LABEL: dot_prod_int_kind_1 -! CHECK-SAME: %[[x:arg0]]: !fir.box<!fir.array<?xi8>> -! CHECK-SAME: %[[y:arg1]]: !fir.box<!fir.array<?xi8>> -! CHECK-SAME: %[[z:arg2]]: !fir.box<!fir.array<?xi8>> subroutine dot_prod_int_kind_1 (x, y, z) integer(kind=1), dimension(1:) :: x,y integer(kind=1), dimension(1:) :: z - ! CHECK-DAG: %[[x_conv:.*]] = fir.convert %[[x]] : (!fir.box<!fir.array<?xi8>>) -> !fir.box<none> - ! CHECK-DAG: %[[y_conv:.*]] = fir.convert %[[y]] : (!fir.box<!fir.array<?xi8>>) -> !fir.box<none> - ! CHECK-DAG: %[[res:.*]] = fir.call @_FortranADotProductInteger1(%[[x_conv]], %[[y_conv]], %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.box<none>, !fir.box<none>, !fir.ref<i8>, i32) -> i8 + ! CHECK: fir.call @_FortranADotProductInteger1(%{{.*}}, %{{.*}}, %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.box<none>, !fir.box<none>, !fir.ref<i8>, i32) -> i8 z = dot_product(x,y) end subroutine ! CHECK-LABEL: dot_prod_int_kind_2 -! CHECK-SAME: %[[x:arg0]]: !fir.box<!fir.array<?xi16>> -! CHECK-SAME: %[[y:arg1]]: !fir.box<!fir.array<?xi16>> -! CHECK-SAME: %[[z:arg2]]: !fir.box<!fir.array<?xi16>> subroutine dot_prod_int_kind_2 (x, y, z) integer(kind=2), dimension(1:) :: x,y integer(kind=2), dimension(1:) :: z - ! CHECK-DAG: %[[x_conv:.*]] = fir.convert %[[x]] : (!fir.box<!fir.array<?xi16>>) -> !fir.box<none> - ! CHECK-DAG: %[[y_conv:.*]] = fir.convert %[[y]] : (!fir.box<!fir.array<?xi16>>) -> !fir.box<none> - ! CHECK-DAG: %[[res:.*]] = fir.call @_FortranADotProductInteger2(%[[x_conv]], %[[y_conv]], %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.box<none>, !fir.box<none>, !fir.ref<i8>, i32) -> i16 + ! 
CHECK: fir.call @_FortranADotProductInteger2(%{{.*}}, %{{.*}}, %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.box<none>, !fir.box<none>, !fir.ref<i8>, i32) -> i16 z = dot_product(x,y) end subroutine ! CHECK-LABEL: dot_prod_int_kind_4 -! CHECK-SAME: %[[x:arg0]]: !fir.box<!fir.array<?xi32>> -! CHECK-SAME: %[[y:arg1]]: !fir.box<!fir.array<?xi32>> -! CHECK-SAME: %[[z:arg2]]: !fir.box<!fir.array<?xi32>> subroutine dot_prod_int_kind_4 (x, y, z) integer(kind=4), dimension(1:) :: x,y integer(kind=4), dimension(1:) :: z - ! CHECK-DAG: %[[x_conv:.*]] = fir.convert %[[x]] : (!fir.box<!fir.array<?xi32>>) -> !fir.box<none> - ! CHECK-DAG: %[[y_conv:.*]] = fir.convert %[[y]] : (!fir.box<!fir.array<?xi32>>) -> !fir.box<none> - ! CHECK-DAG: %[[res:.*]] = fir.call @_FortranADotProductInteger4(%[[x_conv]], %[[y_conv]], %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.box<none>, !fir.box<none>, !fir.ref<i8>, i32) -> i32 + ! CHECK: fir.call @_FortranADotProductInteger4(%{{.*}}, %{{.*}}, %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.box<none>, !fir.box<none>, !fir.ref<i8>, i32) -> i32 z = dot_product(x,y) end subroutine ! CHECK-LABEL: dot_prod_int_kind_8 -! CHECK-SAME: %[[x:arg0]]: !fir.box<!fir.array<?xi64>> -! CHECK-SAME: %[[y:arg1]]: !fir.box<!fir.array<?xi64>> -! CHECK-SAME: %[[z:arg2]]: !fir.box<!fir.array<?xi64>> subroutine dot_prod_int_kind_8 (x, y, z) integer(kind=8), dimension(1:) :: x,y integer(kind=8), dimension(1:) :: z - ! CHECK-DAG: %[[x_conv:.*]] = fir.convert %[[x]] : (!fir.box<!fir.array<?xi64>>) -> !fir.box<none> - ! CHECK-DAG: %[[y_conv:.*]] = fir.convert %[[y]] : (!fir.box<!fir.array<?xi64>>) -> !fir.box<none> - ! CHECK-DAG: %[[res:.*]] = fir.call @_FortranADotProductInteger8(%[[x_conv]], %[[y_conv]], %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.box<none>, !fir.box<none>, !fir.ref<i8>, i32) -> i64 + ! CHECK: fir.call @_FortranADotProductInteger8(%{{.*}}, %{{.*}}, %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.box<none>, !fir.box<none>, !fir.ref<i8>, i32) -> i64 z = dot_product(x,y) end subroutine ! CHECK-LABEL: dot_prod_int_kind_16 -! CHECK-SAME: %[[x:arg0]]: !fir.box<!fir.array<?xi128>> -! CHECK-SAME: %[[y:arg1]]: !fir.box<!fir.array<?xi128>> -! CHECK-SAME: %[[z:arg2]]: !fir.box<!fir.array<?xi128>> subroutine dot_prod_int_kind_16 (x, y, z) integer(kind=16), dimension(1:) :: x,y integer(kind=16), dimension(1:) :: z - ! CHECK-DAG: %[[x_conv:.*]] = fir.convert %[[x]] : (!fir.box<!fir.array<?xi128>>) -> !fir.box<none> - ! CHECK-DAG: %[[y_conv:.*]] = fir.convert %[[y]] : (!fir.box<!fir.array<?xi128>>) -> !fir.box<none> - ! CHECK-DAG: %[[res:.*]] = fir.call @_FortranADotProductInteger16(%[[x_conv]], %[[y_conv]], %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.box<none>, !fir.box<none>, !fir.ref<i8>, i32) -> i128 + ! CHECK: fir.call @_FortranADotProductInteger16(%{{.*}}, %{{.*}}, %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.box<none>, !fir.box<none>, !fir.ref<i8>, i32) -> i128 z = dot_product(x,y) end subroutine ! CHECK-LABEL: dot_prod_real_kind_default -! CHECK-SAME: %[[x:arg0]]: !fir.box<!fir.array<?xf32>> -! CHECK-SAME: %[[y:arg1]]: !fir.box<!fir.array<?xf32>> -! CHECK-SAME: %[[z:arg2]]: !fir.box<!fir.array<?xf32>> subroutine dot_prod_real_kind_default (x, y, z) real, dimension(1:) :: x,y real, dimension(1:) :: z - ! CHECK-DAG: %[[x_conv:.*]] = fir.convert %[[x]] : (!fir.box<!fir.array<?xf32>>) -> !fir.box<none> - ! CHECK-DAG: %[[y_conv:.*]] = fir.convert %[[y]] : (!fir.box<!fir.array<?xf32>>) -> !fir.box<none> - ! 
CHECK-DAG: %[[res:.*]] = fir.call @_FortranADotProductReal4(%[[x_conv]], %[[y_conv]], %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.box<none>, !fir.box<none>, !fir.ref<i8>, i32) -> f32 + ! CHECK: fir.call @_FortranADotProductReal4(%{{.*}}, %{{.*}}, %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.box<none>, !fir.box<none>, !fir.ref<i8>, i32) -> f32 z = dot_product(x,y) end subroutine ! CHECK-LABEL: dot_prod_real_kind_4 -! CHECK-SAME: %[[x:arg0]]: !fir.box<!fir.array<?xf32>> -! CHECK-SAME: %[[y:arg1]]: !fir.box<!fir.array<?xf32>> -! CHECK-SAME: %[[z:arg2]]: !fir.box<!fir.array<?xf32>> subroutine dot_prod_real_kind_4 (x, y, z) real(kind=4), dimension(1:) :: x,y real(kind=4), dimension(1:) :: z - ! CHECK-DAG: %[[x_conv:.*]] = fir.convert %[[x]] : (!fir.box<!fir.array<?xf32>>) -> !fir.box<none> - ! CHECK-DAG: %[[y_conv:.*]] = fir.convert %[[y]] : (!fir.box<!fir.array<?xf32>>) -> !fir.box<none> - ! CHECK-DAG: %[[res:.*]] = fir.call @_FortranADotProductReal4(%[[x_conv]], %[[y_conv]], %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.box<none>, !fir.box<none>, !fir.ref<i8>, i32) -> f32 + ! CHECK: fir.call @_FortranADotProductReal4(%{{.*}}, %{{.*}}, %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.box<none>, !fir.box<none>, !fir.ref<i8>, i32) -> f32 z = dot_product(x,y) end subroutine ! CHECK-LABEL: dot_prod_real_kind_8 -! CHECK-SAME: %[[x:arg0]]: !fir.box<!fir.array<?xf64>> -! CHECK-SAME: %[[y:arg1]]: !fir.box<!fir.array<?xf64>> -! CHECK-SAME: %[[z:arg2]]: !fir.box<!fir.array<?xf64>> subroutine dot_prod_real_kind_8 (x, y, z) real(kind=8), dimension(1:) :: x,y real(kind=8), dimension(1:) :: z - ! CHECK-DAG: %[[x_conv:.*]] = fir.convert %[[x]] : (!fir.box<!fir.array<?xf64>>) -> !fir.box<none> - ! CHECK-DAG: %[[y_conv:.*]] = fir.convert %[[y]] : (!fir.box<!fir.array<?xf64>>) -> !fir.box<none> - ! CHECK-DAG: %[[res:.*]] = fir.call @_FortranADotProductReal8(%[[x_conv]], %[[y_conv]], %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.box<none>, !fir.box<none>, !fir.ref<i8>, i32) -> f64 + ! CHECK: fir.call @_FortranADotProductReal8(%{{.*}}, %{{.*}}, %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.box<none>, !fir.box<none>, !fir.ref<i8>, i32) -> f64 z = dot_product(x,y) end subroutine -! CHECK-LABEL: dot_prod_real_kind_10 -! CHECK-SAME: %[[x:arg0]]: !fir.box<!fir.array<?xf80>> -! CHECK-SAME: %[[y:arg1]]: !fir.box<!fir.array<?xf80>> -! CHECK-SAME: %[[z:arg2]]: !fir.box<!fir.array<?xf80>> +! CHECK-KIND10-LABEL: dot_prod_real_kind_10 subroutine dot_prod_real_kind_10 (x, y, z) - real(kind=10), dimension(1:) :: x,y - real(kind=10), dimension(1:) :: z - ! CHECK-DAG: %[[x_conv:.*]] = fir.convert %[[x]] : (!fir.box<!fir.array<?xf80>>) -> !fir.box<none> - ! CHECK-DAG: %[[y_conv:.*]] = fir.convert %[[y]] : (!fir.box<!fir.array<?xf80>>) -> !fir.box<none> - ! CHECK-DAG: %[[res:.*]] = fir.call @_FortranADotProductReal10(%[[x_conv]], %[[y_conv]], %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.box<none>, !fir.box<none>, !fir.ref<i8>, i32) -> f80 + integer, parameter :: kind10 = merge(10, 4, selected_real_kind(p=18).eq.10) + real(kind=kind10), dimension(1:) :: x,y + real(kind=kind10), dimension(1:) :: z + ! CHECK-KIND10: fir.call @_FortranADotProductReal10(%{{.*}}, %{{.*}}, %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.box<none>, !fir.box<none>, !fir.ref<i8>, i32) -> f80 z = dot_product(x,y) end subroutine -! CHECK-LABEL: dot_prod_real_kind_16 -! CHECK-SAME: %[[x:arg0]]: !fir.box<!fir.array<?xf128>> -! CHECK-SAME: %[[y:arg1]]: !fir.box<!fir.array<?xf128>> -! CHECK-SAME: %[[z:arg2]]: !fir.box<!fir.array<?xf128>> +! 
CHECK-KIND16-LABEL: dot_prod_real_kind_16 subroutine dot_prod_real_kind_16 (x, y, z) - real(kind=16), dimension(1:) :: x,y - real(kind=16), dimension(1:) :: z - ! CHECK-DAG: %[[x_conv:.*]] = fir.convert %[[x]] : (!fir.box<!fir.array<?xf128>>) -> !fir.box<none> - ! CHECK-DAG: %[[y_conv:.*]] = fir.convert %[[y]] : (!fir.box<!fir.array<?xf128>>) -> !fir.box<none> - ! CHECK-DAG: %[[res:.*]] = fir.call @_FortranADotProductReal16(%[[x_conv]], %[[y_conv]], %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.box<none>, !fir.box<none>, !fir.ref<i8>, i32) -> f128 + integer, parameter :: kind16 = merge(16, 4, selected_real_kind(p=33).eq.16) + real(kind=kind16), dimension(1:) :: x,y + real(kind=kind16), dimension(1:) :: z + ! CHECK-KIND16: fir.call @_FortranADotProductReal16(%{{.*}}, %{{.*}}, %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.box<none>, !fir.box<none>, !fir.ref<i8>, i32) -> f128 z = dot_product(x,y) end subroutine ! CHECK-LABEL: dot_prod_double_default -! CHECK-SAME: %[[x:arg0]]: !fir.box<!fir.array<?xf64>> -! CHECK-SAME: %[[y:arg1]]: !fir.box<!fir.array<?xf64>> -! CHECK-SAME: %[[z:arg2]]: !fir.box<!fir.array<?xf64>> subroutine dot_prod_double_default (x, y, z) double precision, dimension(1:) :: x,y double precision, dimension(1:) :: z - ! CHECK-DAG: %[[x_conv:.*]] = fir.convert %[[x]] : (!fir.box<!fir.array<?xf64>>) -> !fir.box<none> - ! CHECK-DAG: %[[y_conv:.*]] = fir.convert %[[y]] : (!fir.box<!fir.array<?xf64>>) -> !fir.box<none> - ! CHECK-DAG: %[[res:.*]] = fir.call @_FortranADotProductReal8(%[[x_conv]], %[[y_conv]], %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.box<none>, !fir.box<none>, !fir.ref<i8>, i32) -> f64 + ! CHECK: fir.call @_FortranADotProductReal8(%{{.*}}, %{{.*}}, %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.box<none>, !fir.box<none>, !fir.ref<i8>, i32) -> f64 z = dot_product(x,y) end subroutine ! CHECK-LABEL: dot_prod_complex_default -! CHECK-SAME: %[[x:arg0]]: !fir.box<!fir.array<?xcomplex<f32>>> -! CHECK-SAME: %[[y:arg1]]: !fir.box<!fir.array<?xcomplex<f32>>> -! CHECK-SAME: %[[z:arg2]]: !fir.box<!fir.array<?xcomplex<f32>>> subroutine dot_prod_complex_default (x, y, z) complex, dimension(1:) :: x,y complex, dimension(1:) :: z - ! CHECK-DAG: %[[res:.*]] = fir.alloca complex<f32> - ! CHECK-DAG: %[[x_conv:.*]] = fir.convert %[[x]] : (!fir.box<!fir.array<?xcomplex<f32>>>) -> !fir.box<none> - ! CHECK-DAG: %[[y_conv:.*]] = fir.convert %[[y]] : (!fir.box<!fir.array<?xcomplex<f32>>>) -> !fir.box<none> - ! CHECK-DAG: fir.call @_FortranACppDotProductComplex4(%[[res]], %[[x_conv]], %[[y_conv]], %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.ref<complex<f32>>, !fir.box<none>, !fir.box<none>, !fir.ref<i8>, i32) -> () + ! CHECK: %[[res:.*]] = fir.alloca complex<f32> + ! CHECK: fir.call @_FortranACppDotProductComplex4(%[[res]], %{{.*}}, %{{.*}}, %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.ref<complex<f32>>, !fir.box<none>, !fir.box<none>, !fir.ref<i8>, i32) -> () z = dot_product(x,y) end subroutine ! CHECK-LABEL: dot_prod_complex_kind_4 -! CHECK-SAME: %[[x:arg0]]: !fir.box<!fir.array<?xcomplex<f32>>> -! CHECK-SAME: %[[y:arg1]]: !fir.box<!fir.array<?xcomplex<f32>>> -! CHECK-SAME: %[[z:arg2]]: !fir.box<!fir.array<?xcomplex<f32>>> subroutine dot_prod_complex_kind_4 (x, y, z) complex(kind=4), dimension(1:) :: x,y complex(kind=4), dimension(1:) :: z - ! CHECK-DAG: %[[res:.*]] = fir.alloca complex<f32> - ! CHECK-DAG: %[[x_conv:.*]] = fir.convert %[[x]] : (!fir.box<!fir.array<?xcomplex<f32>>>) -> !fir.box<none> - ! CHECK-DAG: %[[y_conv:.*]] = fir.convert %[[y]] : (!fir.box<!fir.array<?xcomplex<f32>>>) -> !fir.box<none> - ! 
CHECK-DAG: fir.call @_FortranACppDotProductComplex4(%[[res]], %[[x_conv]], %[[y_conv]], %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.ref<complex<f32>>, !fir.box<none>, !fir.box<none>, !fir.ref<i8>, i32) -> () + ! CHECK: %[[res:.*]] = fir.alloca complex<f32> + ! CHECK: fir.call @_FortranACppDotProductComplex4(%[[res]], %{{.*}}, %{{.*}}, %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.ref<complex<f32>>, !fir.box<none>, !fir.box<none>, !fir.ref<i8>, i32) -> () z = dot_product(x,y) end subroutine ! CHECK-LABEL: dot_prod_complex_kind_8 -! CHECK-SAME: %[[x:arg0]]: !fir.box<!fir.array<?xcomplex<f64>>> -! CHECK-SAME: %[[y:arg1]]: !fir.box<!fir.array<?xcomplex<f64>>> -! CHECK-SAME: %[[z:arg2]]: !fir.box<!fir.array<?xcomplex<f64>>> subroutine dot_prod_complex_kind_8 (x, y, z) complex(kind=8), dimension(1:) :: x,y complex(kind=8), dimension(1:) :: z - ! CHECK-DAG: %[[res:.*]] = fir.alloca complex<f64> - ! CHECK-DAG: %[[x_conv:.*]] = fir.convert %[[x]] : (!fir.box<!fir.array<?xcomplex<f64>>>) -> !fir.box<none> - ! CHECK-DAG: %[[y_conv:.*]] = fir.convert %[[y]] : (!fir.box<!fir.array<?xcomplex<f64>>>) -> !fir.box<none> - ! CHECK-DAG: fir.call @_FortranACppDotProductComplex8(%[[res]], %[[x_conv]], %[[y_conv]], %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.ref<complex<f64>>, !fir.box<none>, !fir.box<none>, !fir.ref<i8>, i32) -> () + ! CHECK: %[[res:.*]] = fir.alloca complex<f64> + ! CHECK: fir.call @_FortranACppDotProductComplex8(%[[res]], %{{.*}}, %{{.*}}, %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.ref<complex<f64>>, !fir.box<none>, !fir.box<none>, !fir.ref<i8>, i32) -> () z = dot_product(x,y) end subroutine -! CHECK-LABEL: dot_prod_complex_kind_10 -! CHECK-SAME: %[[x:arg0]]: !fir.box<!fir.array<?xcomplex<f80>>> -! CHECK-SAME: %[[y:arg1]]: !fir.box<!fir.array<?xcomplex<f80>>> -! CHECK-SAME: %[[z:arg2]]: !fir.box<!fir.array<?xcomplex<f80>>> +! CHECK-KIND10-LABEL: dot_prod_complex_kind_10 subroutine dot_prod_complex_kind_10 (x, y, z) - complex(kind=10), dimension(1:) :: x,y - complex(kind=10), dimension(1:) :: z - ! CHECK-DAG: %[[res:.*]] = fir.alloca complex<f80> - ! CHECK-DAG: %[[x_conv:.*]] = fir.convert %[[x]] : (!fir.box<!fir.array<?xcomplex<f80>>>) -> !fir.box<none> - ! CHECK-DAG: %[[y_conv:.*]] = fir.convert %[[y]] : (!fir.box<!fir.array<?xcomplex<f80>>>) -> !fir.box<none> - ! CHECK-DAG: fir.call @_FortranACppDotProductComplex10(%[[res]], %[[x_conv]], %[[y_conv]], %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.ref<complex<f80>>, !fir.box<none>, !fir.box<none>, !fir.ref<i8>, i32) -> () + integer, parameter :: kind10 = merge(10, 4, selected_real_kind(p=18).eq.10) + complex(kind=kind10), dimension(1:) :: x,y + complex(kind=kind10), dimension(1:) :: z + ! CHECK-KIND10: %[[res:.*]] = fir.alloca complex<f80> + ! CHECK-KIND10: fir.call @_FortranACppDotProductComplex10(%[[res]], %{{.*}}, %{{.*}}, %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.ref<complex<f80>>, !fir.box<none>, !fir.box<none>, !fir.ref<i8>, i32) -> () z = dot_product(x,y) end subroutine -! CHECK-LABEL: dot_prod_complex_kind_16 -! CHECK-SAME: %[[x:arg0]]: !fir.box<!fir.array<?xcomplex<f128>>> -! CHECK-SAME: %[[y:arg1]]: !fir.box<!fir.array<?xcomplex<f128>>> -! CHECK-SAME: %[[z:arg2]]: !fir.box<!fir.array<?xcomplex<f128>>> +! CHECK-KIND16-LABEL: dot_prod_complex_kind_16 subroutine dot_prod_complex_kind_16 (x, y, z) - complex(kind=16), dimension(1:) :: x,y - complex(kind=16), dimension(1:) :: z - ! CHECK-DAG: %[[res:.*]] = fir.alloca complex<f128> - ! CHECK-DAG: %[[x_conv:.*]] = fir.convert %[[x]] : (!fir.box<!fir.array<?xcomplex<f128>>>) -> !fir.box<none> - ! 
CHECK-DAG: %[[y_conv:.*]] = fir.convert %[[y]] : (!fir.box<!fir.array<?xcomplex<f128>>>) -> !fir.box<none> - ! CHECK-DAG: fir.call @_FortranACppDotProductComplex16(%[[res]], %[[x_conv]], %[[y_conv]], %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.ref<complex<f128>>, !fir.box<none>, !fir.box<none>, !fir.ref<i8>, i32) -> () + integer, parameter :: kind16 = merge(16, 4, selected_real_kind(p=33).eq.16) + complex(kind=kind16), dimension(1:) :: x,y + complex(kind=kind16), dimension(1:) :: z + ! CHECK-KIND16: %[[res:.*]] = fir.alloca complex<f128> + ! CHECK-KIND16: fir.call @_FortranACppDotProductComplex16(%[[res]], %{{.*}}, %{{.*}}, %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.ref<complex<f128>>, !fir.box<none>, !fir.box<none>, !fir.ref<i8>, i32) -> () z = dot_product(x,y) end subroutine ! CHECK-LABEL: dot_prod_logical -! CHECK-SAME: %[[x:arg0]]: !fir.box<!fir.array<?x!fir.logical<4>>> -! CHECK-SAME: %[[y:arg1]]: !fir.box<!fir.array<?x!fir.logical<4>>> -! CHECK-SAME: %[[z:arg2]]: !fir.box<!fir.array<?x!fir.logical<4>>> subroutine dot_prod_logical (x, y, z) logical, dimension(1:) :: x,y logical, dimension(1:) :: z - ! CHECK-DAG: %[[x_conv:.*]] = fir.convert %[[x]] : (!fir.box<!fir.array<?x!fir.logical<4>>>) -> !fir.box<none> - ! CHECK-DAG: %[[y_conv:.*]] = fir.convert %[[y]] : (!fir.box<!fir.array<?x!fir.logical<4>>>) -> !fir.box<none> - ! CHECK-DAG: %[[res:.*]] = fir.call @_FortranADotProductLogical(%[[x_conv]], %[[y_conv]], %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.box<none>, !fir.box<none>, !fir.ref<i8>, i32) -> i1 + ! CHECK: fir.call @_FortranADotProductLogical(%{{.*}}, %{{.*}}, %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.box<none>, !fir.box<none>, !fir.ref<i8>, i32) -> i1 z = dot_product(x,y) end subroutine ! CHECK-LABEL: dot_product_mixed_int_real -! CHECK-SAME: %[[x:arg0]]: !fir.box<!fir.array<?xi32>> -! CHECK-SAME: %[[y:arg1]]: !fir.box<!fir.array<?xf32>> -! CHECK-SAME: %[[z:arg2]]: !fir.box<!fir.array<?xf32>> subroutine dot_product_mixed_int_real(x, y, z) integer, dimension(1:) :: x real, dimension(1:) :: y, z - ! CHECK-DAG: %[[x_conv:.*]] = fir.convert %[[x]] : (!fir.box<!fir.array<?xi32>>) -> !fir.box<none> - ! CHECK-DAG: %[[y_conv:.*]] = fir.convert %[[y]] : (!fir.box<!fir.array<?xf32>>) -> !fir.box<none> - ! CHECK-DAG: %[[res:.*]] = fir.call @_FortranADotProductReal4(%[[x_conv]], %[[y_conv]], %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.box<none>, !fir.box<none>, !fir.ref<i8>, i32) -> f32 + ! CHECK: fir.call @_FortranADotProductReal4(%{{.*}}, %{{.*}}, %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.box<none>, !fir.box<none>, !fir.ref<i8>, i32) -> f32 z = dot_product(x,y) end subroutine ! CHECK-LABEL: dot_product_mixed_int_complex -! CHECK-SAME: %[[x:arg0]]: !fir.box<!fir.array<?xi32>> -! CHECK-SAME: %[[y:arg1]]: !fir.box<!fir.array<?xcomplex<f32>>> -! CHECK-SAME: %[[z:arg2]]: !fir.box<!fir.array<?xcomplex<f32>>> subroutine dot_product_mixed_int_complex(x, y, z) integer, dimension(1:) :: x complex, dimension(1:) :: y, z - ! CHECK-DAG: %[[res:.*]] = fir.alloca complex<f32> - ! CHECK-DAG: %[[x_conv:.*]] = fir.convert %[[x]] : (!fir.box<!fir.array<?xi32>>) -> !fir.box<none> - ! CHECK-DAG: %[[y_conv:.*]] = fir.convert %[[y]] : (!fir.box<!fir.array<?xcomplex<f32>>>) -> !fir.box<none> - ! CHECK-DAG: fir.call @_FortranACppDotProductComplex4(%[[res]], %[[x_conv]], %[[y_conv]], %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.ref<complex<f32>>, !fir.box<none>, !fir.box<none>, !fir.ref<i8>, i32) -> () + ! CHECK: %[[res:.*]] = fir.alloca complex<f32> + ! 
CHECK: fir.call @_FortranACppDotProductComplex4(%[[res]], %{{.*}}, %{{.*}}, %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.ref<complex<f32>>, !fir.box<none>, !fir.box<none>, !fir.ref<i8>, i32) -> () z = dot_product(x,y) end subroutine ! CHECK-LABEL: dot_product_mixed_real_complex -! CHECK-SAME: %[[x:arg0]]: !fir.box<!fir.array<?xf32>> -! CHECK-SAME: %[[y:arg1]]: !fir.box<!fir.array<?xcomplex<f32>>> -! CHECK-SAME: %[[z:arg2]]: !fir.box<!fir.array<?xcomplex<f32>>> subroutine dot_product_mixed_real_complex(x, y, z) real, dimension(1:) :: x complex, dimension(1:) :: y, z - ! CHECK-DAG: %[[res:.*]] = fir.alloca complex<f32> - ! CHECK-DAG: %[[x_conv:.*]] = fir.convert %[[x]] : (!fir.box<!fir.array<?xf32>>) -> !fir.box<none> - ! CHECK-DAG: %[[y_conv:.*]] = fir.convert %[[y]] : (!fir.box<!fir.array<?xcomplex<f32>>>) -> !fir.box<none> - ! CHECK-DAG: fir.call @_FortranACppDotProductComplex4(%[[res]], %[[x_conv]], %[[y_conv]], %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.ref<complex<f32>>, !fir.box<none>, !fir.box<none>, !fir.ref<i8>, i32) -> () + ! CHECK: %[[res:.*]] = fir.alloca complex<f32> + ! CHECK: fir.call @_FortranACppDotProductComplex4(%[[res]], %{{.*}}, %{{.*}}, %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.ref<complex<f32>>, !fir.box<none>, !fir.box<none>, !fir.ref<i8>, i32) -> () z = dot_product(x,y) end subroutine diff --git flang/test/Lower/Intrinsics/exponent.f90 flang/test/Lower/Intrinsics/exponent.f90 index e4db238c6d5a..f5f0603ee13c 100644 --- flang/test/Lower/Intrinsics/exponent.f90 +++ flang/test/Lower/Intrinsics/exponent.f90 @@ -1,41 +1,38 @@ -! RUN: bbc -emit-fir %s -o - | FileCheck %s +! RUN: bbc -emit-hlfir %s -o - | FileCheck %s --check-prefixes=CHECK%if target=x86_64{{.*}} %{,CHECK-KIND10%}%if flang-supports-f128-math %{,CHECK-KIND16%} ! EXPONENT -! CHECK-LABEL: exponent_test -subroutine exponent_test - integer :: i1, i2, i3, i4 - ! CHECK: %[[i0:.*]] = fir.alloca i32 {bindc_name = "i1", uniq_name = "_QFexponent_testEi1"} - ! CHECK: %[[i1:.*]] = fir.alloca i32 {bindc_name = "i2", uniq_name = "_QFexponent_testEi2"} - ! CHECK: %[[i2:.*]] = fir.alloca i32 {bindc_name = "i3", uniq_name = "_QFexponent_testEi3"} - ! CHECK: %[[i3:.*]] = fir.alloca i32 {bindc_name = "i4", uniq_name = "_QFexponent_testEi4"} - - real(kind = 4) :: x1 - real(kind = 8) :: x2 - real(kind = 10) :: x3 - real(kind = 16) :: x4 - ! CHECK: %[[x0:.*]] = fir.alloca f32 {bindc_name = "x1", uniq_name = "_QFexponent_testEx1"} - ! CHECK: %[[x1:.*]] = fir.alloca f64 {bindc_name = "x2", uniq_name = "_QFexponent_testEx2"} - ! CHECK: %[[x2:.*]] = fir.alloca f80 {bindc_name = "x3", uniq_name = "_QFexponent_testEx3"} - ! CHECK: %[[x3:.*]] = fir.alloca f128 {bindc_name = "x4", uniq_name = "_QFexponent_testEx4"} - - i1 = exponent(x1) - ! CHECK: %[[temp0:.*]] = fir.load %[[x0:.*]] : !fir.ref<f32> - ! CHECK: %[[result0:.*]] = fir.call @_FortranAExponent4_4(%[[temp0:.*]]) {{.*}}: (f32) -> i32 - ! CHECK: fir.store %[[result0:.*]] to %[[i0:.*]] : !fir.ref<i32> - - i2 = exponent(x2) - ! CHECK: %[[temp1:.*]] = fir.load %[[x1:.*]] : !fir.ref<f64> - ! CHECK: %[[result1:.*]] = fir.call @_FortranAExponent8_4(%[[temp1:.*]]) {{.*}}: (f64) -> i32 - ! CHECK: fir.store %[[result1:.*]] to %[[i1:.*]] : !fir.ref<i32> - - i3 = exponent(x3) - ! CHECK: %[[temp2:.*]] = fir.load %[[x2:.*]] : !fir.ref<f80> - ! CHECK: %[[result2:.*]] = fir.call @_FortranAExponent10_4(%[[temp2:.*]]) {{.*}}: (f80) -> i32 - ! CHECK: fir.store %[[result2:.*]] to %[[i2:.*]] : !fir.ref<i32> - - i4 = exponent(x4) - ! CHECK: %[[temp3:.*]] = fir.load %[[x3:.*]] : !fir.ref<f128> - ! 
CHECK: %[[result3:.*]] = fir.call @_FortranAExponent16_4(%[[temp3:.*]]) {{.*}}: (f128) -> i32 - ! CHECK: fir.store %[[result3:.*]] to %[[i3:.*]] : !fir.ref<i32> - end subroutine exponent_test +! CHECK-LABEL: exponent_test( +subroutine exponent_test(i1, i2, x4, x8) + integer :: i1, i2, i3 + real(kind = 4) :: x4 + real(kind = 8) :: x8 + + i1 = exponent(x4) + ! CHECK: %[[temp0:.*]] = fir.load %{{.*}} : !fir.ref<f32> + ! CHECK: fir.call @_FortranAExponent4_4(%[[temp0:.*]]) {{.*}}: (f32) -> i32 + + i2 = exponent(x8) + ! CHECK: %[[temp1:.*]] = fir.load %{{.*}} : !fir.ref<f64> + ! CHECK: fir.call @_FortranAExponent8_4(%[[temp1:.*]]) {{.*}}: (f64) -> i32 +end subroutine exponent_test + +! CHECK-KIND10-LABEL: exponent_10( +subroutine exponent_10(i, x10) + integer :: i + integer, parameter :: kind10 = merge(10, 4, selected_real_kind(p=18).eq.10) + real(kind = kind10) :: x10 + i = exponent(x10) + ! CHECK-KIND10: %[[temp2:.*]] = fir.load %{{.*}} : !fir.ref<f80> + ! CHECK-KIND10: fir.call @_FortranAExponent10_4(%[[temp2:.*]]) {{.*}}: (f80) -> i32 +end subroutine + +! CHECK-KIND16-LABEL: exponent_16( +subroutine exponent_16(i, x16) + integer :: i + integer, parameter :: kind16 = merge(16, 4, selected_real_kind(p=33).eq.16) + real(kind = kind16) :: x16 + i = exponent(x16) + ! CHECK-KIND16: %[[temp2:.*]] = fir.load %{{.*}} : !fir.ref<f128> + ! CHECK-KIND16: fir.call @_FortranAExponent16_4(%[[temp2:.*]]) {{.*}}: (f128) -> i32 +end subroutine diff --git flang/test/Lower/Intrinsics/fma_real16.f90 flang/test/Lower/Intrinsics/fma_real16.f90 index 62cf2fbcefbf..ecc32635be9d 100644 --- flang/test/Lower/Intrinsics/fma_real16.f90 +++ flang/test/Lower/Intrinsics/fma_real16.f90 @@ -1,3 +1,4 @@ +! REQUIRES: flang-supports-f128-math ! RUN: bbc -emit-fir %s -o - | FileCheck %s ! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s ! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s diff --git flang/test/Lower/Intrinsics/fraction.f90 flang/test/Lower/Intrinsics/fraction.f90 index f9fff725eb27..594beb97cc5a 100644 --- flang/test/Lower/Intrinsics/fraction.f90 +++ flang/test/Lower/Intrinsics/fraction.f90 @@ -1,35 +1,35 @@ -! RUN: bbc -emit-fir %s -o - | FileCheck %s - +! RUN: bbc -emit-hlfir %s -o - | FileCheck %s --check-prefixes=CHECK%if target=x86_64{{.*}} %{,CHECK-KIND10%}%if flang-supports-f128-math %{,CHECK-KIND16%} + ! FRACTION -! CHECK-LABE: fraction_test -subroutine fraction_test - real(kind=4) :: x1 = 178.1387e-4 - real(kind=8) :: x2 = 178.1387e-4 - real(kind=10) :: x3 = 178.1387e-4 - real(kind=16) :: x4 = 178.1387e-4 - ! CHECK: %[[r0:.*]] = fir.address_of(@_QFfraction_testEx1) : !fir.ref<f32> - ! CHECK: %[[r1:.*]] = fir.address_of(@_QFfraction_testEx2) : !fir.ref<f64> - ! CHECK: %[[r2:.*]] = fir.address_of(@_QFfraction_testEx3) : !fir.ref<f80> - ! CHECK: %[[r3:.*]] = fir.address_of(@_QFfraction_testEx4) : !fir.ref<f128> - - x1 = fraction(x1) - ! CHECK: %[[temp0:.*]] = fir.load %[[r0:.*]] : !fir.ref<f32> - ! CHECK: %[[result0:.*]] = fir.call @_FortranAFraction4(%[[temp0:.*]]) {{.*}}: (f32) -> f32 - ! CHECK: fir.store %[[result0:.*]] to %[[r0:.*]] : !fir.ref<f32> - - x2 = fraction(x2) - ! CHECK: %[[temp1:.*]] = fir.load %[[r1:.*]] : !fir.ref<f64> - ! CHECK: %[[result1:.*]] = fir.call @_FortranAFraction8(%[[temp1:.*]]) {{.*}}: (f64) -> f64 - ! CHECK: fir.store %[[result1:.*]] to %[[r1:.*]] : !fir.ref<f64> - - x3 = fraction(x3) - ! CHECK: %[[temp2:.*]] = fir.load %[[r2:.*]] : !fir.ref<f80> - ! CHECK: %[[result2:.*]] = fir.call @_FortranAFraction10(%[[temp2:.*]]) {{.*}}: (f80) -> f80 - ! 
CHECK: fir.store %[[result2:.*]] to %[[r2:.*]] : !fir.ref<f80> - - x4 = fraction(x4) - ! CHECK: %[[temp3:.*]] = fir.load %[[r3:.*]] : !fir.ref<f128> - ! CHECK: %[[result3:.*]] = fir.call @_FortranAFraction16(%[[temp3:.*]]) {{.*}}: (f128) -> f128 - ! CHECK: fir.store %[[result3:.*]] to %[[r3:.*]] : !fir.ref<f128> - end subroutine fraction_test +! CHECK-LABEL: fraction_test( +subroutine fraction_test(res4, res8, x4, x8) + real(kind = 4) :: x4, res4 + real(kind = 8) :: x8, res8 + + res4 = fraction(x4) + ! CHECK: %[[temp0:.*]] = fir.load %{{.*}} : !fir.ref<f32> + ! CHECK: fir.call @_FortranAFraction4(%[[temp0:.*]]) {{.*}}: (f32) -> f32 + + res8 = fraction(x8) + ! CHECK: %[[temp1:.*]] = fir.load %{{.*}} : !fir.ref<f64> + ! CHECK: fir.call @_FortranAFraction8(%[[temp1:.*]]) {{.*}}: (f64) -> f64 +end subroutine fraction_test + +! CHECK-KIND10-LABEL: fraction_10( +subroutine fraction_10(res10, x10) + integer, parameter :: kind10 = merge(10, 4, selected_real_kind(p=18).eq.10) + real(kind = kind10) :: x10, res10 + res10 = fraction(x10) + ! CHECK-KIND10: %[[temp2:.*]] = fir.load %{{.*}} : !fir.ref<f80> + ! CHECK-KIND10: fir.call @_FortranAFraction10(%[[temp2:.*]]) {{.*}}: (f80) -> f80 +end subroutine + +! CHECK-KIND16-LABEL: fraction_16( +subroutine fraction_16(res16, x16) + integer, parameter :: kind16 = merge(16, 4, selected_real_kind(p=33).eq.16) + real(kind = kind16) :: x16, res16 + res16 = fraction(x16) + ! CHECK-KIND16: %[[temp2:.*]] = fir.load %{{.*}} : !fir.ref<f128> + ! CHECK-KIND16: fir.call @_FortranAFraction16(%[[temp2:.*]]) {{.*}}: (f128) -> f128 +end subroutine diff --git flang/test/Lower/Intrinsics/ieee_class_queries.f90 flang/test/Lower/Intrinsics/ieee_class_queries.f90 index b2f9df83a902..aae8071b6a8b 100644 --- flang/test/Lower/Intrinsics/ieee_class_queries.f90 +++ flang/test/Lower/Intrinsics/ieee_class_queries.f90 @@ -1,5 +1,6 @@ ! REQUIRES: flang-supports-f128-math -! RUN: bbc -emit-fir -o - %s | FileCheck %s +! REQUIRES: x86_64-registered-target +! RUN: bbc -target x86_64-unknown-linux-gnu -emit-fir -o - %s | FileCheck %s ! CHECK-LABEL: func @_QQmain use ieee_arithmetic, only: ieee_is_finite, ieee_is_nan, ieee_is_negative, & @@ -8,7 +9,6 @@ real(3) :: x3 = -3.0 real(4) :: x4 = -4.0 real(8) :: x8 = -8.0 - real(10) :: x10 = -10.0 real(16) :: x16 = -16.0 ! CHECK: "llvm.intr.is.fpclass"(%{{.*}}) <{bit = 504 : i32}> : (f16) -> i1 diff --git flang/test/Lower/Intrinsics/ieee_is_normal.f90 flang/test/Lower/Intrinsics/ieee_is_normal.f90 index 9b864c9a9849..d55b2e3c0856 100644 --- flang/test/Lower/Intrinsics/ieee_is_normal.f90 +++ flang/test/Lower/Intrinsics/ieee_is_normal.f90 @@ -1,5 +1,4 @@ -! RUN: bbc -emit-fir %s -o - | FileCheck %s -! RUN: flang -fc1 -emit-fir %s -o - | FileCheck %s +! RUN: bbc -emit-fir %s -o - | FileCheck %s --check-prefixes=CHECK%if target=x86_64{{.*}} %{,CHECK-KIND10%}%if flang-supports-f128-math %{,CHECK-KIND16%} ! CHECK-LABEL: ieee_is_normal_f16 subroutine ieee_is_normal_f16(r) @@ -39,20 +38,22 @@ subroutine ieee_is_normal_f64(r) ! CHECK: fir.convert %[[l]] : (i1) -> !fir.logical<4> end subroutine ieee_is_normal_f64 -! CHECK-LABEL: ieee_is_normal_f80 +! CHECK-KIND10-LABEL: ieee_is_normal_f80 subroutine ieee_is_normal_f80(r) use ieee_arithmetic - real(KIND=10) :: r + integer, parameter :: kind10 = merge(10, 4, selected_real_kind(p=18).eq.10) + real(KIND=kind10) :: r i = ieee_is_normal(r) - ! CHECK: %[[l:.*]] = "llvm.intr.is.fpclass"(%{{.*}}) <{bit = 360 : i32}> : (f80) -> i1 - ! CHECK: fir.convert %[[l]] : (i1) -> !fir.logical<4> + ! 
CHECK-KIND10: %[[l:.*]] = "llvm.intr.is.fpclass"(%{{.*}}) <{bit = 360 : i32}> : (f80) -> i1 + ! CHECK-KIND10: fir.convert %[[l]] : (i1) -> !fir.logical<4> end subroutine ieee_is_normal_f80 -! CHECK-LABEL: ieee_is_normal_f128 +! CHECK-KIND16-LABEL: ieee_is_normal_f128 subroutine ieee_is_normal_f128(r) use ieee_arithmetic - real(KIND=16) :: r + integer, parameter :: kind16 = merge(16, 4, selected_real_kind(p=33).eq.16) + real(KIND=kind16) :: r i = ieee_is_normal(r) - ! CHECK: %[[l:.*]] = "llvm.intr.is.fpclass"(%{{.*}}) <{bit = 360 : i32}> : (f128) -> i1 - ! CHECK: fir.convert %[[l]] : (i1) -> !fir.logical<4> + ! CHECK-KIND16: %[[l:.*]] = "llvm.intr.is.fpclass"(%{{.*}}) <{bit = 360 : i32}> : (f128) -> i1 + ! CHECK-KIND16: fir.convert %[[l]] : (i1) -> !fir.logical<4> end subroutine ieee_is_normal_f128 diff --git flang/test/Lower/Intrinsics/ieee_next.f90 flang/test/Lower/Intrinsics/ieee_next.f90 index eb9cc028368a..aa545967f9bc 100644 --- flang/test/Lower/Intrinsics/ieee_next.f90 +++ flang/test/Lower/Intrinsics/ieee_next.f90 @@ -1,282 +1,394 @@ -! RUN: bbc -emit-fir -o - %s | FileCheck %s +! RUN: bbc -emit-hlfir %s -o - | FileCheck %s --check-prefixes=CHECK%if target=x86_64{{.*}} %{,CHECK-KIND10%}%if flang-supports-f128-math %{,CHECK-KIND16%} -! CHECK-LABEL: c.func @_QQmain -program p +module ieee_next_tests use ieee_arithmetic, only: ieee_value, ieee_negative_inf, ieee_positive_inf use ieee_arithmetic, only: ieee_next_after, ieee_next_down, ieee_next_up implicit none - ! CHECK-DAG: %[[V_4:[0-9]+]] = fir.alloca f80 {bindc_name = "r10", uniq_name = "_QFEr10"} - ! CHECK-DAG: %[[V_5:[0-9]+]] = fir.declare %[[V_4]] {uniq_name = "_QFEr10"} : (!fir.ref<f80>) -> !fir.ref<f80> - ! CHECK-DAG: %[[V_6:[0-9]+]] = fir.alloca f128 {bindc_name = "r16", uniq_name = "_QFEr16"} - ! CHECK-DAG: %[[V_7:[0-9]+]] = fir.declare %[[V_6]] {uniq_name = "_QFEr16"} : (!fir.ref<f128>) -> !fir.ref<f128> - ! CHECK-DAG: %[[V_8:[0-9]+]] = fir.alloca f16 {bindc_name = "r2", uniq_name = "_QFEr2"} - ! CHECK-DAG: %[[V_9:[0-9]+]] = fir.declare %[[V_8]] {uniq_name = "_QFEr2"} : (!fir.ref<f16>) -> !fir.ref<f16> - ! CHECK-DAG: %[[V_10:[0-9]+]] = fir.alloca bf16 {bindc_name = "r3", uniq_name = "_QFEr3"} - ! CHECK-DAG: %[[V_11:[0-9]+]] = fir.declare %[[V_10]] {uniq_name = "_QFEr3"} : (!fir.ref<bf16>) -> !fir.ref<bf16> - ! CHECK-DAG: %[[V_12:[0-9]+]] = fir.alloca f32 {bindc_name = "r4", uniq_name = "_QFEr4"} - ! CHECK-DAG: %[[V_13:[0-9]+]] = fir.declare %[[V_12]] {uniq_name = "_QFEr4"} : (!fir.ref<f32>) -> !fir.ref<f32> - ! CHECK-DAG: %[[V_14:[0-9]+]] = fir.alloca f64 {bindc_name = "r8", uniq_name = "_QFEr8"} - ! CHECK-DAG: %[[V_15:[0-9]+]] = fir.declare %[[V_14]] {uniq_name = "_QFEr8"} : (!fir.ref<f64>) -> !fir.ref<f64> - ! CHECK-DAG: %[[V_16:[0-9]+]] = fir.address_of(@_QFEx10) : !fir.ref<f80> - ! CHECK-DAG: %[[V_17:[0-9]+]] = fir.declare %[[V_16]] {uniq_name = "_QFEx10"} : (!fir.ref<f80>) -> !fir.ref<f80> - ! CHECK-DAG: %[[V_18:[0-9]+]] = fir.alloca f128 {bindc_name = "x16", uniq_name = "_QFEx16"} - ! CHECK-DAG: %[[V_19:[0-9]+]] = fir.declare %[[V_18]] {uniq_name = "_QFEx16"} : (!fir.ref<f128>) -> !fir.ref<f128> - ! CHECK-DAG: %[[V_20:[0-9]+]] = fir.alloca f16 {bindc_name = "x2", uniq_name = "_QFEx2"} - ! CHECK-DAG: %[[V_21:[0-9]+]] = fir.declare %[[V_20]] {uniq_name = "_QFEx2"} : (!fir.ref<f16>) -> !fir.ref<f16> - ! CHECK-DAG: %[[V_22:[0-9]+]] = fir.address_of(@_QFEx3) : !fir.ref<bf16> - ! CHECK-DAG: %[[V_23:[0-9]+]] = fir.declare %[[V_22]] {uniq_name = "_QFEx3"} : (!fir.ref<bf16>) -> !fir.ref<bf16> - ! 
CHECK-DAG: %[[V_24:[0-9]+]] = fir.address_of(@_QFEx4) : !fir.ref<f32> - ! CHECK-DAG: %[[V_25:[0-9]+]] = fir.declare %[[V_24]] {uniq_name = "_QFEx4"} : (!fir.ref<f32>) -> !fir.ref<f32> - ! CHECK-DAG: %[[V_26:[0-9]+]] = fir.address_of(@_QFEx8) : !fir.ref<f64> - ! CHECK-DAG: %[[V_27:[0-9]+]] = fir.declare %[[V_26]] {uniq_name = "_QFEx8"} : (!fir.ref<f64>) -> !fir.ref<f64> - real(2) :: r2, x2 - real(3) :: r3, x3 = -huge(x3) - real(4) :: r4, x4 = -0. - real(8) :: r8, x8 = 0. - real(10) :: r10, x10 = huge(x10) - real(16) :: r16, x16 - - x2 = ieee_value(x2, ieee_negative_inf) - x16 = ieee_value(x2, ieee_positive_inf) + integer, parameter :: kind10 = merge(10, 4, selected_real_kind(p=18).eq.10) + integer, parameter :: kind16 = merge(16, 4, selected_real_kind(p=33).eq.16) +contains - ! CHECK: %[[V_45:[0-9]+]] = fir.load %[[V_21]] : !fir.ref<f16> - ! CHECK: %[[V_46:[0-9]+]] = fir.load %[[V_17]] : !fir.ref<f80> - ! CHECK-DAG: %[[V_47:[0-9]+]] = fir.coordinate_of %{{.*}}, %c2{{.*}} : (!fir.ref<!fir.array<12xi16>>, i8) -> !fir.ref<i16> - ! CHECK-DAG: %[[V_48:[0-9]+]] = fir.load %[[V_47]] : !fir.ref<i16> - ! CHECK-DAG: %[[V_49:[0-9]+]] = arith.bitcast %[[V_48]] : i16 to f16 - ! CHECK-DAG: %[[V_50:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_46]]) <{bit = 3 : i32}> : (f80) -> i1 - ! CHECK: %[[V_51:[0-9]+]] = arith.select %[[V_50]], %[[V_49]], %[[V_45]] : f16 - ! CHECK: %[[V_52:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_51]]) <{bit = 3 : i32}> : (f16) -> i1 - ! CHECK: %[[V_53:[0-9]+]] = fir.convert %[[V_51]] : (f16) -> f80 - ! CHECK: %[[V_54:[0-9]+]] = arith.cmpf oeq, %[[V_53]], %[[V_46]] fastmath<contract> : f80 - ! CHECK: %[[V_55:[0-9]+]] = arith.ori %[[V_52]], %[[V_54]] : i1 - ! CHECK: %[[V_56:[0-9]+]] = arith.cmpf olt, %[[V_53]], %[[V_46]] fastmath<contract> : f80 - ! CHECK: %[[V_57:[0-9]+]] = arith.bitcast %[[V_45]] : f16 to i16 - ! CHECK: %[[V_58:[0-9]+]] = arith.shrui %[[V_57]], %c15{{.*}} : i16 - ! CHECK: %[[V_59:[0-9]+]] = fir.convert %[[V_58]] : (i16) -> i1 - ! CHECK: %[[V_60:[0-9]+]] = arith.cmpi ne, %[[V_56]], %[[V_59]] : i1 - ! CHECK: %[[V_61:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_51]]) <{bit = 516 : i32}> : (f16) -> i1 - ! CHECK: %[[V_62:[0-9]+]] = arith.andi %[[V_61]], %[[V_60]] : i1 - ! CHECK: %[[V_63:[0-9]+]] = arith.ori %[[V_55]], %[[V_62]] : i1 - ! CHECK: %[[V_64:[0-9]+]] = fir.if %[[V_63]] -> (f16) { - ! CHECK: fir.result %[[V_51]] : f16 - ! CHECK: } else { - ! CHECK: %[[V_202:[0-9]+]] = arith.cmpf oeq, %[[V_51]], %cst{{[_0-9]*}} fastmath<contract> : f16 - ! CHECK: %[[V_203:[0-9]+]] = fir.if %[[V_202]] -> (f16) { - ! CHECK: %[[V_204:[0-9]+]] = arith.select %[[V_56]], %cst{{[_0-9]*}}, %cst{{[_0-9]*}} : f16 - ! CHECK: %[[V_205:[0-9]+]] = fir.call @_FortranAMapException(%c48{{.*}}) fastmath<contract> : (i32) -> i32 - ! CHECK: fir.call @feraiseexcept(%[[V_205]]) fastmath<contract> : (i32) -> i32 - ! CHECK: fir.result %[[V_204]] : f16 - ! CHECK: } else { - ! CHECK: %[[V_204:[0-9]+]] = arith.bitcast %[[V_51]] : f16 to i16 - ! CHECK-DAG: %[[V_205:[0-9]+]] = arith.subi %[[V_204]], %c1{{.*}} : i16 - ! CHECK-DAG: %[[V_206:[0-9]+]] = arith.addi %[[V_204]], %c1{{.*}} : i16 - ! CHECK: %[[V_207:[0-9]+]] = arith.select %[[V_60]], %[[V_206]], %[[V_205]] : i16 - ! CHECK: %[[V_208:[0-9]+]] = arith.bitcast %[[V_207]] : i16 to f16 - ! CHECK: %[[V_209:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_208]]) <{bit = 516 : i32}> : (f16) -> i1 - ! CHECK: fir.if %[[V_209]] { - ! CHECK: %[[V_211:[0-9]+]] = fir.call @_FortranAMapException(%c40{{.*}}) fastmath<contract> : (i32) -> i32 - ! 
CHECK: fir.call @feraiseexcept(%[[V_211]]) fastmath<contract> : (i32) -> i32 - ! CHECK: } - ! CHECK: %[[V_210:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_208]]) <{bit = 144 : i32}> : (f16) -> i1 - ! CHECK: fir.if %[[V_210]] { - ! CHECK: %[[V_211:[0-9]+]] = fir.call @_FortranAMapException(%c48{{.*}}) fastmath<contract> : (i32) -> i32 - ! CHECK: fir.call @feraiseexcept(%[[V_211]]) fastmath<contract> : (i32) -> i32 - ! CHECK: } - ! CHECK: fir.result %[[V_208]] : f16 - ! CHECK: } - ! CHECK: fir.result %[[V_203]] : f16 - ! CHECK: } - ! CHECK: fir.store %[[V_64]] to %[[V_9]] : !fir.ref<f16> +subroutine test1(r2, x2, x10) + real(2) :: r2, x2 + real(kind10) :: x10 r2 = ieee_next_after(x2, x10) - print "('after: ', z4.4, ' -> ', z4.4, ' = ', g0)", x2, r2, r2 +end subroutine +!CHECK-KIND10-LABEL: func.func @_QMieee_next_testsPtest1( +!CHECK-KIND10: %[[VAL_12:.*]]:2 = hlfir.declare {{.*}}r2" +!CHECK-KIND10: %[[VAL_13:.*]]:2 = hlfir.declare {{.*}}x10" +!CHECK-KIND10: %[[VAL_14:.*]]:2 = hlfir.declare {{.*}}x2" +!CHECK-KIND10: %[[VAL_15:.*]] = fir.load %[[VAL_14]]#0 : !fir.ref<f16> +!CHECK-KIND10: %[[VAL_16:.*]] = fir.load %[[VAL_13]]#0 : !fir.ref<f80> +!CHECK-KIND10-DAG: %[[VAL_17:.*]] = "llvm.intr.is.fpclass"(%[[VAL_16]]) <{bit = 3 : i32}> : (f80) -> i1 +!CHECK-KIND10-DAG: %[[VAL_18:.*]] = arith.constant 2 : i8 +!CHECK-KIND10-DAG: %[[VAL_19:.*]] = fir.address_of(@_FortranAIeeeValueTable_2) : !fir.ref<!fir.array<12xi16>> +!CHECK-KIND10-DAG: %[[VAL_20:.*]] = fir.coordinate_of %[[VAL_19]], %[[VAL_18]] : (!fir.ref<!fir.array<12xi16>>, i8) -> !fir.ref<i16> +!CHECK-KIND10-DAG: %[[VAL_21:.*]] = fir.load %[[VAL_20]] : !fir.ref<i16> +!CHECK-KIND10-DAG: %[[VAL_22:.*]] = arith.bitcast %[[VAL_21]] : i16 to f16 +!CHECK-KIND10: %[[VAL_23:.*]] = arith.select %[[VAL_17]], %[[VAL_22]], %[[VAL_15]] : f16 +!CHECK-KIND10: %[[VAL_24:.*]] = "llvm.intr.is.fpclass"(%[[VAL_23]]) <{bit = 3 : i32}> : (f16) -> i1 +!CHECK-KIND10: %[[VAL_25:.*]] = arith.constant 1 : i16 +!CHECK-KIND10: %[[VAL_26:.*]] = fir.convert %[[VAL_23]] : (f16) -> f32 +!CHECK-KIND10: %[[VAL_27:.*]] = fir.convert %[[VAL_26]] : (f32) -> f80 +!CHECK-KIND10: %[[VAL_28:.*]] = arith.cmpf oeq, %[[VAL_27]], %[[VAL_16]] fastmath<contract> : f80 +!CHECK-KIND10: %[[VAL_29:.*]] = arith.ori %[[VAL_24]], %[[VAL_28]] : i1 +!CHECK-KIND10: %[[VAL_30:.*]] = arith.cmpf olt, %[[VAL_27]], %[[VAL_16]] fastmath<contract> : f80 +!CHECK-KIND10: %[[VAL_31:.*]] = arith.bitcast %[[VAL_15]] : f16 to i16 +!CHECK-KIND10: %[[VAL_32:.*]] = arith.constant 15 : i16 +!CHECK-KIND10: %[[VAL_33:.*]] = arith.shrui %[[VAL_31]], %[[VAL_32]] : i16 +!CHECK-KIND10: %[[VAL_34:.*]] = fir.convert %[[VAL_33]] : (i16) -> i1 +!CHECK-KIND10: %[[VAL_35:.*]] = arith.cmpi ne, %[[VAL_30]], %[[VAL_34]] : i1 +!CHECK-KIND10: %[[VAL_36:.*]] = "llvm.intr.is.fpclass"(%[[VAL_23]]) <{bit = 516 : i32}> : (f16) -> i1 +!CHECK-KIND10: %[[VAL_37:.*]] = arith.andi %[[VAL_36]], %[[VAL_35]] : i1 +!CHECK-KIND10: %[[VAL_38:.*]] = arith.ori %[[VAL_29]], %[[VAL_37]] : i1 +!CHECK-KIND10: %[[VAL_39:.*]] = fir.if %[[VAL_38]] -> (f16) { +!CHECK-KIND10: fir.result %[[VAL_23]] : f16 +!CHECK-KIND10: } else { +!CHECK-KIND10: %[[VAL_40:.*]] = arith.constant 0.000000e+00 : f16 +!CHECK-KIND10: %[[VAL_41:.*]] = arith.cmpf oeq, %[[VAL_23]], %[[VAL_40]] fastmath<contract> : f16 +!CHECK-KIND10: %[[VAL_42:.*]] = fir.if %[[VAL_41]] -> (f16) { +!CHECK-KIND10: %[[VAL_43:.*]] = arith.bitcast %[[VAL_25]] : i16 to f16 +!CHECK-KIND10: %[[VAL_44:.*]] = arith.constant -32767 : i16 +!CHECK-KIND10: %[[VAL_45:.*]] = arith.bitcast %[[VAL_44]] : i16 to f16 
+!CHECK-KIND10: %[[VAL_46:.*]] = arith.select %[[VAL_30]], %[[VAL_43]], %[[VAL_45]] : f16
+!CHECK-KIND10: %[[VAL_47:.*]] = arith.constant 48 : i32
+!CHECK-KIND10: %[[VAL_48:.*]] = fir.call @_FortranAMapException(%[[VAL_47]]) fastmath<contract> : (i32) -> i32
+!CHECK-KIND10: %[[VAL_49:.*]] = fir.call @feraiseexcept(%[[VAL_48]]) fastmath<contract> : (i32) -> i32
+!CHECK-KIND10: fir.result %[[VAL_46]] : f16
+!CHECK-KIND10: } else {
+!CHECK-KIND10: %[[VAL_50:.*]] = arith.bitcast %[[VAL_23]] : f16 to i16
+!CHECK-KIND10-DAG: %[[VAL_51:.*]] = arith.addi %[[VAL_50]], %[[VAL_25]] : i16
+!CHECK-KIND10-DAG: %[[VAL_52:.*]] = arith.subi %[[VAL_50]], %[[VAL_25]] : i16
+!CHECK-KIND10: %[[VAL_53:.*]] = arith.select %[[VAL_35]], %[[VAL_51]], %[[VAL_52]] : i16
+!CHECK-KIND10: %[[VAL_54:.*]] = arith.bitcast %[[VAL_53]] : i16 to f16
+!CHECK-KIND10: %[[VAL_55:.*]] = "llvm.intr.is.fpclass"(%[[VAL_54]]) <{bit = 516 : i32}> : (f16) -> i1
+!CHECK-KIND10: fir.if %[[VAL_55]] {
+!CHECK-KIND10: %[[VAL_56:.*]] = arith.constant 40 : i32
+!CHECK-KIND10: %[[VAL_57:.*]] = fir.call @_FortranAMapException(%[[VAL_56]]) fastmath<contract> : (i32) -> i32
+!CHECK-KIND10: %[[VAL_58:.*]] = fir.call @feraiseexcept(%[[VAL_57]]) fastmath<contract> : (i32) -> i32
+!CHECK-KIND10: }
+!CHECK-KIND10: %[[VAL_59:.*]] = "llvm.intr.is.fpclass"(%[[VAL_54]]) <{bit = 144 : i32}> : (f16) -> i1
+!CHECK-KIND10: fir.if %[[VAL_59]] {
+!CHECK-KIND10: %[[VAL_60:.*]] = arith.constant 48 : i32
+!CHECK-KIND10: %[[VAL_61:.*]] = fir.call @_FortranAMapException(%[[VAL_60]]) fastmath<contract> : (i32) -> i32
+!CHECK-KIND10: %[[VAL_62:.*]] = fir.call @feraiseexcept(%[[VAL_61]]) fastmath<contract> : (i32) -> i32
+!CHECK-KIND10: }
+!CHECK-KIND10: fir.result %[[VAL_54]] : f16
+!CHECK-KIND10: }
+!CHECK-KIND10: fir.result %[[VAL_42]] : f16
+!CHECK-KIND10: }
+!CHECK-KIND10: hlfir.assign %[[VAL_39]] to %[[VAL_12]]#0 : f16, !fir.ref<f16>
+!CHECK-KIND10: return
+!CHECK-KIND10: }
- ! CHECK: %[[V_81:[0-9]+]] = fir.load %[[V_23]] : !fir.ref<bf16>
- ! CHECK: %[[V_82:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_81]]) <{bit = 3 : i32}> : (bf16) -> i1
- ! CHECK: %[[V_83:[0-9]+]] = fir.convert %[[V_81]] : (bf16) -> f32
- ! CHECK: %[[V_84:[0-9]+]] = arith.bitcast %[[V_83]] : f32 to i32
- ! CHECK: %[[V_85:[0-9]+]] = arith.shrui %[[V_84]], %c31{{.*}} : i32
- ! CHECK: %[[V_86:[0-9]+]] = fir.convert %[[V_85]] : (i32) -> i1
- ! CHECK: %[[V_87:[0-9]+]] = arith.cmpi ne, %[[V_86]], %true{{[_0-9]*}} : i1
- ! CHECK: %[[V_88:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_81]]) <{bit = 516 : i32}> : (bf16) -> i1
- ! CHECK: %[[V_89:[0-9]+]] = arith.andi %[[V_88]], %[[V_87]] : i1
- ! CHECK: %[[V_90:[0-9]+]] = arith.ori %[[V_82]], %[[V_89]] : i1
- ! CHECK: %[[V_91:[0-9]+]] = fir.if %[[V_90]] -> (bf16) {
- ! CHECK: %[[V_202:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_81]]) <{bit = 1 : i32}> : (bf16) -> i1
- ! CHECK: fir.if %[[V_202]] {
- ! CHECK: %[[V_203:[0-9]+]] = fir.call @_FortranAMapException(%c1{{.*}}) fastmath<contract> : (i32) -> i32
- ! CHECK: fir.call @feraiseexcept(%[[V_203]]) fastmath<contract> : (i32) -> i32
- ! CHECK: }
- ! CHECK: fir.result %[[V_81]] : bf16
- ! CHECK: } else {
- ! CHECK: %[[V_202:[0-9]+]] = arith.cmpf oeq, %[[V_81]], %cst{{[_0-9]*}} fastmath<contract> : bf16
- ! CHECK: %[[V_203:[0-9]+]] = fir.if %[[V_202]] -> (bf16) {
- ! CHECK: fir.result %cst{{[_0-9]*}} : bf16
- ! CHECK: } else {
- ! CHECK: %[[V_204:[0-9]+]] = arith.bitcast %[[V_81]] : bf16 to i16
- ! CHECK-DAG: %[[V_205:[0-9]+]] = arith.subi %[[V_204]], %c1{{.*}} : i16
- ! CHECK-DAG: %[[V_206:[0-9]+]] = arith.addi %[[V_204]], %c1{{.*}} : i16
- ! CHECK: %[[V_207:[0-9]+]] = arith.select %[[V_87]], %[[V_206]], %[[V_205]] : i16
- ! CHECK: %[[V_208:[0-9]+]] = arith.bitcast %[[V_207]] : i16 to bf16
- ! CHECK: fir.result %[[V_208]] : bf16
- ! CHECK: }
- ! CHECK: fir.result %[[V_203]] : bf16
- ! CHECK: }
- ! CHECK: fir.store %[[V_91]] to %[[V_11]] : !fir.ref<bf16>
+subroutine test2(r3, x3)
+ real(3) :: r3, x3
  r3 = ieee_next_up(x3)
- print "('up: ', z4.4, ' -> ', z4.4, ' = ', g0)", x3, r3, r3
+end subroutine
+!CHECK-LABEL: func.func @_QMieee_next_testsPtest2(
+!CHECK: %[[VAL_11:.*]]:2 = hlfir.declare {{.*}}r3"
+!CHECK: %[[VAL_12:.*]]:2 = hlfir.declare {{.*}}x3"
+!CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<bf16>
+!CHECK: %[[VAL_14:.*]] = "llvm.intr.is.fpclass"(%[[VAL_13]]) <{bit = 3 : i32}> : (bf16) -> i1
+!CHECK: %[[VAL_15:.*]] = arith.constant 1 : i16
+!CHECK: %[[VAL_16:.*]] = arith.constant true
+!CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_13]] : (bf16) -> f32
+!CHECK: %[[VAL_18:.*]] = arith.bitcast %[[VAL_17]] : f32 to i32
+!CHECK: %[[VAL_19:.*]] = arith.constant 31 : i32
+!CHECK: %[[VAL_20:.*]] = arith.shrui %[[VAL_18]], %[[VAL_19]] : i32
+!CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i32) -> i1
+!CHECK: %[[VAL_22:.*]] = arith.cmpi ne, %[[VAL_16]], %[[VAL_21]] : i1
+!CHECK: %[[VAL_23:.*]] = "llvm.intr.is.fpclass"(%[[VAL_13]]) <{bit = 516 : i32}> : (bf16) -> i1
+!CHECK: %[[VAL_24:.*]] = arith.andi %[[VAL_23]], %[[VAL_22]] : i1
+!CHECK: %[[VAL_25:.*]] = arith.ori %[[VAL_14]], %[[VAL_24]] : i1
+!CHECK: %[[VAL_26:.*]] = fir.if %[[VAL_25]] -> (bf16) {
+!CHECK: %[[VAL_27:.*]] = "llvm.intr.is.fpclass"(%[[VAL_13]]) <{bit = 1 : i32}> : (bf16) -> i1
+!CHECK: fir.if %[[VAL_27]] {
+!CHECK: %[[VAL_28:.*]] = arith.constant 1 : i32
+!CHECK: %[[VAL_29:.*]] = fir.call @_FortranAMapException(%[[VAL_28]]) fastmath<contract> : (i32) -> i32
+!CHECK: %[[VAL_30:.*]] = fir.call @feraiseexcept(%[[VAL_29]]) fastmath<contract> : (i32) -> i32
+!CHECK: }
+!CHECK: fir.result %[[VAL_13]] : bf16
+!CHECK: } else {
+!CHECK: %[[VAL_31:.*]] = arith.constant 0.000000e+00 : bf16
+!CHECK: %[[VAL_32:.*]] = arith.cmpf oeq, %[[VAL_13]], %[[VAL_31]] fastmath<contract> : bf16
+!CHECK: %[[VAL_33:.*]] = fir.if %[[VAL_32]] -> (bf16) {
+!CHECK: %[[VAL_34:.*]] = arith.bitcast %[[VAL_15]] : i16 to bf16
+!CHECK: %[[VAL_35:.*]] = arith.constant -32767 : i16
+!CHECK: %[[VAL_36:.*]] = arith.bitcast %[[VAL_35]] : i16 to bf16
+!CHECK: %[[VAL_37:.*]] = arith.select %[[VAL_16]], %[[VAL_34]], %[[VAL_36]] : bf16
+!CHECK: fir.result %[[VAL_37]] : bf16
+!CHECK: } else {
+!CHECK: %[[VAL_38:.*]] = arith.bitcast %[[VAL_13]] : bf16 to i16
+!CHECK-DAG: %[[VAL_39:.*]] = arith.addi %[[VAL_38]], %[[VAL_15]] : i16
+!CHECK-DAG: %[[VAL_40:.*]] = arith.subi %[[VAL_38]], %[[VAL_15]] : i16
+!CHECK: %[[VAL_41:.*]] = arith.select %[[VAL_22]], %[[VAL_39]], %[[VAL_40]] : i16
+!CHECK: %[[VAL_42:.*]] = arith.bitcast %[[VAL_41]] : i16 to bf16
+!CHECK: fir.result %[[VAL_42]] : bf16
+!CHECK: }
+!CHECK: fir.result %[[VAL_33]] : bf16
+!CHECK: }
+!CHECK: hlfir.assign %[[VAL_26]] to %[[VAL_11]]#0 : bf16, !fir.ref<bf16>
+!CHECK: return
+!CHECK: }
- ! CHECK: %[[V_104:[0-9]+]] = fir.load %[[V_25]] : !fir.ref<f32>
- ! CHECK: %[[V_105:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_104]]) <{bit = 3 : i32}> : (f32) -> i1
- ! CHECK: %[[V_106:[0-9]+]] = arith.bitcast %[[V_104]] : f32 to i32
- ! CHECK: %[[V_107:[0-9]+]] = arith.shrui %[[V_106]], %c31{{.*}} : i32
- ! CHECK: %[[V_108:[0-9]+]] = fir.convert %[[V_107]] : (i32) -> i1
- ! CHECK: %[[V_110:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_104]]) <{bit = 516 : i32}> : (f32) -> i1
- ! CHECK: %[[V_111:[0-9]+]] = arith.andi %[[V_110]], %[[V_108]] : i1
- ! CHECK: %[[V_112:[0-9]+]] = arith.ori %[[V_105]], %[[V_111]] : i1
- ! CHECK: %[[V_113:[0-9]+]] = fir.if %[[V_112]] -> (f32) {
- ! CHECK: %[[V_202:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_104]]) <{bit = 1 : i32}> : (f32) -> i1
- ! CHECK: fir.if %[[V_202]] {
- ! CHECK: %[[V_203:[0-9]+]] = fir.call @_FortranAMapException(%c1{{.*}}) fastmath<contract> : (i32) -> i32
- ! CHECK: fir.call @feraiseexcept(%[[V_203]]) fastmath<contract> : (i32) -> i32
- ! CHECK: }
- ! CHECK: fir.result %[[V_104]] : f32
- ! CHECK: } else {
- ! CHECK: %[[V_202:[0-9]+]] = arith.cmpf oeq, %[[V_104]], %cst{{[_0-9]*}} fastmath<contract> : f32
- ! CHECK: %[[V_203:[0-9]+]] = fir.if %[[V_202]] -> (f32) {
- ! CHECK: fir.result %cst{{[_0-9]*}} : f32
- ! CHECK: } else {
- ! CHECK-DAG: %[[V_204:[0-9]+]] = arith.subi %[[V_106]], %c1{{.*}} : i32
- ! CHECK-DAG: %[[V_205:[0-9]+]] = arith.addi %[[V_106]], %c1{{.*}} : i32
- ! CHECK: %[[V_206:[0-9]+]] = arith.select %[[V_108]], %[[V_205]], %[[V_204]] : i32
- ! CHECK: %[[V_207:[0-9]+]] = arith.bitcast %[[V_206]] : i32 to f32
- ! CHECK: fir.result %[[V_207]] : f32
- ! CHECK: }
- ! CHECK: fir.result %[[V_203]] : f32
- ! CHECK: }
- ! CHECK: fir.store %[[V_113]] to %[[V_13]] : !fir.ref<f32>
+subroutine test3(r4, x4)
+ real(4) :: r4, x4
  r4 = ieee_next_down(x4)
- print "('down: ', z8.8, ' -> ', z8.8, ' = ', g0)", x4, r4, r4
+end subroutine
+!CHECK-LABEL: func.func @_QMieee_next_testsPtest3(
+!CHECK: %[[VAL_11:.*]]:2 = hlfir.declare {{.*}}r4"
+!CHECK: %[[VAL_12:.*]]:2 = hlfir.declare {{.*}}x4"
+!CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<f32>
+!CHECK: %[[VAL_14:.*]] = "llvm.intr.is.fpclass"(%[[VAL_13]]) <{bit = 3 : i32}> : (f32) -> i1
+!CHECK: %[[VAL_15:.*]] = arith.constant 1 : i32
+!CHECK: %[[VAL_16:.*]] = arith.constant false
+!CHECK: %[[VAL_17:.*]] = arith.bitcast %[[VAL_13]] : f32 to i32
+!CHECK: %[[VAL_18:.*]] = arith.constant 31 : i32
+!CHECK: %[[VAL_19:.*]] = arith.shrui %[[VAL_17]], %[[VAL_18]] : i32
+!CHECK: %[[VAL_20:.*]] = fir.convert %[[VAL_19]] : (i32) -> i1
+!CHECK: %[[VAL_21:.*]] = arith.cmpi ne, %[[VAL_16]], %[[VAL_20]] : i1
+!CHECK: %[[VAL_22:.*]] = "llvm.intr.is.fpclass"(%[[VAL_13]]) <{bit = 516 : i32}> : (f32) -> i1
+!CHECK: %[[VAL_23:.*]] = arith.andi %[[VAL_22]], %[[VAL_21]] : i1
+!CHECK: %[[VAL_24:.*]] = arith.ori %[[VAL_14]], %[[VAL_23]] : i1
+!CHECK: %[[VAL_25:.*]] = fir.if %[[VAL_24]] -> (f32) {
+!CHECK: %[[VAL_26:.*]] = "llvm.intr.is.fpclass"(%[[VAL_13]]) <{bit = 1 : i32}> : (f32) -> i1
+!CHECK: fir.if %[[VAL_26]] {
+!CHECK: %[[VAL_27:.*]] = arith.constant 1 : i32
+!CHECK: %[[VAL_28:.*]] = fir.call @_FortranAMapException(%[[VAL_27]]) fastmath<contract> : (i32) -> i32
+!CHECK: %[[VAL_29:.*]] = fir.call @feraiseexcept(%[[VAL_28]]) fastmath<contract> : (i32) -> i32
+!CHECK: }
+!CHECK: fir.result %[[VAL_13]] : f32
+!CHECK: } else {
+!CHECK: %[[VAL_30:.*]] = arith.constant 0.000000e+00 : f32
+!CHECK: %[[VAL_31:.*]] = arith.cmpf oeq, %[[VAL_13]], %[[VAL_30]] fastmath<contract> : f32
+!CHECK: %[[VAL_32:.*]] = fir.if %[[VAL_31]] -> (f32) {
+!CHECK: %[[VAL_33:.*]] = arith.bitcast %[[VAL_15]] : i32 to f32
+!CHECK: %[[VAL_34:.*]] = arith.constant -2147483647 : i32
+!CHECK: %[[VAL_35:.*]] = arith.bitcast %[[VAL_34]] : i32 to f32
+!CHECK: %[[VAL_36:.*]] = arith.select %[[VAL_16]], %[[VAL_33]], %[[VAL_35]] : f32
+!CHECK: fir.result %[[VAL_36]] : f32
+!CHECK: } else {
+!CHECK: %[[VAL_37:.*]] = arith.bitcast %[[VAL_13]] : f32 to i32
+!CHECK-DAG: %[[VAL_38:.*]] = arith.addi %[[VAL_37]], %[[VAL_15]] : i32
+!CHECK-DAG: %[[VAL_39:.*]] = arith.subi %[[VAL_37]], %[[VAL_15]] : i32
+!CHECK: %[[VAL_40:.*]] = arith.select %[[VAL_21]], %[[VAL_38]], %[[VAL_39]] : i32
+!CHECK: %[[VAL_41:.*]] = arith.bitcast %[[VAL_40]] : i32 to f32
+!CHECK: fir.result %[[VAL_41]] : f32
+!CHECK: }
+!CHECK: fir.result %[[VAL_32]] : f32
+!CHECK: }
+!CHECK: hlfir.assign %[[VAL_25]] to %[[VAL_11]]#0 : f32, !fir.ref<f32>
+!CHECK: return
+!CHECK: }
- ! CHECK: %[[V_125:[0-9]+]] = fir.load %[[V_27]] : !fir.ref<f64>
- ! CHECK: %[[V_126:[0-9]+]] = fir.load %[[V_21]] : !fir.ref<f16>
- ! CHECK-DAG: %[[V_127:[0-9]+]] = fir.address_of(@_FortranAIeeeValueTable_8) : !fir.ref<!fir.array<12xi64>>
- ! CHECK-DAG: %[[V_128:[0-9]+]] = fir.coordinate_of %[[V_127]], %c2{{.*}} : (!fir.ref<!fir.array<12xi64>>, i8) -> !fir.ref<i64>
- ! CHECK-DAG: %[[V_129:[0-9]+]] = fir.load %[[V_128]] : !fir.ref<i64>
- ! CHECK-DAG: %[[V_130:[0-9]+]] = arith.bitcast %[[V_129]] : i64 to f64
- ! CHECK-DAG: %[[V_131:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_126]]) <{bit = 3 : i32}> : (f16) -> i1
- ! CHECK: %[[V_132:[0-9]+]] = arith.select %[[V_131]], %[[V_130]], %[[V_125]] : f64
- ! CHECK: %[[V_133:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_132]]) <{bit = 3 : i32}> : (f64) -> i1
- ! CHECK: %[[V_134:[0-9]+]] = fir.convert %[[V_126]] : (f16) -> f64
- ! CHECK: %[[V_135:[0-9]+]] = arith.cmpf oeq, %[[V_132]], %[[V_134]] fastmath<contract> : f64
- ! CHECK: %[[V_136:[0-9]+]] = arith.ori %[[V_133]], %[[V_135]] : i1
- ! CHECK: %[[V_137:[0-9]+]] = arith.cmpf olt, %[[V_132]], %[[V_134]] fastmath<contract> : f64
- ! CHECK: %[[V_138:[0-9]+]] = arith.bitcast %[[V_125]] : f64 to i64
- ! CHECK: %[[V_139:[0-9]+]] = arith.shrui %[[V_138]], %c63{{.*}} : i64
- ! CHECK: %[[V_140:[0-9]+]] = fir.convert %[[V_139]] : (i64) -> i1
- ! CHECK: %[[V_141:[0-9]+]] = arith.cmpi ne, %[[V_137]], %[[V_140]] : i1
- ! CHECK: %[[V_142:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_132]]) <{bit = 516 : i32}> : (f64) -> i1
- ! CHECK: %[[V_143:[0-9]+]] = arith.andi %[[V_142]], %[[V_141]] : i1
- ! CHECK: %[[V_144:[0-9]+]] = arith.ori %[[V_136]], %[[V_143]] : i1
- ! CHECK: %[[V_145:[0-9]+]] = fir.if %[[V_144]] -> (f64) {
- ! CHECK: fir.result %[[V_132]] : f64
- ! CHECK: } else {
- ! CHECK: %[[V_202:[0-9]+]] = arith.cmpf oeq, %[[V_132]], %cst{{[_0-9]*}} fastmath<contract> : f64
- ! CHECK: %[[V_203:[0-9]+]] = fir.if %[[V_202]] -> (f64) {
- ! CHECK: %[[V_204:[0-9]+]] = arith.select %[[V_137]], %cst{{[_0-9]*}}, %cst{{[_0-9]*}} : f64
- ! CHECK: %[[V_205:[0-9]+]] = fir.call @_FortranAMapException(%c48{{.*}}) fastmath<contract> : (i32) -> i32
- ! CHECK: fir.call @feraiseexcept(%[[V_205]]) fastmath<contract> : (i32) -> i32
- ! CHECK: fir.result %[[V_204]] : f64
- ! CHECK: } else {
- ! CHECK: %[[V_204:[0-9]+]] = arith.bitcast %[[V_132]] : f64 to i64
- ! CHECK-DAG: %[[V_205:[0-9]+]] = arith.subi %[[V_204]], %c1{{.*}} : i64
- ! CHECK-DAG: %[[V_206:[0-9]+]] = arith.addi %[[V_204]], %c1{{.*}} : i64
- ! CHECK: %[[V_207:[0-9]+]] = arith.select %[[V_141]], %[[V_206]], %[[V_205]] : i64
- ! CHECK: %[[V_208:[0-9]+]] = arith.bitcast %[[V_207]] : i64 to f64
- ! CHECK: %[[V_209:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_208]]) <{bit = 516 : i32}> : (f64) -> i1
- ! CHECK: fir.if %[[V_209]] {
- ! CHECK: %[[V_211:[0-9]+]] = fir.call @_FortranAMapException(%c40{{.*}}) fastmath<contract> : (i32) -> i32
- ! CHECK: fir.call @feraiseexcept(%[[V_211]]) fastmath<contract> : (i32) -> i32
- ! CHECK: }
- ! CHECK: %[[V_210:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_208]]) <{bit = 144 : i32}> : (f64) -> i1
- ! CHECK: fir.if %[[V_210]] {
- ! CHECK: %[[V_211:[0-9]+]] = fir.call @_FortranAMapException(%c48{{.*}}) fastmath<contract> : (i32) -> i32
- ! CHECK: fir.call @feraiseexcept(%[[V_211]]) fastmath<contract> : (i32) -> i32
- ! CHECK: }
- ! CHECK: fir.result %[[V_208]] : f64
- ! CHECK: }
- ! CHECK: fir.result %[[V_203]] : f64
- ! CHECK: }
- ! CHECK: fir.store %[[V_145]] to %[[V_15]] : !fir.ref<f64>
+subroutine test4(r8, x8, x2)
+ real(2) :: x2
+ real(8) :: r8, x8
  r8 = ieee_next_after(x8, x2)
- print "('after: ', z16.16, ' -> ', z16.16, ' = ', g0)", x8, r8, r8
+end subroutine
+!CHECK-LABEL: func.func @_QMieee_next_testsPtest4(
+!CHECK: %[[VAL_12:.*]]:2 = hlfir.declare {{.*}}r8"
+!CHECK: %[[VAL_13:.*]]:2 = hlfir.declare {{.*}}x2"
+!CHECK: %[[VAL_14:.*]]:2 = hlfir.declare {{.*}}x8"
+!CHECK: %[[VAL_15:.*]] = fir.load %[[VAL_14]]#0 : !fir.ref<f64>
+!CHECK: %[[VAL_16:.*]] = fir.load %[[VAL_13]]#0 : !fir.ref<f16>
+!CHECK-DAG: %[[VAL_17:.*]] = "llvm.intr.is.fpclass"(%[[VAL_16]]) <{bit = 3 : i32}> : (f16) -> i1
+!CHECK-DAG: %[[VAL_18:.*]] = arith.constant 2 : i8
+!CHECK-DAG: %[[VAL_19:.*]] = fir.address_of(@_FortranAIeeeValueTable_8) : !fir.ref<!fir.array<12xi64>>
+!CHECK-DAG: %[[VAL_20:.*]] = fir.coordinate_of %[[VAL_19]], %[[VAL_18]] : (!fir.ref<!fir.array<12xi64>>, i8) -> !fir.ref<i64>
+!CHECK-DAG: %[[VAL_21:.*]] = fir.load %[[VAL_20]] : !fir.ref<i64>
+!CHECK-DAG: %[[VAL_22:.*]] = arith.bitcast %[[VAL_21]] : i64 to f64
+!CHECK: %[[VAL_23:.*]] = arith.select %[[VAL_17]], %[[VAL_22]], %[[VAL_15]] : f64
+!CHECK: %[[VAL_24:.*]] = "llvm.intr.is.fpclass"(%[[VAL_23]]) <{bit = 3 : i32}> : (f64) -> i1
+!CHECK: %[[VAL_25:.*]] = arith.constant 1 : i64
+!CHECK: %[[VAL_26:.*]] = fir.convert %[[VAL_16]] : (f16) -> f32
+!CHECK: %[[VAL_27:.*]] = fir.convert %[[VAL_26]] : (f32) -> f64
+!CHECK: %[[VAL_28:.*]] = arith.cmpf oeq, %[[VAL_23]], %[[VAL_27]] fastmath<contract> : f64
+!CHECK: %[[VAL_29:.*]] = arith.ori %[[VAL_24]], %[[VAL_28]] : i1
+!CHECK: %[[VAL_30:.*]] = arith.cmpf olt, %[[VAL_23]], %[[VAL_27]] fastmath<contract> : f64
+!CHECK: %[[VAL_31:.*]] = arith.bitcast %[[VAL_15]] : f64 to i64
+!CHECK: %[[VAL_32:.*]] = arith.constant 63 : i64
+!CHECK: %[[VAL_33:.*]] = arith.shrui %[[VAL_31]], %[[VAL_32]] : i64
+!CHECK: %[[VAL_34:.*]] = fir.convert %[[VAL_33]] : (i64) -> i1
+!CHECK: %[[VAL_35:.*]] = arith.cmpi ne, %[[VAL_30]], %[[VAL_34]] : i1
+!CHECK: %[[VAL_36:.*]] = "llvm.intr.is.fpclass"(%[[VAL_23]]) <{bit = 516 : i32}> : (f64) -> i1
+!CHECK: %[[VAL_37:.*]] = arith.andi %[[VAL_36]], %[[VAL_35]] : i1
+!CHECK: %[[VAL_38:.*]] = arith.ori %[[VAL_29]], %[[VAL_37]] : i1
+!CHECK: %[[VAL_39:.*]] = fir.if %[[VAL_38]] -> (f64) {
+!CHECK: fir.result %[[VAL_23]] : f64
+!CHECK: } else {
+!CHECK: %[[VAL_40:.*]] = arith.constant 0.000000e+00 : f64
+!CHECK: %[[VAL_41:.*]] = arith.cmpf oeq, %[[VAL_23]], %[[VAL_40]] fastmath<contract> : f64
+!CHECK: %[[VAL_42:.*]] = fir.if %[[VAL_41]] -> (f64) {
+!CHECK: %[[VAL_43:.*]] = arith.bitcast %[[VAL_25]] : i64 to f64
+!CHECK: %[[VAL_44:.*]] = arith.constant -9223372036854775807 : i64
+!CHECK: %[[VAL_45:.*]] = arith.bitcast %[[VAL_44]] : i64 to f64
+!CHECK: %[[VAL_46:.*]] = arith.select %[[VAL_30]], %[[VAL_43]], %[[VAL_45]] : f64
+!CHECK: %[[VAL_47:.*]] = arith.constant 48 : i32
+!CHECK: %[[VAL_48:.*]] = fir.call @_FortranAMapException(%[[VAL_47]]) fastmath<contract> : (i32) -> i32
+!CHECK: %[[VAL_49:.*]] = fir.call @feraiseexcept(%[[VAL_48]]) fastmath<contract> : (i32) -> i32
+!CHECK: fir.result %[[VAL_46]] : f64
+!CHECK: } else {
+!CHECK: %[[VAL_50:.*]] = arith.bitcast %[[VAL_23]] : f64 to i64
+!CHECK-DAG: %[[VAL_51:.*]] = arith.addi %[[VAL_50]], %[[VAL_25]] : i64
+!CHECK-DAG: %[[VAL_52:.*]] = arith.subi %[[VAL_50]], %[[VAL_25]] : i64
+!CHECK: %[[VAL_53:.*]] = arith.select %[[VAL_35]], %[[VAL_51]], %[[VAL_52]] : i64
+!CHECK: %[[VAL_54:.*]] = arith.bitcast %[[VAL_53]] : i64 to f64
+!CHECK: %[[VAL_55:.*]] = "llvm.intr.is.fpclass"(%[[VAL_54]]) <{bit = 516 : i32}> : (f64) -> i1
+!CHECK: fir.if %[[VAL_55]] {
+!CHECK: %[[VAL_56:.*]] = arith.constant 40 : i32
+!CHECK: %[[VAL_57:.*]] = fir.call @_FortranAMapException(%[[VAL_56]]) fastmath<contract> : (i32) -> i32
+!CHECK: %[[VAL_58:.*]] = fir.call @feraiseexcept(%[[VAL_57]]) fastmath<contract> : (i32) -> i32
+!CHECK: }
+!CHECK: %[[VAL_59:.*]] = "llvm.intr.is.fpclass"(%[[VAL_54]]) <{bit = 144 : i32}> : (f64) -> i1
+!CHECK: fir.if %[[VAL_59]] {
+!CHECK: %[[VAL_60:.*]] = arith.constant 48 : i32
+!CHECK: %[[VAL_61:.*]] = fir.call @_FortranAMapException(%[[VAL_60]]) fastmath<contract> : (i32) -> i32
+!CHECK: %[[VAL_62:.*]] = fir.call @feraiseexcept(%[[VAL_61]]) fastmath<contract> : (i32) -> i32
+!CHECK: }
+!CHECK: fir.result %[[VAL_54]] : f64
+!CHECK: }
+!CHECK: fir.result %[[VAL_42]] : f64
+!CHECK: }
+!CHECK: hlfir.assign %[[VAL_39]] to %[[VAL_12]]#0 : f64, !fir.ref<f64>
+!CHECK: return
+!CHECK: }
- ! CHECK: %[[V_158:[0-9]+]] = fir.load %[[V_17]] : !fir.ref<f80>
- ! CHECK: %[[V_159:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_158]]) <{bit = 3 : i32}> : (f80) -> i1
- ! CHECK: %[[V_160:[0-9]+]] = arith.bitcast %[[V_158]] : f80 to i80
- ! CHECK: %[[V_161:[0-9]+]] = arith.shrui %[[V_160]], %c79{{.*}} : i80
- ! CHECK: %[[V_162:[0-9]+]] = fir.convert %[[V_161]] : (i80) -> i1
- ! CHECK: %[[V_163:[0-9]+]] = arith.cmpi ne, %[[V_162]], %true{{[_0-9]*}} : i1
- ! CHECK: %[[V_164:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_158]]) <{bit = 516 : i32}> : (f80) -> i1
- ! CHECK: %[[V_165:[0-9]+]] = arith.andi %[[V_164]], %[[V_163]] : i1
- ! CHECK: %[[V_166:[0-9]+]] = arith.ori %[[V_159]], %[[V_165]] : i1
- ! CHECK: %[[V_167:[0-9]+]] = fir.if %[[V_166]] -> (f80) {
- ! CHECK: %[[V_202:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_158]]) <{bit = 1 : i32}> : (f80) -> i1
- ! CHECK: fir.if %[[V_202]] {
- ! CHECK: %[[V_203:[0-9]+]] = fir.call @_FortranAMapException(%c1{{.*}}) fastmath<contract> : (i32) -> i32
- ! CHECK: fir.call @feraiseexcept(%[[V_203]]) fastmath<contract> : (i32) -> i32
- ! CHECK: }
- ! CHECK: fir.result %[[V_158]] : f80
- ! CHECK: } else {
- ! CHECK: %[[V_202:[0-9]+]] = arith.cmpf oeq, %[[V_158]], %cst{{[_0-9]*}} fastmath<contract> : f80
- ! CHECK: %[[V_203:[0-9]+]] = fir.if %[[V_202]] -> (f80) {
- ! CHECK: fir.result %cst{{[_0-9]*}} : f80
- ! CHECK: } else {
- ! CHECK: %[[V_204:[0-9]+]] = fir.call @_FortranAMapException(%c63{{.*}}) fastmath<contract> : (i32) -> i32
- ! CHECK: %[[V_205:[0-9]+]] = fir.call @fetestexcept(%[[V_204]]) fastmath<contract> : (i32) -> i32
- ! CHECK: %[[V_206:[0-9]+]] = fir.call @fedisableexcept(%[[V_204]]) fastmath<contract> : (i32) -> i32
- ! CHECK: %[[V_207:[0-9]+]] = fir.call @_FortranANearest10(%[[V_158]], %true{{[_0-9]*}}) fastmath<contract> : (f80, i1) -> f80
- ! CHECK: %[[V_208:[0-9]+]] = fir.call @feclearexcept(%[[V_204]]) fastmath<contract> : (i32) -> i32
- ! CHECK: %[[V_209:[0-9]+]] = fir.call @feraiseexcept(%[[V_205]]) fastmath<contract> : (i32) -> i32
- ! CHECK: %[[V_210:[0-9]+]] = fir.call @feenableexcept(%[[V_206]]) fastmath<contract> : (i32) -> i32
- ! CHECK: fir.result %[[V_207]] : f80
- ! CHECK: }
- ! CHECK: fir.result %[[V_203]] : f80
- ! CHECK: }
- ! CHECK: fir.store %[[V_167]] to %[[V_5]] : !fir.ref<f80>
+subroutine test5(r10, x10)
+ real(kind10) :: x10, r10
  r10 = ieee_next_up(x10)
- print "('up: ', z20.20, ' -> ', z20.20, ' = ', g0)", x10, r10, r10
-
- ! CHECK: %[[V_180:[0-9]+]] = fir.load %[[V_19]] : !fir.ref<f128>
- ! CHECK: %[[V_181:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_180]]) <{bit = 3 : i32}> : (f128) -> i1
- ! CHECK: %[[V_182:[0-9]+]] = arith.bitcast %[[V_180]] : f128 to i128
- ! CHECK: %[[V_183:[0-9]+]] = arith.shrui %[[V_182]], %c127{{.*}} : i128
- ! CHECK: %[[V_184:[0-9]+]] = fir.convert %[[V_183]] : (i128) -> i1
- ! CHECK: %[[V_186:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_180]]) <{bit = 516 : i32}> : (f128) -> i1
- ! CHECK: %[[V_187:[0-9]+]] = arith.andi %[[V_186]], %[[V_184]] : i1
- ! CHECK: %[[V_188:[0-9]+]] = arith.ori %[[V_181]], %[[V_187]] : i1
- ! CHECK: %[[V_189:[0-9]+]] = fir.if %[[V_188]] -> (f128) {
- ! CHECK: %[[V_202:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_180]]) <{bit = 1 : i32}> : (f128) -> i1
- ! CHECK: fir.if %[[V_202]] {
- ! CHECK: %[[V_203:[0-9]+]] = fir.call @_FortranAMapException(%c1{{.*}}) fastmath<contract> : (i32) -> i32
- ! CHECK: fir.call @feraiseexcept(%[[V_203]]) fastmath<contract> : (i32) -> i32
- ! CHECK: }
- ! CHECK: fir.result %[[V_180]] : f128
- ! CHECK: } else {
- ! CHECK: %[[V_202:[0-9]+]] = arith.cmpf oeq, %[[V_180]], %cst{{[_0-9]*}} fastmath<contract> : f128
- ! CHECK: %[[V_203:[0-9]+]] = fir.if %[[V_202]] -> (f128) {
- ! CHECK: fir.result %cst{{[_0-9]*}} : f128
- ! CHECK: } else {
- ! CHECK-DAG: %[[V_204:[0-9]+]] = arith.subi %[[V_182]], %c1{{.*}} : i128
- ! CHECK-DAG: %[[V_205:[0-9]+]] = arith.addi %[[V_182]], %c1{{.*}} : i128
- ! CHECK: %[[V_206:[0-9]+]] = arith.select %[[V_184]], %[[V_205]], %[[V_204]] : i128
- ! CHECK: %[[V_207:[0-9]+]] = arith.bitcast %[[V_206]] : i128 to f128
- ! CHECK: fir.result %[[V_207]] : f128
- ! CHECK: }
- ! CHECK: fir.result %[[V_203]] : f128
- ! CHECK: }
- ! CHECK: fir.store %[[V_189]] to %[[V_7]] : !fir.ref<f128>
+end subroutine
+!CHECK-KIND10-LABEL: func.func @_QMieee_next_testsPtest5(
+!CHECK-KIND10: %[[VAL_11:.*]]:2 = hlfir.declare {{.*}}r10"
+!CHECK-KIND10: %[[VAL_12:.*]]:2 = hlfir.declare {{.*}}x10"
+!CHECK-KIND10: %[[VAL_13:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<f80>
+!CHECK-KIND10: %[[VAL_14:.*]] = "llvm.intr.is.fpclass"(%[[VAL_13]]) <{bit = 3 : i32}> : (f80) -> i1
+!CHECK-KIND10: %[[VAL_15:.*]] = arith.constant 1 : i80
+!CHECK-KIND10: %[[VAL_16:.*]] = arith.constant true
+!CHECK-KIND10: %[[VAL_17:.*]] = arith.bitcast %[[VAL_13]] : f80 to i80
+!CHECK-KIND10: %[[VAL_18:.*]] = arith.constant 79 : i80
+!CHECK-KIND10: %[[VAL_19:.*]] = arith.shrui %[[VAL_17]], %[[VAL_18]] : i80
+!CHECK-KIND10: %[[VAL_20:.*]] = fir.convert %[[VAL_19]] : (i80) -> i1
+!CHECK-KIND10: %[[VAL_21:.*]] = arith.cmpi ne, %[[VAL_16]], %[[VAL_20]] : i1
+!CHECK-KIND10: %[[VAL_22:.*]] = "llvm.intr.is.fpclass"(%[[VAL_13]]) <{bit = 516 : i32}> : (f80) -> i1
+!CHECK-KIND10: %[[VAL_23:.*]] = arith.andi %[[VAL_22]], %[[VAL_21]] : i1
+!CHECK-KIND10: %[[VAL_24:.*]] = arith.ori %[[VAL_14]], %[[VAL_23]] : i1
+!CHECK-KIND10: %[[VAL_25:.*]] = fir.if %[[VAL_24]] -> (f80) {
+!CHECK-KIND10: %[[VAL_26:.*]] = "llvm.intr.is.fpclass"(%[[VAL_13]]) <{bit = 1 : i32}> : (f80) -> i1
+!CHECK-KIND10: fir.if %[[VAL_26]] {
+!CHECK-KIND10: %[[VAL_27:.*]] = arith.constant 1 : i32
+!CHECK-KIND10: %[[VAL_28:.*]] = fir.call @_FortranAMapException(%[[VAL_27]]) fastmath<contract> : (i32) -> i32
+!CHECK-KIND10: %[[VAL_29:.*]] = fir.call @feraiseexcept(%[[VAL_28]]) fastmath<contract> : (i32) -> i32
+!CHECK-KIND10: }
+!CHECK-KIND10: fir.result %[[VAL_13]] : f80
+!CHECK-KIND10: } else {
+!CHECK-KIND10: %[[VAL_30:.*]] = arith.constant 0.000000e+00 : f80
+!CHECK-KIND10: %[[VAL_31:.*]] = arith.cmpf oeq, %[[VAL_13]], %[[VAL_30]] fastmath<contract> : f80
+!CHECK-KIND10: %[[VAL_32:.*]] = fir.if %[[VAL_31]] -> (f80) {
+!CHECK-KIND10: %[[VAL_33:.*]] = arith.bitcast %[[VAL_15]] : i80 to f80
+!CHECK-KIND10: %[[VAL_34:.*]] = arith.constant -604462909807314587353087 : i80
+!CHECK-KIND10: %[[VAL_35:.*]] = arith.bitcast %[[VAL_34]] : i80 to f80
+!CHECK-KIND10: %[[VAL_36:.*]] = arith.select %[[VAL_16]], %[[VAL_33]], %[[VAL_35]] : f80
+!CHECK-KIND10: fir.result %[[VAL_36]] : f80
+!CHECK-KIND10: } else {
+!CHECK-KIND10: %[[VAL_37:.*]] = arith.constant 63 : i32
+!CHECK-KIND10: %[[VAL_38:.*]] = fir.call @_FortranAMapException(%[[VAL_37]]) fastmath<contract> : (i32) -> i32
+!CHECK-KIND10: %[[VAL_39:.*]] = fir.call @fetestexcept(%[[VAL_38]]) fastmath<contract> : (i32) -> i32
+!CHECK-KIND10: %[[VAL_40:.*]] = fir.call @fedisableexcept(%[[VAL_38]]) fastmath<contract> : (i32) -> i32
+!CHECK-KIND10: %[[VAL_41:.*]] = fir.call @_FortranANearest10(%[[VAL_13]], %[[VAL_16]]) fastmath<contract> : (f80, i1) -> f80
+!CHECK-KIND10: %[[VAL_42:.*]] = fir.call @feclearexcept(%[[VAL_38]]) fastmath<contract> : (i32) -> i32
+!CHECK-KIND10: %[[VAL_43:.*]] = fir.call @feraiseexcept(%[[VAL_39]]) fastmath<contract> : (i32) -> i32
+!CHECK-KIND10: %[[VAL_44:.*]] = fir.call @feenableexcept(%[[VAL_40]]) fastmath<contract> : (i32) -> i32
+!CHECK-KIND10: fir.result %[[VAL_41]] : f80
+!CHECK-KIND10: }
+!CHECK-KIND10: fir.result %[[VAL_32]] : f80
+!CHECK-KIND10: }
+!CHECK-KIND10: hlfir.assign %[[VAL_25]] to %[[VAL_11]]#0 : f80, !fir.ref<f80>
+!CHECK-KIND10: return
+!CHECK-KIND10: }
+subroutine test6(r16, x16)
+ real(kind16) :: r16, x16
  r16 = ieee_next_down(x16)
+end subroutine
+!CHECK-KIND16-LABEL: func.func @_QMieee_next_testsPtest6(
+!CHECK-KIND16: %[[VAL_11:.*]]:2 = hlfir.declare {{.*}}r16"
+!CHECK-KIND16: %[[VAL_12:.*]]:2 = hlfir.declare {{.*}}x16"
+!CHECK-KIND16: %[[VAL_13:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<f128>
+!CHECK-KIND16: %[[VAL_14:.*]] = "llvm.intr.is.fpclass"(%[[VAL_13]]) <{bit = 3 : i32}> : (f128) -> i1
+!CHECK-KIND16: %[[VAL_15:.*]] = arith.constant 1 : i128
+!CHECK-KIND16: %[[VAL_16:.*]] = arith.constant false
+!CHECK-KIND16: %[[VAL_17:.*]] = arith.bitcast %[[VAL_13]] : f128 to i128
+!CHECK-KIND16: %[[VAL_18:.*]] = arith.constant 127 : i128
+!CHECK-KIND16: %[[VAL_19:.*]] = arith.shrui %[[VAL_17]], %[[VAL_18]] : i128
+!CHECK-KIND16: %[[VAL_20:.*]] = fir.convert %[[VAL_19]] : (i128) -> i1
+!CHECK-KIND16: %[[VAL_21:.*]] = arith.cmpi ne, %[[VAL_16]], %[[VAL_20]] : i1
+!CHECK-KIND16: %[[VAL_22:.*]] = "llvm.intr.is.fpclass"(%[[VAL_13]]) <{bit = 516 : i32}> : (f128) -> i1
+!CHECK-KIND16: %[[VAL_23:.*]] = arith.andi %[[VAL_22]], %[[VAL_21]] : i1
+!CHECK-KIND16: %[[VAL_24:.*]] = arith.ori %[[VAL_14]], %[[VAL_23]] : i1
+!CHECK-KIND16: %[[VAL_25:.*]] = fir.if %[[VAL_24]] -> (f128) {
+!CHECK-KIND16: %[[VAL_26:.*]] = "llvm.intr.is.fpclass"(%[[VAL_13]]) <{bit = 1 : i32}> : (f128) -> i1
+!CHECK-KIND16: fir.if %[[VAL_26]] {
+!CHECK-KIND16: %[[VAL_27:.*]] = arith.constant 1 : i32
+!CHECK-KIND16: %[[VAL_28:.*]] = fir.call @_FortranAMapException(%[[VAL_27]]) fastmath<contract> : (i32) -> i32
+!CHECK-KIND16: %[[VAL_29:.*]] = fir.call @feraiseexcept(%[[VAL_28]]) fastmath<contract> : (i32) -> i32
+!CHECK-KIND16: }
+!CHECK-KIND16: fir.result %[[VAL_13]] : f128
+!CHECK-KIND16: } else {
+!CHECK-KIND16: %[[VAL_30:.*]] = arith.constant 0.000000e+00 : f128
+!CHECK-KIND16: %[[VAL_31:.*]] = arith.cmpf oeq, %[[VAL_13]], %[[VAL_30]] fastmath<contract> : f128
+!CHECK-KIND16: %[[VAL_32:.*]] = fir.if %[[VAL_31]] -> (f128) {
+!CHECK-KIND16: %[[VAL_33:.*]] = arith.bitcast %[[VAL_15]] : i128 to f128
+!CHECK-KIND16: %[[VAL_34:.*]] = arith.constant -170141183460469231731687303715884105727 : i128
+!CHECK-KIND16: %[[VAL_35:.*]] = arith.bitcast %[[VAL_34]] : i128 to f128
+!CHECK-KIND16: %[[VAL_36:.*]] = arith.select %[[VAL_16]], %[[VAL_33]], %[[VAL_35]] : f128
+!CHECK-KIND16: fir.result %[[VAL_36]] : f128
+!CHECK-KIND16: } else {
+!CHECK-KIND16: %[[VAL_37:.*]] = arith.bitcast %[[VAL_13]] : f128 to i128
+!CHECK-KIND16-DAG: %[[VAL_38:.*]] = arith.addi %[[VAL_37]], %[[VAL_15]] : i128
+!CHECK-KIND16-DAG: %[[VAL_39:.*]] = arith.subi %[[VAL_37]], %[[VAL_15]] : i128
+!CHECK-KIND16: %[[VAL_40:.*]] = arith.select %[[VAL_21]], %[[VAL_38]], %[[VAL_39]] : i128
+!CHECK-KIND16: %[[VAL_41:.*]] = arith.bitcast %[[VAL_40]] : i128 to f128
+!CHECK-KIND16: fir.result %[[VAL_41]] : f128
+!CHECK-KIND16: }
+!CHECK-KIND16: fir.result %[[VAL_32]] : f128
+!CHECK-KIND16: }
+!CHECK-KIND16: hlfir.assign %[[VAL_25]] to %[[VAL_11]]#0 : f128, !fir.ref<f128>
+!CHECK-KIND16: return
+!CHECK-KIND16: }
+end module
+
+! Expected end-to-end output when both kind10 and kind16 are enabled (not part of the lit
+! test; only provided for debug help):
+!
+! after: FC00 -> FBFF = -.655E+5
+! up: FF7F -> FF7E = -.337E+39
+! down: 80000000 -> 80000001 = -.1E-44
+! after: 0000000000000000 -> 8000000000000001 = -.5E-323
+! up: 7FFEFFFFFFFFFFFFFFFF -> 7FFF8000000000000000 = Inf
+! down: 7FFF0000000000000000000000000000 -> 7FFEFFFFFFFFFFFFFFFFFFFFFFFFFFFF = .1189731495357231765085759326628007E+4933
+program p
+ use ieee_next_tests
+ real(2) :: r2, x2
+ real(3) :: r3, x3 = -huge(x3)
+ real(4) :: r4, x4 = -0.
+ real(8) :: r8, x8 = 0.
+ real(kind10) :: r10, x10 = huge(x10)
+ real(kind16) :: r16, x16
+
+ x2 = ieee_value(x2, ieee_negative_inf)
+ x16 = ieee_value(x2, ieee_positive_inf)
+ call test1(r2, x2, x10)
+ print "('after: ', z4.4, ' -> ', z4.4, ' = ', g0)", x2, r2, r2
+ call test2(r3, x3)
+ print "('up: ', z4.4, ' -> ', z4.4, ' = ', g0)", x3, r3, r3
+ call test3(r4, x4)
+ print "('down: ', z8.8, ' -> ', z8.8, ' = ', g0)", x4, r4, r4
+ call test4(r8, x8, x2)
+ print "('after: ', z16.16, ' -> ', z16.16, ' = ', g0)", x8, r8, r8
+ call test5(r10, x10)
+ print "('up: ', z20.20, ' -> ', z20.20, ' = ', g0)", x10, r10, r10
+ call test6(r16, x16)
  print "('down: ', z32.32, ' -> ', z32.32, ' = ', g0)", x16, r16, r16
  end
diff --git flang/test/Lower/Intrinsics/isnan.f90 flang/test/Lower/Intrinsics/isnan.f90
index 62b98c8ea98b..6535724b2ce3 100644
--- flang/test/Lower/Intrinsics/isnan.f90
+++ flang/test/Lower/Intrinsics/isnan.f90
@@ -1,5 +1,4 @@
-! RUN: bbc -emit-fir %s -o - | FileCheck %s
-! RUN: flang -fc1 -emit-fir %s -o - | FileCheck %s
+! RUN: bbc -emit-fir %s -o - | FileCheck %s --check-prefixes=CHECK%if target=x86_64{{.*}} %{,CHECK-KIND10%}%if flang-supports-f128-math %{,CHECK-KIND16%}
 
 ! CHECK-LABEL: isnan_f32
 subroutine isnan_f32(r)
@@ -35,36 +34,40 @@ subroutine ieee_is_nan_f64(r)
 ! CHECK: fir.convert %[[l]] : (i1) -> !fir.logical<4>
 end subroutine ieee_is_nan_f64
 
-! CHECK-LABEL: isnan_f80
+! CHECK-KIND10-LABEL: isnan_f80
 subroutine isnan_f80(r)
- real(KIND=10) :: r
+ integer, parameter :: kind10 = merge(10, 4, selected_real_kind(p=18).eq.10)
+ real(KIND=kind10) :: r
  i = isnan(r)
- ! CHECK: %[[l:.*]] = "llvm.intr.is.fpclass"(%{{.*}}) <{bit = 3 : i32}> : (f80) -> i1
- ! CHECK: fir.convert %[[l]] : (i1) -> !fir.logical<4>
+ ! CHECK-KIND10: %[[l:.*]] = "llvm.intr.is.fpclass"(%{{.*}}) <{bit = 3 : i32}> : (f80) -> i1
+ ! CHECK-KIND10: fir.convert %[[l]] : (i1) -> !fir.logical<4>
 end subroutine isnan_f80
 
-! CHECK-LABEL: ieee_is_nan_f80
+! CHECK-KIND10-LABEL: ieee_is_nan_f80
 subroutine ieee_is_nan_f80(r)
  use ieee_arithmetic
- real(KIND=10) :: r
+ integer, parameter :: kind10 = merge(10, 4, selected_real_kind(p=18).eq.10)
+ real(KIND=kind10) :: r
  i = ieee_is_nan(r)
- ! CHECK: %[[l:.*]] = "llvm.intr.is.fpclass"(%{{.*}}) <{bit = 3 : i32}> : (f80) -> i1
- ! CHECK: fir.convert %[[l]] : (i1) -> !fir.logical<4>
+ ! CHECK-KIND10: %[[l:.*]] = "llvm.intr.is.fpclass"(%{{.*}}) <{bit = 3 : i32}> : (f80) -> i1
+ ! CHECK-KIND10: fir.convert %[[l]] : (i1) -> !fir.logical<4>
 end subroutine ieee_is_nan_f80
 
-! CHECK-LABEL: isnan_f128
+! CHECK-KIND16-LABEL: isnan_f128
 subroutine isnan_f128(r)
- real(KIND=16) :: r
+ integer, parameter :: kind16 = merge(16, 4, selected_real_kind(p=33).eq.16)
+ real(KIND=kind16) :: r
  i = isnan(r)
- ! CHECK: %[[l:.*]] = "llvm.intr.is.fpclass"(%{{.*}}) <{bit = 3 : i32}> : (f128) -> i1
- ! CHECK: fir.convert %[[l]] : (i1) -> !fir.logical<4>
+ ! CHECK-KIND16: %[[l:.*]] = "llvm.intr.is.fpclass"(%{{.*}}) <{bit = 3 : i32}> : (f128) -> i1
+ ! CHECK-KIND16: fir.convert %[[l]] : (i1) -> !fir.logical<4>
 end subroutine isnan_f128
 
-! CHECK-LABEL: ieee_is_nan_f128
+! CHECK-KIND16-LABEL: ieee_is_nan_f128
 subroutine ieee_is_nan_f128(r)
  use ieee_arithmetic
- real(KIND=16) :: r
+ integer, parameter :: kind16 = merge(16, 4, selected_real_kind(p=33).eq.16)
+ real(KIND=kind16) :: r
  i = ieee_is_nan(r)
- ! CHECK: %[[l:.*]] = "llvm.intr.is.fpclass"(%{{.*}}) <{bit = 3 : i32}> : (f128) -> i1
- ! CHECK: fir.convert %[[l]] : (i1) -> !fir.logical<4>
+ ! CHECK-KIND16: %[[l:.*]] = "llvm.intr.is.fpclass"(%{{.*}}) <{bit = 3 : i32}> : (f128) -> i1
+ ! CHECK-KIND16: fir.convert %[[l]] : (i1) -> !fir.logical<4>
 end subroutine ieee_is_nan_f128
diff --git flang/test/Lower/Intrinsics/mod.f90 flang/test/Lower/Intrinsics/mod.f90
index 3f5385ac303a..5bc81d923b80 100644
--- flang/test/Lower/Intrinsics/mod.f90
+++ flang/test/Lower/Intrinsics/mod.f90
@@ -1,54 +1,38 @@
-! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir %s -o - | FileCheck %s
+! RUN: bbc -emit-fir %s -o - | FileCheck %s --check-prefixes=CHECK%if target=x86_64{{.*}} %{,CHECK-KIND10%}%if flang-supports-f128-math %{,CHECK-KIND16%}
 
 ! CHECK-LABEL: func @_QPmod_testr4(
-! CHECK-SAME: %[[arg0:.*]]: !fir.ref<f32>{{.*}}, %[[arg1:.*]]: !fir.ref<f32>{{.*}}, %[[arg2:.*]]: !fir.ref<f32>{{.*}}) {
 subroutine mod_testr4(r, a, p)
  real(4) :: r, a, p
-! CHECK: %[[V1:.*]] = fir.load %[[arg1]] : !fir.ref<f32>
-! CHECK: %[[V2:.*]] = fir.load %[[arg2]] : !fir.ref<f32>
-! CHECK: %[[FILE:.*]] = fir.address_of(@{{.*}}) : !fir.ref<!fir.char<1,{{.*}}>>
 ! CHECK: %[[LINE:.*]] = arith.constant {{[0-9]*}} : i32
+! CHECK: %[[A:.*]] = fir.declare{{.*}}a"
+! CHECK: %[[P:.*]] = fir.declare{{.*}}p"
+! CHECK: %[[A_LOAD:.*]] = fir.load %[[A]]
+! CHECK: %[[P_LOAD:.*]] = fir.load %[[P]]
+! CHECK: %[[FILE:.*]] = fir.address_of(@{{.*}}) : !fir.ref<!fir.char<1,{{.*}}>>
 ! CHECK: %[[FILEARG:.*]] = fir.convert %[[FILE]] : (!fir.ref<!fir.char<1,{{.*}}>>) -> !fir.ref<i8>
-! CHECK: fir.call @_FortranAModReal4(%[[V1]], %[[V2]], %[[FILEARG]], %[[LINE]]) {{.*}}: (f32, f32, !fir.ref<i8>, i32) -> f32
+! CHECK: fir.call @_FortranAModReal4(%[[A_LOAD]], %[[P_LOAD]], %[[FILEARG]], %[[LINE]]) {{.*}}: (f32, f32, !fir.ref<i8>, i32) -> f32
  r = mod(a, p)
 end subroutine
 
! CHECK-LABEL: func @_QPmod_testr8(
-! CHECK-SAME: %[[arg0:.*]]: !fir.ref<f64>{{.*}}, %[[arg1:.*]]: !fir.ref<f64>{{.*}}, %[[arg2:.*]]: !fir.ref<f64>{{.*}}) {
 subroutine mod_testr8(r, a, p)
  real(8) :: r, a, p
-! CHECK: %[[V1:.*]] = fir.load %[[arg1]] : !fir.ref<f64>
-! CHECK: %[[V2:.*]] = fir.load %[[arg2]] : !fir.ref<f64>
-! CHECK: %[[FILE:.*]] = fir.address_of(@{{.*}}) : !fir.ref<!fir.char<1,{{.*}}>>
-! CHECK: %[[LINE:.*]] = arith.constant {{[0-9]*}} : i32
-! CHECK: %[[FILEARG:.*]] = fir.convert %[[FILE]] : (!fir.ref<!fir.char<1,{{.*}}>>) -> !fir.ref<i8>
-! CHECK: fir.call @_FortranAModReal8(%[[V1]], %[[V2]], %[[FILEARG]], %[[LINE]]) {{.*}}: (f64, f64, !fir.ref<i8>, i32) -> f64
+! CHECK: fir.call @_FortranAModReal8(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (f64, f64, !fir.ref<i8>, i32) -> f64
  r = mod(a, p)
 end subroutine
 
-! CHECK-LABEL: func @_QPmod_testr10(
-! CHECK-SAME: %[[arg0:.*]]: !fir.ref<f80>{{.*}}, %[[arg1:.*]]: !fir.ref<f80>{{.*}}, %[[arg2:.*]]: !fir.ref<f80>{{.*}}) {
+! CHECK-KIND10-LABEL: func @_QPmod_testr10(
 subroutine mod_testr10(r, a, p)
- real(10) :: r, a, p
-! CHECK: %[[V1:.*]] = fir.load %[[arg1]] : !fir.ref<f80>
-! CHECK: %[[V2:.*]] = fir.load %[[arg2]] : !fir.ref<f80>
-! CHECK: %[[FILE:.*]] = fir.address_of(@{{.*}}) : !fir.ref<!fir.char<1,{{.*}}>>
-! CHECK: %[[LINE:.*]] = arith.constant {{[0-9]*}} : i32
-! CHECK: %[[FILEARG:.*]] = fir.convert %[[FILE]] : (!fir.ref<!fir.char<1,{{.*}}>>) -> !fir.ref<i8>
-! CHECK: fir.call @_FortranAModReal10(%[[V1]], %[[V2]], %[[FILEARG]], %[[LINE]]) {{.*}}: (f80, f80, !fir.ref<i8>, i32) -> f80
+ integer, parameter :: kind10 = merge(10, 4, selected_real_kind(p=18).eq.10)
+ real(kind10) :: r, a, p
+! CHECK-KIND10: fir.call @_FortranAModReal10(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (f80, f80, !fir.ref<i8>, i32) -> f80
  r = mod(a, p)
 end subroutine
 
-! CHECK-LABEL: func @_QPmod_testr16(
-! CHECK-SAME: %[[arg0:.*]]: !fir.ref<f128>{{.*}}, %[[arg1:.*]]: !fir.ref<f128>{{.*}}, %[[arg2:.*]]: !fir.ref<f128>{{.*}}) {
+! CHECK-KIND16-LABEL: func @_QPmod_testr16(
 subroutine mod_testr16(r, a, p)
- real(16) :: r, a, p
-! CHECK: %[[V1:.*]] = fir.load %[[arg1]] : !fir.ref<f128>
-! CHECK: %[[V2:.*]] = fir.load %[[arg2]] : !fir.ref<f128>
-! CHECK: %[[FILE:.*]] = fir.address_of(@{{.*}}) : !fir.ref<!fir.char<1,{{.*}}>>
-! CHECK: %[[LINE:.*]] = arith.constant {{[0-9]*}} : i32
-! CHECK: %[[FILEARG:.*]] = fir.convert %[[FILE]] : (!fir.ref<!fir.char<1,{{.*}}>>) -> !fir.ref<i8>
-! CHECK: fir.call @_FortranAModReal16(%[[V1]], %[[V2]], %[[FILEARG]], %[[LINE]]) {{.*}}: (f128, f128, !fir.ref<i8>, i32) -> f128
+ integer, parameter :: kind16 = merge(16, 4, selected_real_kind(p=33).eq.16)
+ real(kind16) :: r, a, p
+! CHECK-KIND16: fir.call @_FortranAModReal16(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (f128, f128, !fir.ref<i8>, i32) -> f128
  r = mod(a, p)
 end subroutine
diff --git flang/test/Lower/Intrinsics/modulo.f90 flang/test/Lower/Intrinsics/modulo.f90
index 781ef8296a2b..37c4cd1a94ca 100644
--- flang/test/Lower/Intrinsics/modulo.f90
+++ flang/test/Lower/Intrinsics/modulo.f90
@@ -1,5 +1,5 @@
 ! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s -check-prefixes=HONORINF,ALL
-! RUN: flang -fc1 -menable-no-infs -emit-fir -flang-deprecated-no-hlfir %s -o - | FileCheck %s -check-prefixes=CHECK,ALL
+! RUN: flang -fc1 -menable-no-infs -emit-fir -flang-deprecated-no-hlfir %s -o - | FileCheck %s -check-prefixes=CHECK,ALL,%if flang-supports-f128-math %{F128%} %else %{F64%}
 
 ! ALL-LABEL: func @_QPmodulo_testr(
 ! ALL-SAME: %[[arg0:.*]]: !fir.ref<f64>{{.*}}, %[[arg1:.*]]: !fir.ref<f64>{{.*}}, %[[arg2:.*]]: !fir.ref<f64>{{.*}}) {
@@ -39,9 +39,12 @@ subroutine modulo_testi(r, a, p)
 end subroutine
 
 ! CHECK-LABEL: func @_QPmodulo_testr16(
-! CHECK-SAME: %[[arg0:.*]]: !fir.ref<f128>{{.*}}, %[[arg1:.*]]: !fir.ref<f128>{{.*}}, %[[arg2:.*]]: !fir.ref<f128>{{.*}}) {
+! F128-SAME: %[[arg0:.*]]: !fir.ref<f128>{{.*}}, %[[arg1:.*]]: !fir.ref<f128>{{.*}}, %[[arg2:.*]]: !fir.ref<f128>{{.*}}) {
+! F64-SAME: %[[arg0:.*]]: !fir.ref<f64>{{.*}}, %[[arg1:.*]]: !fir.ref<f64>{{.*}}, %[[arg2:.*]]: !fir.ref<f64>{{.*}}) {
 subroutine modulo_testr16(r, a, p)
- real(16) :: r, a, p
- ! CHECK: fir.call @_FortranAModuloReal16({{.*}}){{.*}}: (f128, f128, !fir.ref<i8>, i32) -> f128
+ integer, parameter :: rk = merge(16, 8, selected_real_kind(33, 4931)==16)
+ real(rk) :: r, a, p
+ !F128: fir.call @_FortranAModuloReal16({{.*}}){{.*}}: (f128, f128, !fir.ref<i8>, i32) -> f128
+ !F64: arith.remf %0, %1 fastmath<ninf,contract> : f64
  r = modulo(a, p)
 end subroutine
diff --git flang/test/Lower/Intrinsics/nearest.f90 flang/test/Lower/Intrinsics/nearest.f90
index 5920d299d5fd..d4859cfe90e5 100644
--- flang/test/Lower/Intrinsics/nearest.f90
+++ flang/test/Lower/Intrinsics/nearest.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -emit-fir %s -o - | FileCheck %s
+! RUN: bbc -emit-fir %s -o - | FileCheck %s --check-prefixes=CHECK%if target=x86_64{{.*}} %{,CHECK-KIND10%}%if flang-supports-f128-math %{,CHECK-KIND16%}
 
 ! CHECK-LABEL: c.func @_QPnearest_test1
 ! CHECK: %[[V_0:[0-9]+]] = fir.dummy_scope : !fir.dscope
@@ -240,168 +240,171 @@ subroutine nearest_test4(x, s)
  res = nearest(x, s)
 end
 
-! CHECK-LABEL: c.func @_QPnearest_test5
- ! CHECK: %[[V_0:[0-9]+]] = fir.dummy_scope : !fir.dscope
- ! CHECK: %[[V_1:[0-9]+]] = fir.alloca f80 {bindc_name = "res", uniq_name = "_QFnearest_test5Eres"}
- ! CHECK: %[[V_2:[0-9]+]] = fir.declare %[[V_1]] {uniq_name = "_QFnearest_test5Eres"} : (!fir.ref<f80>) -> !fir.ref<f80>
- ! CHECK: %[[V_3:[0-9]+]] = fir.declare %arg1 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test5Es"} : (!fir.ref<f80>, !fir.dscope) -> !fir.ref<f80>
- ! CHECK: %[[V_4:[0-9]+]] = fir.declare %arg0 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test5Ex"} : (!fir.ref<f80>, !fir.dscope) -> !fir.ref<f80>
- ! CHECK: %[[V_5:[0-9]+]] = fir.load %[[V_4]] : !fir.ref<f80>
- ! CHECK: %[[V_6:[0-9]+]] = fir.load %[[V_3]] : !fir.ref<f80>
- ! CHECK: %[[V_7:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 3 : i32}> : (f80) -> i1
- ! CHECK: %[[V_8:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_6]]) <{bit = 96 : i32}> : (f80) -> i1
- ! CHECK: fir.if %[[V_8]] {
- ! CHECK: fir.call @_FortranAReportFatalUserError
- ! CHECK: }
- ! CHECK: %[[V_9:[0-9]+]] = arith.bitcast %[[V_6]] : f80 to i80
- ! CHECK: %[[V_10:[0-9]+]] = arith.shrui %[[V_9]], %c79{{.*}} : i80
- ! CHECK: %[[V_11:[0-9]+]] = arith.cmpi ne, %[[V_10]], %c1{{.*}} : i80
- ! CHECK: %[[V_12:[0-9]+]] = arith.bitcast %[[V_5]] : f80 to i80
- ! CHECK: %[[V_13:[0-9]+]] = arith.shrui %[[V_12]], %c79{{.*}} : i80
- ! CHECK: %[[V_14:[0-9]+]] = fir.convert %[[V_13]] : (i80) -> i1
- ! CHECK: %[[V_15:[0-9]+]] = arith.cmpi ne, %[[V_11]], %[[V_14]] : i1
- ! CHECK: %[[V_16:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 516 : i32}> : (f80) -> i1
- ! CHECK: %[[V_17:[0-9]+]] = arith.andi %[[V_16]], %[[V_15]] : i1
- ! CHECK: %[[V_18:[0-9]+]] = arith.ori %[[V_7]], %[[V_17]] : i1
- ! CHECK: %[[V_19:[0-9]+]] = fir.if %[[V_18]] -> (f80) {
- ! CHECK: fir.result %[[V_5]] : f80
- ! CHECK: } else {
- ! CHECK: %[[V_20:[0-9]+]] = arith.cmpf oeq, %[[V_5]], %cst{{[_0-9]*}} fastmath<contract> : f80
- ! CHECK: %[[V_21:[0-9]+]] = fir.if %[[V_20]] -> (f80) {
- ! CHECK: %[[V_22:[0-9]+]] = arith.select %[[V_11]], %cst{{[_0-9]*}}, %cst{{[_0-9]*}} : f80
- ! CHECK: %[[V_23:[0-9]+]] = fir.call @_FortranAMapException(%c48{{.*}}) fastmath<contract> : (i32) -> i32
- ! CHECK: fir.call @feraiseexcept(%[[V_23]]) fastmath<contract> : (i32) -> i32
- ! CHECK: fir.result %[[V_22]] : f80
- ! CHECK: } else {
- ! CHECK: %[[V_22:[0-9]+]] = fir.call @_FortranANearest10(%[[V_5]], %[[V_11]]) fastmath<contract> : (f80, i1) -> f80
- ! CHECK: fir.result %[[V_22]] : f80
- ! CHECK: }
- ! CHECK: fir.result %[[V_21]] : f80
- ! CHECK: }
- ! CHECK: fir.store %[[V_19]] to %[[V_2]] : !fir.ref<f80>
- ! CHECK: return
- ! CHECK: }
+! CHECK-KIND10-LABEL: c.func @_QPnearest_test5
+ ! CHECK-KIND10: %[[V_0:[0-9]+]] = fir.dummy_scope : !fir.dscope
+ ! CHECK-KIND10: %[[V_1:[0-9]+]] = fir.alloca f80 {bindc_name = "res", uniq_name = "_QFnearest_test5Eres"}
+ ! CHECK-KIND10: %[[V_2:[0-9]+]] = fir.declare %[[V_1]] {uniq_name = "_QFnearest_test5Eres"} : (!fir.ref<f80>) -> !fir.ref<f80>
+ ! CHECK-KIND10: %[[V_3:[0-9]+]] = fir.declare %arg1 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test5Es"} : (!fir.ref<f80>, !fir.dscope) -> !fir.ref<f80>
+ ! CHECK-KIND10: %[[V_4:[0-9]+]] = fir.declare %arg0 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test5Ex"} : (!fir.ref<f80>, !fir.dscope) -> !fir.ref<f80>
+ ! CHECK-KIND10: %[[V_5:[0-9]+]] = fir.load %[[V_4]] : !fir.ref<f80>
+ ! CHECK-KIND10: %[[V_6:[0-9]+]] = fir.load %[[V_3]] : !fir.ref<f80>
+ ! CHECK-KIND10: %[[V_7:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 3 : i32}> : (f80) -> i1
+ ! CHECK-KIND10: %[[V_8:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_6]]) <{bit = 96 : i32}> : (f80) -> i1
+ ! CHECK-KIND10: fir.if %[[V_8]] {
+ ! CHECK-KIND10: fir.call @_FortranAReportFatalUserError
+ ! CHECK-KIND10: }
+ ! CHECK-KIND10: %[[V_9:[0-9]+]] = arith.bitcast %[[V_6]] : f80 to i80
+ ! CHECK-KIND10: %[[V_10:[0-9]+]] = arith.shrui %[[V_9]], %c79{{.*}} : i80
+ ! CHECK-KIND10: %[[V_11:[0-9]+]] = arith.cmpi ne, %[[V_10]], %c1{{.*}} : i80
+ ! CHECK-KIND10: %[[V_12:[0-9]+]] = arith.bitcast %[[V_5]] : f80 to i80
+ ! CHECK-KIND10: %[[V_13:[0-9]+]] = arith.shrui %[[V_12]], %c79{{.*}} : i80
+ ! CHECK-KIND10: %[[V_14:[0-9]+]] = fir.convert %[[V_13]] : (i80) -> i1
+ ! CHECK-KIND10: %[[V_15:[0-9]+]] = arith.cmpi ne, %[[V_11]], %[[V_14]] : i1
+ ! CHECK-KIND10: %[[V_16:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 516 : i32}> : (f80) -> i1
+ ! CHECK-KIND10: %[[V_17:[0-9]+]] = arith.andi %[[V_16]], %[[V_15]] : i1
+ ! CHECK-KIND10: %[[V_18:[0-9]+]] = arith.ori %[[V_7]], %[[V_17]] : i1
+ ! CHECK-KIND10: %[[V_19:[0-9]+]] = fir.if %[[V_18]] -> (f80) {
+ ! CHECK-KIND10: fir.result %[[V_5]] : f80
+ ! CHECK-KIND10: } else {
+ ! CHECK-KIND10: %[[V_20:[0-9]+]] = arith.cmpf oeq, %[[V_5]], %cst{{[_0-9]*}} fastmath<contract> : f80
+ ! CHECK-KIND10: %[[V_21:[0-9]+]] = fir.if %[[V_20]] -> (f80) {
+ ! CHECK-KIND10: %[[V_22:[0-9]+]] = arith.select %[[V_11]], %cst{{[_0-9]*}}, %cst{{[_0-9]*}} : f80
+ ! CHECK-KIND10: %[[V_23:[0-9]+]] = fir.call @_FortranAMapException(%c48{{.*}}) fastmath<contract> : (i32) -> i32
+ ! CHECK-KIND10: fir.call @feraiseexcept(%[[V_23]]) fastmath<contract> : (i32) -> i32
+ ! CHECK-KIND10: fir.result %[[V_22]] : f80
+ ! CHECK-KIND10: } else {
+ ! CHECK-KIND10: %[[V_22:[0-9]+]] = fir.call @_FortranANearest10(%[[V_5]], %[[V_11]]) fastmath<contract> : (f80, i1) -> f80
+ ! CHECK-KIND10: fir.result %[[V_22]] : f80
+ ! CHECK-KIND10: }
+ ! CHECK-KIND10: fir.result %[[V_21]] : f80
+ ! CHECK-KIND10: }
+ ! CHECK-KIND10: fir.store %[[V_19]] to %[[V_2]] : !fir.ref<f80>
+ ! CHECK-KIND10: return
+ ! CHECK-KIND10: }
 subroutine nearest_test5(x, s)
- real(kind=10) :: x, s, res
+ integer, parameter :: kind10 = merge(10, 4, selected_real_kind(p=18).eq.10)
+ real(kind=kind10) :: x, s, res
  res = nearest(x, s)
 end
 
-! CHECK-LABEL: c.func @_QPnearest_test6
- ! CHECK: %[[V_0:[0-9]+]] = fir.dummy_scope : !fir.dscope
- ! CHECK: %[[V_1:[0-9]+]] = fir.alloca f128 {bindc_name = "res", uniq_name = "_QFnearest_test6Eres"}
- ! CHECK: %[[V_2:[0-9]+]] = fir.declare %[[V_1]] {uniq_name = "_QFnearest_test6Eres"} : (!fir.ref<f128>) -> !fir.ref<f128>
- ! CHECK: %[[V_3:[0-9]+]] = fir.declare %arg1 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test6Es"} : (!fir.ref<f128>, !fir.dscope) -> !fir.ref<f128>
- ! CHECK: %[[V_4:[0-9]+]] = fir.declare %arg0 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test6Ex"} : (!fir.ref<f128>, !fir.dscope) -> !fir.ref<f128>
- ! CHECK: %[[V_5:[0-9]+]] = fir.load %[[V_4]] : !fir.ref<f128>
- ! CHECK: %[[V_6:[0-9]+]] = fir.load %[[V_3]] : !fir.ref<f128>
- ! CHECK: %[[V_7:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 3 : i32}> : (f128) -> i1
- ! CHECK: %[[V_8:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_6]]) <{bit = 96 : i32}> : (f128) -> i1
- ! CHECK: fir.if %[[V_8]] {
- ! CHECK: fir.call @_FortranAReportFatalUserError
- ! CHECK: }
- ! CHECK: %[[V_9:[0-9]+]] = arith.bitcast %[[V_6]] : f128 to i128
- ! CHECK: %[[V_10:[0-9]+]] = arith.shrui %[[V_9]], %c127{{.*}} : i128
- ! CHECK: %[[V_11:[0-9]+]] = arith.cmpi ne, %[[V_10]], %c1{{.*}} : i128
- ! CHECK: %[[V_12:[0-9]+]] = arith.bitcast %[[V_5]] : f128 to i128
- ! CHECK: %[[V_13:[0-9]+]] = arith.shrui %[[V_12]], %c127{{.*}} : i128
- ! CHECK: %[[V_14:[0-9]+]] = fir.convert %[[V_13]] : (i128) -> i1
- ! CHECK: %[[V_15:[0-9]+]] = arith.cmpi ne, %[[V_11]], %[[V_14]] : i1
- ! CHECK: %[[V_16:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 516 : i32}> : (f128) -> i1
- ! CHECK: %[[V_17:[0-9]+]] = arith.andi %[[V_16]], %[[V_15]] : i1
- ! CHECK: %[[V_18:[0-9]+]] = arith.ori %[[V_7]], %[[V_17]] : i1
- ! CHECK: %[[V_19:[0-9]+]] = fir.if %[[V_18]] -> (f128) {
- ! CHECK: fir.result %[[V_5]] : f128
- ! CHECK: } else {
- ! CHECK: %[[V_20:[0-9]+]] = arith.cmpf oeq, %[[V_5]], %cst{{[_0-9]*}} fastmath<contract> : f128
- ! CHECK: %[[V_21:[0-9]+]] = fir.if %[[V_20]] -> (f128) {
- ! CHECK: %[[V_22:[0-9]+]] = arith.select %[[V_11]], %cst{{[_0-9]*}}, %cst{{[_0-9]*}} : f128
- ! CHECK: %[[V_23:[0-9]+]] = fir.call @_FortranAMapException(%c48{{.*}}) fastmath<contract> : (i32) -> i32
- ! CHECK: fir.call @feraiseexcept(%[[V_23]]) fastmath<contract> : (i32) -> i32
- ! CHECK: fir.result %[[V_22]] : f128
- ! CHECK: } else {
- ! CHECK-DAG: %[[V_22:[0-9]+]] = arith.subi %[[V_12]], %c1{{.*}} : i128
- ! CHECK-DAG: %[[V_23:[0-9]+]] = arith.addi %[[V_12]], %c1{{.*}} : i128
- ! CHECK: %[[V_24:[0-9]+]] = arith.select %[[V_15]], %[[V_23]], %[[V_22]] : i128
- ! CHECK: %[[V_25:[0-9]+]] = arith.bitcast %[[V_24]] : i128 to f128
- ! CHECK: %[[V_26:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_25]]) <{bit = 516 : i32}> : (f128) -> i1
- ! CHECK: fir.if %[[V_26]] {
- ! CHECK: %[[V_28:[0-9]+]] = fir.call @_FortranAMapException(%c40{{.*}}) fastmath<contract> : (i32) -> i32
- ! CHECK: fir.call @feraiseexcept(%[[V_28]]) fastmath<contract> : (i32) -> i32
- ! CHECK: }
- ! CHECK: %[[V_27:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_25]]) <{bit = 144 : i32}> : (f128) -> i1
- ! CHECK: fir.if %[[V_27]] {
- ! CHECK: %[[V_28:[0-9]+]] = fir.call @_FortranAMapException(%c48{{.*}}) fastmath<contract> : (i32) -> i32
- ! CHECK: fir.call @feraiseexcept(%[[V_28]]) fastmath<contract> : (i32) -> i32
- ! CHECK: }
- ! CHECK: fir.result %[[V_25]] : f128
- ! CHECK: }
- ! CHECK: fir.result %[[V_21]] : f128
- ! CHECK: }
- ! CHECK: fir.store %[[V_19]] to %[[V_2]] : !fir.ref<f128>
- ! CHECK: return
- ! CHECK: }
+! CHECK-KIND16-LABEL: c.func @_QPnearest_test6
+ ! CHECK-KIND16: %[[V_0:[0-9]+]] = fir.dummy_scope : !fir.dscope
+ ! CHECK-KIND16: %[[V_1:[0-9]+]] = fir.alloca f128 {bindc_name = "res", uniq_name = "_QFnearest_test6Eres"}
+ ! CHECK-KIND16: %[[V_2:[0-9]+]] = fir.declare %[[V_1]] {uniq_name = "_QFnearest_test6Eres"} : (!fir.ref<f128>) -> !fir.ref<f128>
+ ! CHECK-KIND16: %[[V_3:[0-9]+]] = fir.declare %arg1 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test6Es"} : (!fir.ref<f128>, !fir.dscope) -> !fir.ref<f128>
+ ! CHECK-KIND16: %[[V_4:[0-9]+]] = fir.declare %arg0 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test6Ex"} : (!fir.ref<f128>, !fir.dscope) -> !fir.ref<f128>
+ ! CHECK-KIND16: %[[V_5:[0-9]+]] = fir.load %[[V_4]] : !fir.ref<f128>
+ ! CHECK-KIND16: %[[V_6:[0-9]+]] = fir.load %[[V_3]] : !fir.ref<f128>
+ ! CHECK-KIND16: %[[V_7:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 3 : i32}> : (f128) -> i1
+ ! CHECK-KIND16: %[[V_8:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_6]]) <{bit = 96 : i32}> : (f128) -> i1
+ ! CHECK-KIND16: fir.if %[[V_8]] {
+ ! CHECK-KIND16: fir.call @_FortranAReportFatalUserError
+ ! CHECK-KIND16: }
+ ! CHECK-KIND16: %[[V_9:[0-9]+]] = arith.bitcast %[[V_6]] : f128 to i128
+ ! CHECK-KIND16: %[[V_10:[0-9]+]] = arith.shrui %[[V_9]], %c127{{.*}} : i128
+ ! CHECK-KIND16: %[[V_11:[0-9]+]] = arith.cmpi ne, %[[V_10]], %c1{{.*}} : i128
+ ! CHECK-KIND16: %[[V_12:[0-9]+]] = arith.bitcast %[[V_5]] : f128 to i128
+ ! CHECK-KIND16: %[[V_13:[0-9]+]] = arith.shrui %[[V_12]], %c127{{.*}} : i128
+ ! CHECK-KIND16: %[[V_14:[0-9]+]] = fir.convert %[[V_13]] : (i128) -> i1
+ ! CHECK-KIND16: %[[V_15:[0-9]+]] = arith.cmpi ne, %[[V_11]], %[[V_14]] : i1
+ ! CHECK-KIND16: %[[V_16:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 516 : i32}> : (f128) -> i1
+ ! CHECK-KIND16: %[[V_17:[0-9]+]] = arith.andi %[[V_16]], %[[V_15]] : i1
+ ! CHECK-KIND16: %[[V_18:[0-9]+]] = arith.ori %[[V_7]], %[[V_17]] : i1
+ ! CHECK-KIND16: %[[V_19:[0-9]+]] = fir.if %[[V_18]] -> (f128) {
+ ! CHECK-KIND16: fir.result %[[V_5]] : f128
+ ! CHECK-KIND16: } else {
+ ! CHECK-KIND16: %[[V_20:[0-9]+]] = arith.cmpf oeq, %[[V_5]], %cst{{[_0-9]*}} fastmath<contract> : f128
+ ! CHECK-KIND16: %[[V_21:[0-9]+]] = fir.if %[[V_20]] -> (f128) {
+ ! CHECK-KIND16: %[[V_22:[0-9]+]] = arith.select %[[V_11]], %cst{{[_0-9]*}}, %cst{{[_0-9]*}} : f128
+ ! CHECK-KIND16: %[[V_23:[0-9]+]] = fir.call @_FortranAMapException(%c48{{.*}}) fastmath<contract> : (i32) -> i32
+ ! CHECK-KIND16: fir.call @feraiseexcept(%[[V_23]]) fastmath<contract> : (i32) -> i32
+ ! CHECK-KIND16: fir.result %[[V_22]] : f128
+ ! CHECK-KIND16: } else {
+ ! CHECK-KIND16-DAG: %[[V_22:[0-9]+]] = arith.subi %[[V_12]], %c1{{.*}} : i128
+ ! CHECK-KIND16-DAG: %[[V_23:[0-9]+]] = arith.addi %[[V_12]], %c1{{.*}} : i128
+ ! CHECK-KIND16: %[[V_24:[0-9]+]] = arith.select %[[V_15]], %[[V_23]], %[[V_22]] : i128
+ ! CHECK-KIND16: %[[V_25:[0-9]+]] = arith.bitcast %[[V_24]] : i128 to f128
+ ! CHECK-KIND16: %[[V_26:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_25]]) <{bit = 516 : i32}> : (f128) -> i1
+ ! CHECK-KIND16: fir.if %[[V_26]] {
+ ! CHECK-KIND16: %[[V_28:[0-9]+]] = fir.call @_FortranAMapException(%c40{{.*}}) fastmath<contract> : (i32) -> i32
+ ! CHECK-KIND16: fir.call @feraiseexcept(%[[V_28]]) fastmath<contract> : (i32) -> i32
+ ! CHECK-KIND16: }
+ ! CHECK-KIND16: %[[V_27:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_25]]) <{bit = 144 : i32}> : (f128) -> i1
+ ! CHECK-KIND16: fir.if %[[V_27]] {
+ ! CHECK-KIND16: %[[V_28:[0-9]+]] = fir.call @_FortranAMapException(%c48{{.*}}) fastmath<contract> : (i32) -> i32
+ ! CHECK-KIND16: fir.call @feraiseexcept(%[[V_28]]) fastmath<contract> : (i32) -> i32
+ ! CHECK-KIND16: }
+ ! CHECK-KIND16: fir.result %[[V_25]] : f128
+ ! CHECK-KIND16: }
+ ! CHECK-KIND16: fir.result %[[V_21]] : f128
+ ! CHECK-KIND16: }
+ ! CHECK-KIND16: fir.store %[[V_19]] to %[[V_2]] : !fir.ref<f128>
+ ! CHECK-KIND16: return
+ ! CHECK-KIND16: }
 subroutine nearest_test6(x, s)
- real(kind=16) :: x, s, res
+ integer, parameter :: kind16 = merge(16, 4, selected_real_kind(p=33).eq.16)
+ real(kind=kind16) :: x, s, res
  res = nearest(x, s)
 end
 
-! CHECK-LABEL: c.func @_QPnearest_test7
- ! CHECK: %[[V_0:[0-9]+]] = fir.dummy_scope : !fir.dscope
- ! CHECK: %[[V_1:[0-9]+]] = fir.alloca f128 {bindc_name = "res", uniq_name = "_QFnearest_test7Eres"}
- ! CHECK: %[[V_2:[0-9]+]] = fir.declare %[[V_1]] {uniq_name = "_QFnearest_test7Eres"} : (!fir.ref<f128>) -> !fir.ref<f128>
- ! CHECK: %[[V_3:[0-9]+]] = fir.declare %arg1 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test7Es"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32>
- ! CHECK: %[[V_4:[0-9]+]] = fir.declare %arg0 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test7Ex"} : (!fir.ref<f128>, !fir.dscope) -> !fir.ref<f128>
- ! CHECK: %[[V_5:[0-9]+]] = fir.load %[[V_4]] : !fir.ref<f128>
- ! CHECK: %[[V_6:[0-9]+]] = fir.load %[[V_3]] : !fir.ref<f32>
- ! CHECK: %[[V_7:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 3 : i32}> : (f128) -> i1
- ! CHECK: %[[V_8:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_6]]) <{bit = 96 : i32}> : (f32) -> i1
- ! CHECK: fir.if %[[V_8]] {
- ! CHECK: fir.call @_FortranAReportFatalUserError
- ! CHECK: }
- ! CHECK: %[[V_9:[0-9]+]] = arith.bitcast %[[V_6]] : f32 to i32
- ! CHECK: %[[V_10:[0-9]+]] = arith.shrui %[[V_9]], %c31{{.*}} : i32
- ! CHECK: %[[V_11:[0-9]+]] = fir.convert %[[V_10]] : (i32) -> i128
- ! CHECK: %[[V_12:[0-9]+]] = arith.cmpi ne, %[[V_11]], %c1{{.*}} : i128
- ! CHECK: %[[V_13:[0-9]+]] = arith.bitcast %[[V_5]] : f128 to i128
- ! CHECK: %[[V_14:[0-9]+]] = arith.shrui %[[V_13]], %c127{{.*}} : i128
- ! CHECK: %[[V_15:[0-9]+]] = fir.convert %[[V_14]] : (i128) -> i1
- ! CHECK: %[[V_16:[0-9]+]] = arith.cmpi ne, %[[V_12]], %[[V_15]] : i1
- ! CHECK: %[[V_17:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 516 : i32}> : (f128) -> i1
- ! CHECK: %[[V_18:[0-9]+]] = arith.andi %[[V_17]], %[[V_16]] : i1
- ! CHECK: %[[V_19:[0-9]+]] = arith.ori %[[V_7]], %[[V_18]] : i1
- ! CHECK: %[[V_20:[0-9]+]] = fir.if %[[V_19]] -> (f128) {
- ! CHECK: fir.result %[[V_5]] : f128
- ! CHECK: } else {
- ! CHECK: %[[V_21:[0-9]+]] = arith.cmpf oeq, %[[V_5]], %cst{{[_0-9]*}} fastmath<contract> : f128
- ! CHECK: %[[V_22:[0-9]+]] = fir.if %[[V_21]] -> (f128) {
- ! CHECK: %[[V_23:[0-9]+]] = arith.select %[[V_12]], %cst{{[_0-9]*}}, %cst{{[_0-9]*}} : f128
- ! CHECK: %[[V_24:[0-9]+]] = fir.call @_FortranAMapException(%c48{{.*}}) fastmath<contract> : (i32) -> i32
- ! CHECK: fir.call @feraiseexcept(%[[V_24]]) fastmath<contract> : (i32) -> i32
- ! CHECK: fir.result %[[V_23]] : f128
- ! CHECK: } else {
- ! CHECK-DAG: %[[V_23:[0-9]+]] = arith.subi %[[V_13]], %c1{{.*}} : i128
- ! CHECK-DAG: %[[V_24:[0-9]+]] = arith.addi %[[V_13]], %c1{{.*}} : i128
- ! CHECK: %[[V_25:[0-9]+]] = arith.select %[[V_16]], %[[V_24]], %[[V_23]] : i128
- ! CHECK: %[[V_26:[0-9]+]] = arith.bitcast %[[V_25]] : i128 to f128
- ! CHECK: %[[V_27:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_26]]) <{bit = 516 : i32}> : (f128) -> i1
- ! CHECK: fir.if %[[V_27]] {
- ! CHECK: %[[V_29:[0-9]+]] = fir.call @_FortranAMapException(%c40{{.*}}) fastmath<contract> : (i32) -> i32
- ! CHECK: fir.call @feraiseexcept(%[[V_29]]) fastmath<contract> : (i32) -> i32
- ! CHECK: }
- ! CHECK: %[[V_28:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_26]]) <{bit = 144 : i32}> : (f128) -> i1
- ! CHECK: fir.if %[[V_28]] {
- ! CHECK: %[[V_29:[0-9]+]] = fir.call @_FortranAMapException(%c48{{.*}}) fastmath<contract> : (i32) -> i32
- ! CHECK: fir.call @feraiseexcept(%[[V_29]]) fastmath<contract> : (i32) -> i32
- ! CHECK: }
- ! CHECK: fir.result %[[V_26]] : f128
- ! CHECK: }
- ! CHECK: fir.result %[[V_22]] : f128
- ! CHECK: }
- ! CHECK: fir.store %[[V_20]] to %[[V_2]] : !fir.ref<f128>
- ! CHECK: return
- ! CHECK: }
+! CHECK-KIND16-LABEL: c.func @_QPnearest_test7
+ ! CHECK-KIND16: %[[V_0:[0-9]+]] = fir.dummy_scope : !fir.dscope
+ ! CHECK-KIND16: %[[V_1:[0-9]+]] = fir.alloca f128 {bindc_name = "res", uniq_name = "_QFnearest_test7Eres"}
+ ! CHECK-KIND16: %[[V_2:[0-9]+]] = fir.declare %[[V_1]] {uniq_name = "_QFnearest_test7Eres"} : (!fir.ref<f128>) -> !fir.ref<f128>
+ ! CHECK-KIND16: %[[V_3:[0-9]+]] = fir.declare %arg1 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test7Es"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32>
+ ! CHECK-KIND16: %[[V_4:[0-9]+]] = fir.declare %arg0 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test7Ex"} : (!fir.ref<f128>, !fir.dscope) -> !fir.ref<f128>
+ ! CHECK-KIND16: %[[V_5:[0-9]+]] = fir.load %[[V_4]] : !fir.ref<f128>
+ ! CHECK-KIND16: %[[V_6:[0-9]+]] = fir.load %[[V_3]] : !fir.ref<f32>
+ ! CHECK-KIND16: %[[V_7:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 3 : i32}> : (f128) -> i1
+ ! CHECK-KIND16: %[[V_8:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_6]]) <{bit = 96 : i32}> : (f32) -> i1
+ ! CHECK-KIND16: fir.if %[[V_8]] {
+ ! CHECK-KIND16: fir.call @_FortranAReportFatalUserError
+ ! CHECK-KIND16: }
+ ! CHECK-KIND16: %[[V_9:[0-9]+]] = arith.bitcast %[[V_6]] : f32 to i32
+ ! CHECK-KIND16: %[[V_10:[0-9]+]] = arith.shrui %[[V_9]], %c31{{.*}} : i32
+ ! CHECK-KIND16: %[[V_11:[0-9]+]] = fir.convert %[[V_10]] : (i32) -> i128
+ ! CHECK-KIND16: %[[V_12:[0-9]+]] = arith.cmpi ne, %[[V_11]], %c1{{.*}} : i128
+ ! CHECK-KIND16: %[[V_13:[0-9]+]] = arith.bitcast %[[V_5]] : f128 to i128
+ ! CHECK-KIND16: %[[V_14:[0-9]+]] = arith.shrui %[[V_13]], %c127{{.*}} : i128
+ ! CHECK-KIND16: %[[V_15:[0-9]+]] = fir.convert %[[V_14]] : (i128) -> i1
+ ! CHECK-KIND16: %[[V_16:[0-9]+]] = arith.cmpi ne, %[[V_12]], %[[V_15]] : i1
+ ! CHECK-KIND16: %[[V_17:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 516 : i32}> : (f128) -> i1
+ ! CHECK-KIND16: %[[V_18:[0-9]+]] = arith.andi %[[V_17]], %[[V_16]] : i1
+ ! CHECK-KIND16: %[[V_19:[0-9]+]] = arith.ori %[[V_7]], %[[V_18]] : i1
+ ! CHECK-KIND16: %[[V_20:[0-9]+]] = fir.if %[[V_19]] -> (f128) {
+ ! CHECK-KIND16: fir.result %[[V_5]] : f128
+ ! CHECK-KIND16: } else {
+ ! CHECK-KIND16: %[[V_21:[0-9]+]] = arith.cmpf oeq, %[[V_5]], %cst{{[_0-9]*}} fastmath<contract> : f128
+ ! CHECK-KIND16: %[[V_22:[0-9]+]] = fir.if %[[V_21]] -> (f128) {
+ ! CHECK-KIND16: %[[V_23:[0-9]+]] = arith.select %[[V_12]], %cst{{[_0-9]*}}, %cst{{[_0-9]*}} : f128
+ ! CHECK-KIND16: %[[V_24:[0-9]+]] = fir.call @_FortranAMapException(%c48{{.*}}) fastmath<contract> : (i32) -> i32
+ ! CHECK-KIND16: fir.call @feraiseexcept(%[[V_24]]) fastmath<contract> : (i32) -> i32
+ ! CHECK-KIND16: fir.result %[[V_23]] : f128
+ ! CHECK-KIND16: } else {
+ ! CHECK-KIND16-DAG: %[[V_23:[0-9]+]] = arith.subi %[[V_13]], %c1{{.*}} : i128
+ ! CHECK-KIND16-DAG: %[[V_24:[0-9]+]] = arith.addi %[[V_13]], %c1{{.*}} : i128
+ ! CHECK-KIND16: %[[V_25:[0-9]+]] = arith.select %[[V_16]], %[[V_24]], %[[V_23]] : i128
+ ! CHECK-KIND16: %[[V_26:[0-9]+]] = arith.bitcast %[[V_25]] : i128 to f128
+ ! CHECK-KIND16: %[[V_27:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_26]]) <{bit = 516 : i32}> : (f128) -> i1
+ ! CHECK-KIND16: fir.if %[[V_27]] {
+ ! CHECK-KIND16: %[[V_29:[0-9]+]] = fir.call @_FortranAMapException(%c40{{.*}}) fastmath<contract> : (i32) -> i32
+ ! CHECK-KIND16: fir.call @feraiseexcept(%[[V_29]]) fastmath<contract> : (i32) -> i32
+ ! CHECK-KIND16: }
+ ! CHECK-KIND16: %[[V_28:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_26]]) <{bit = 144 : i32}> : (f128) -> i1
+ ! CHECK-KIND16: fir.if %[[V_28]] {
+ ! CHECK-KIND16: %[[V_29:[0-9]+]] = fir.call @_FortranAMapException(%c48{{.*}}) fastmath<contract> : (i32) -> i32
+ ! CHECK-KIND16: fir.call @feraiseexcept(%[[V_29]]) fastmath<contract> : (i32) -> i32
+ ! CHECK-KIND16: }
+ ! CHECK-KIND16: fir.result %[[V_26]] : f128
+ ! CHECK-KIND16: }
+ ! CHECK-KIND16: fir.result %[[V_22]] : f128
+ ! CHECK-KIND16: }
+ ! CHECK-KIND16: fir.store %[[V_20]] to %[[V_2]] : !fir.ref<f128>
+ ! CHECK-KIND16: return
+ ! CHECK-KIND16: }
 subroutine nearest_test7(x, s)
- real(kind=16) :: x, res
+ integer, parameter :: kind16 = merge(16, 4, selected_real_kind(p=33).eq.16)
+ real(kind=kind16) :: x, res
  real :: s
  res = nearest(x, s)
 end
diff --git flang/test/Lower/Intrinsics/norm2.f90 flang/test/Lower/Intrinsics/norm2.f90
index ac761ae3f538..ec89caa51d1a 100644
--- flang/test/Lower/Intrinsics/norm2.f90
+++ flang/test/Lower/Intrinsics/norm2.f90
@@ -1,94 +1,70 @@
-! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir %s -o - | FileCheck %s
+! RUN: bbc -emit-fir %s -o - | FileCheck %s --check-prefixes=CHECK%if target=x86_64{{.*}} %{,CHECK-KIND10%}%if flang-supports-f128-math %{,CHECK-KIND16%}
 
 ! CHECK-LABEL: func @_QPnorm2_test_4(
-! CHECK-SAME: %[[arg0:.*]]: !fir.box<!fir.array<?xf32>>{{.*}}) -> f32
 real(4) function norm2_test_4(a)
  real(4) :: a(:)
- ! CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
- ! CHECK-DAG: %[[arr:.*]] = fir.convert %[[arg0]] : (!fir.box<!fir.array<?xf32>>) -> !fir.box<none>
+ ! CHECK: %[[c0:.*]] = arith.constant 0 : index
+ ! CHECK: %[[a1:.*]] = fir.declare{{.*}}a"
+ ! CHECK: %[[a:.*]] = fir.rebox %[[a1]]{{.*}}
+ ! CHECK-DAG: %[[arr:.*]] = fir.convert %[[a]] : (!fir.box<!fir.array<?xf32>>) -> !fir.box<none>
 ! CHECK: %[[dim:.*]] = fir.convert %[[c0]] : (index) -> i32
  norm2_test_4 = norm2(a)
 ! CHECK: %{{.*}} = fir.call @_FortranANorm2_4(%[[arr]], %{{.*}}, %{{.*}}, %[[dim]]) {{.*}} : (!fir.box<none>, !fir.ref<i8>, i32, i32) -> f32
 end function norm2_test_4
 
 ! CHECK-LABEL: func @_QPnorm2_test_8(
-! CHECK-SAME: %[[arg0:.*]]: !fir.box<!fir.array<?x?xf64>>{{.*}}) -> f64
 real(8) function norm2_test_8(a)
  real(8) :: a(:,:)
- ! CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
- ! CHECK-DAG: %[[arr:.*]] = fir.convert %[[arg0]] : (!fir.box<!fir.array<?x?xf64>>) -> !fir.box<none>
- ! CHECK: %[[dim:.*]] = fir.convert %[[c0]] : (index) -> i32
  norm2_test_8 = norm2(a)
- ! CHECK: %{{.*}} = fir.call @_FortranANorm2_8(%[[arr]], %{{.*}}, %{{.*}}, %[[dim]]) {{.*}} : (!fir.box<none>, !fir.ref<i8>, i32, i32) -> f64
+ ! CHECK: fir.call @_FortranANorm2_8(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.box<none>, !fir.ref<i8>, i32, i32) -> f64
 end function norm2_test_8
 
-! CHECK-LABEL: func @_QPnorm2_test_10(
-! CHECK-SAME: %[[arg0:.*]]: !fir.box<!fir.array<?x?x?xf80>>{{.*}}) -> f80
-real(10) function norm2_test_10(a)
- real(10) :: a(:,:,:)
- ! CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
- ! CHECK-DAG: %[[arr:.*]] = fir.convert %[[arg0]] : (!fir.box<!fir.array<?x?x?xf80>>) -> !fir.box<none>
- ! CHECK: %[[dim:.*]] = fir.convert %[[c0]] : (index) -> i32
+! CHECK-KIND10-LABEL: func @_QPnorm2_test_10(
+function norm2_test_10(a)
+ integer, parameter :: kind10 = merge(10, 4, selected_real_kind(p=18).eq.10)
+ real(kind10) :: a(:,:,:), norm2_test_10
  norm2_test_10 = norm2(a)
- ! CHECK: %{{.*}} = fir.call @_FortranANorm2_10(%[[arr]], %{{.*}}, %{{.*}}, %[[dim]]) {{.*}} : (!fir.box<none>, !fir.ref<i8>, i32, i32) -> f80
+ ! CHECK-KIND10: fir.call @_FortranANorm2_10(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.box<none>, !fir.ref<i8>, i32, i32) -> f80
 end function norm2_test_10
 
-! CHECK-LABEL: func @_QPnorm2_test_16(
-! CHECK-SAME: %[[arg0:.*]]: !fir.box<!fir.array<?x?x?xf128>>{{.*}}) -> f128
-real(16) function norm2_test_16(a)
- real(16) :: a(:,:,:)
- ! CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
- ! CHECK-DAG: %[[arr:.*]] = fir.convert %[[arg0]] : (!fir.box<!fir.array<?x?x?xf128>>) -> !fir.box<none>
- ! CHECK: %[[dim:.*]] = fir.convert %[[c0]] : (index) -> i32
+! CHECK-KIND16-LABEL: func @_QPnorm2_test_16(
+function norm2_test_16(a)
+ integer, parameter :: kind16 = merge(16, 4, selected_real_kind(p=33).eq.16)
+ real(kind16) :: a(:,:,:), norm2_test_16
  norm2_test_16 = norm2(a)
- ! CHECK: %{{.*}} = fir.call @_FortranANorm2_16(%[[arr]], %{{.*}}, %{{.*}}, %[[dim]]) {{.*}} : (!fir.box<none>, !fir.ref<i8>, i32, i32) -> f128
+ ! CHECK-KIND16: %{{.*}} = fir.call @_FortranANorm2_16(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.box<none>, !fir.ref<i8>, i32, i32) -> f128
 end function norm2_test_16
 
 ! CHECK-LABEL: func @_QPnorm2_test_dim_2(
-! CHECK-SAME: %[[arg0:.*]]: !fir.box<!fir.array<?x?xf32>>{{.*}}, %[[arg1:.*]]: !fir.box<!fir.array<?xf32>>{{.*}})
 subroutine norm2_test_dim_2(a,r)
  real :: a(:,:)
  real :: r(:)
 ! CHECK-DAG: %[[dim:.*]] = arith.constant 1 : i32
+ ! CHECK-DAG: %[[a1:.*]] = fir.declare{{.*}}a"
+ ! CHECK-DAG: %[[a:.*]] = fir.rebox %[[a1]]{{.*}}
 ! CHECK-DAG: %[[r:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>>
 ! CHECK-DAG: %[[res:.*]] = fir.convert %[[r]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
- ! CHECK: %[[arr:.*]] = fir.convert %[[arg0]] : (!fir.box<!fir.array<?x?xf32>>) -> !fir.box<none>
+ ! CHECK: %[[arr:.*]] = fir.convert %[[a]] : (!fir.box<!fir.array<?x?xf32>>) -> !fir.box<none>
  r = norm2(a,dim=1)
 ! CHECK: fir.call @_FortranANorm2Dim(%[[res]], %[[arr]], %[[dim]], %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, !fir.box<none>, i32, !fir.ref<i8>, i32) -> ()
- ! CHECK: %[[box:.*]] = fir.load %[[r]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
- ! CHECK-DAG: %[[addr:.*]] = fir.box_addr %[[box]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>) -> !fir.heap<!fir.array<?xf32>>
- ! CHECK-DAG: fir.freemem %[[addr]]
+ ! CHECK-DAG: fir.freemem
 end subroutine norm2_test_dim_2
 
 ! CHECK-LABEL: func @_QPnorm2_test_dim_3(
-! CHECK-SAME: %[[arg0:.*]]: !fir.box<!fir.array<?x?x?xf32>>{{.*}}, %[[arg1:.*]]: !fir.box<!fir.array<?x?xf32>>{{.*}})
 subroutine norm2_test_dim_3(a,r)
  real :: a(:,:,:)
  real :: r(:,:)
 ! CHECK-DAG: %[[dim:.*]] = arith.constant 3 : i32
- ! CHECK-DAG: %[[r:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?x?xf32>>>
- ! CHECK-DAG: %[[res:.*]] = fir.convert %[[r]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<none>>
- ! CHECK: %[[arr:.*]] = fir.convert %[[arg0]] : (!fir.box<!fir.array<?x?x?xf32>>) -> !fir.box<none>
  r = norm2(a,dim=3)
- ! CHECK: fir.call @_FortranANorm2Dim(%[[res]], %[[arr]], %[[dim]], %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, !fir.box<none>, i32, !fir.ref<i8>, i32) -> ()
- ! CHECK: %[[box:.*]] = fir.load %[[r]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
- ! CHECK-DAG: %[[addr:.*]] = fir.box_addr %[[box]] : (!fir.box<!fir.heap<!fir.array<?x?xf32>>>) -> !fir.heap<!fir.array<?x?xf32>>
- ! CHECK-DAG: fir.freemem %[[addr]]
+ ! CHECK: fir.call @_FortranANorm2Dim(%{{.*}}, %{{.*}}, %[[dim]], %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, !fir.box<none>, i32, !fir.ref<i8>, i32) -> ()
 end subroutine norm2_test_dim_3
 
 ! CHECK-LABEL: func @_QPnorm2_test_real16(
-! CHECK-SAME: %[[arg0:.*]]: !fir.box<!fir.array<?x?x?xf128>>{{.*}}, %[[arg1:.*]]: !fir.box<!fir.array<?x?xf128>>{{.*}})
 subroutine norm2_test_real16(a,r)
- real(16) :: a(:,:,:)
- real(16) :: r(:,:)
- ! CHECK-DAG: %[[dim:.*]] = arith.constant 3 : i32
- ! CHECK-DAG: %[[r:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?x?xf128>>>
- !
CHECK-DAG: %[[res:.*]] = fir.convert %[[r]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf128>>>>) -> !fir.ref<!fir.box<none>> - ! CHECK: %[[arr:.*]] = fir.convert %[[arg0]] : (!fir.box<!fir.array<?x?x?xf128>>) -> !fir.box<none> + integer, parameter :: kind16 = merge(16, 4, selected_real_kind(p=33).eq.16) + real(kind16) :: a(:,:,:) + real(kind16) :: r(:,:) r = norm2(a,dim=3) - ! CHECK: fir.call @_FortranANorm2DimReal16(%[[res]], %[[arr]], %[[dim]], %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, !fir.box<none>, i32, !fir.ref<i8>, i32) -> () - ! CHECK: %[[box:.*]] = fir.load %[[r]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf128>>>> - ! CHECK-DAG: %[[addr:.*]] = fir.box_addr %[[box]] : (!fir.box<!fir.heap<!fir.array<?x?xf128>>>) -> !fir.heap<!fir.array<?x?xf128>> - ! CHECK-DAG: fir.freemem %[[addr]] + ! CHECK-KIND16: fir.call @_FortranANorm2DimReal16(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, !fir.box<none>, i32, !fir.ref<i8>, i32) -> () + ! CHECK-KIND16: fir.freemem end subroutine norm2_test_real16 diff --git flang/test/Lower/Intrinsics/powi_real16.f90 flang/test/Lower/Intrinsics/powi_real16.f90 index 9e7d0f828b5c..dc19b32742c0 100644 --- flang/test/Lower/Intrinsics/powi_real16.f90 +++ flang/test/Lower/Intrinsics/powi_real16.f90 @@ -1,3 +1,4 @@ +! REQUIRES: flang-supports-f128-math ! RUN: bbc -emit-fir %s -o - | FileCheck %s --check-prefix=CHECK-FAST ! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s --check-prefix=CHECK-PRECISE ! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s --check-prefix=CHECK-FAST diff --git flang/test/Lower/Intrinsics/random_number_real16.f90 flang/test/Lower/Intrinsics/random_number_real16.f90 index 060574d5b3b3..1597d9bd5397 100644 --- flang/test/Lower/Intrinsics/random_number_real16.f90 +++ flang/test/Lower/Intrinsics/random_number_real16.f90 @@ -1,3 +1,4 @@ +! REQUIRES: flang-supports-f128-math ! RUN: bbc -emit-fir %s -o - | FileCheck %s ! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s diff --git flang/test/Lower/Intrinsics/reduce.f90 flang/test/Lower/Intrinsics/reduce.f90 index 8d7ec89d2747..4851fa20906d 100644 --- flang/test/Lower/Intrinsics/reduce.f90 +++ flang/test/Lower/Intrinsics/reduce.f90 @@ -1,4 +1,4 @@ -! RUN: bbc -emit-hlfir %s -o - | FileCheck %s +! RUN: bbc -emit-hlfir %s -o - | FileCheck %s --check-prefixes=CHECK%if target=x86_64{{.*}} %{,CHECK-KIND10%}%if flang-supports-f128-math %{,CHECK-KIND16%} module reduce_mod @@ -17,6 +17,10 @@ end type end function end interface + integer, parameter :: kind10 = merge(10, 4, selected_real_kind(p=18).eq.10) + integer, parameter :: kind16 = merge(16, 4, selected_real_kind(p=33).eq.16) + + contains pure function red_int1(a,b) @@ -270,48 +274,48 @@ end subroutine ! CHECK: fir.call @_FortranAReduceReal8Value pure function red_real10(a,b) - real(10), intent(in) :: a, b - real(10) :: red_real10 + real(kind10), intent(in) :: a, b + real(kind10) :: red_real10 red_real10 = a + b end function pure function red_real10_value(a,b) - real(10), value, intent(in) :: a, b - real(10) :: red_real10_value + real(kind10), value, intent(in) :: a, b + real(kind10) :: red_real10_value red_real10_value = a + b end function subroutine real10(a) - real(10), intent(in) :: a(:) - real(10) :: res + real(kind10), intent(in) :: a(:) + real(kind10) :: res res = reduce(a, red_real10) res = reduce(a, red_real10_value) end subroutine -! CHECK: fir.call @_FortranAReduceReal10Ref -! CHECK: fir.call @_FortranAReduceReal10Value +! CHECK-KIND10: fir.call @_FortranAReduceReal10Ref +! 
CHECK-KIND10: fir.call @_FortranAReduceReal10Value pure function red_real16(a,b) - real(16), intent(in) :: a, b - real(16) :: red_real16 + real(kind16), intent(in) :: a, b + real(kind16) :: red_real16 red_real16 = a + b end function pure function red_real16_value(a,b) - real(16), value, intent(in) :: a, b - real(16) :: red_real16_value + real(kind16), value, intent(in) :: a, b + real(kind16) :: red_real16_value red_real16_value = a + b end function subroutine real16(a) - real(16), intent(in) :: a(:) - real(16) :: res + real(kind16), intent(in) :: a(:) + real(kind16) :: res res = reduce(a, red_real16) res = reduce(a, red_real16_value) end subroutine -! CHECK: fir.call @_FortranAReduceReal16Ref -! CHECK: fir.call @_FortranAReduceReal16Value +! CHECK-KIND16: fir.call @_FortranAReduceReal16Ref +! CHECK-KIND16: fir.call @_FortranAReduceReal16Value pure function red_complex2(a,b) complex(2), intent(in) :: a, b @@ -402,48 +406,48 @@ end subroutine ! CHECK: fir.call @_FortranACppReduceComplex8Value pure function red_complex10(a,b) - complex(10), intent(in) :: a, b - complex(10) :: red_complex10 + complex(kind10), intent(in) :: a, b + complex(kind10) :: red_complex10 red_complex10 = a + b end function pure function red_complex10_value(a,b) - complex(10), value, intent(in) :: a, b - complex(10) :: red_complex10_value + complex(kind10), value, intent(in) :: a, b + complex(kind10) :: red_complex10_value red_complex10_value = a + b end function subroutine complex10(a) - complex(10), intent(in) :: a(:) - complex(10) :: res + complex(kind10), intent(in) :: a(:) + complex(kind10) :: res res = reduce(a, red_complex10) res = reduce(a, red_complex10_value) end subroutine -! CHECK: fir.call @_FortranACppReduceComplex10Ref -! CHECK: fir.call @_FortranACppReduceComplex10Value +! CHECK-KIND10: fir.call @_FortranACppReduceComplex10Ref +! CHECK-KIND10: fir.call @_FortranACppReduceComplex10Value pure function red_complex16(a,b) - complex(16), intent(in) :: a, b - complex(16) :: red_complex16 + complex(kind16), intent(in) :: a, b + complex(kind16) :: red_complex16 red_complex16 = a + b end function pure function red_complex16_value(a,b) - complex(16), value, intent(in) :: a, b - complex(16) :: red_complex16_value + complex(kind16), value, intent(in) :: a, b + complex(kind16) :: red_complex16_value red_complex16_value = a + b end function subroutine complex16(a) - complex(16), intent(in) :: a(:) - complex(16) :: res + complex(kind16), intent(in) :: a(:) + complex(kind16) :: res res = reduce(a, red_complex16) res = reduce(a, red_complex16_value) end subroutine -! CHECK: fir.call @_FortranACppReduceComplex16Ref -! CHECK: fir.call @_FortranACppReduceComplex16Value +! CHECK-KIND16: fir.call @_FortranACppReduceComplex16Ref +! CHECK-KIND16: fir.call @_FortranACppReduceComplex16Value pure function red_log1(a,b) logical(1), intent(in) :: a, b @@ -693,26 +697,26 @@ end subroutine ! CHECK: fir.call @_FortranAReduceReal8DimValue subroutine real10dim(a, id) - real(10), intent(in) :: a(:,:) - real(10), allocatable :: res(:) + real(kind10), intent(in) :: a(:,:) + real(kind10), allocatable :: res(:) res = reduce(a, red_real10, 2) res = reduce(a, red_real10_value, 2) end subroutine -! CHECK: fir.call @_FortranAReduceReal10DimRef -! CHECK: fir.call @_FortranAReduceReal10DimValue +! CHECK-KIND10: fir.call @_FortranAReduceReal10DimRef +! 
CHECK-KIND10: fir.call @_FortranAReduceReal10DimValue subroutine real16dim(a, id) - real(16), intent(in) :: a(:,:) - real(16), allocatable :: res(:) + real(kind16), intent(in) :: a(:,:) + real(kind16), allocatable :: res(:) res = reduce(a, red_real16, 2) res = reduce(a, red_real16_value, 2) end subroutine -! CHECK: fir.call @_FortranAReduceReal16DimRef -! CHECK: fir.call @_FortranAReduceReal16DimValue +! CHECK-KIND16: fir.call @_FortranAReduceReal16DimRef +! CHECK-KIND16: fir.call @_FortranAReduceReal16DimValue subroutine complex2dim(a, id) complex(2), intent(in) :: a(:,:) @@ -759,26 +763,26 @@ end subroutine ! CHECK: fir.call @_FortranACppReduceComplex8DimValue subroutine complex10dim(a, id) - complex(10), intent(in) :: a(:,:) - complex(10), allocatable :: res(:) + complex(kind10), intent(in) :: a(:,:) + complex(kind10), allocatable :: res(:) res = reduce(a, red_complex10, 2) res = reduce(a, red_complex10_value, 2) end subroutine -! CHECK: fir.call @_FortranACppReduceComplex10DimRef -! CHECK: fir.call @_FortranACppReduceComplex10DimValue +! CHECK-KIND10: fir.call @_FortranACppReduceComplex10DimRef +! CHECK-KIND10: fir.call @_FortranACppReduceComplex10DimValue subroutine complex16dim(a, id) - complex(16), intent(in) :: a(:,:) - complex(16), allocatable :: res(:) + complex(kind16), intent(in) :: a(:,:) + complex(kind16), allocatable :: res(:) res = reduce(a, red_complex16, 2) res = reduce(a, red_complex16_value, 2) end subroutine -! CHECK: fir.call @_FortranACppReduceComplex16DimRef -! CHECK: fir.call @_FortranACppReduceComplex16DimValue +! CHECK-KIND16: fir.call @_FortranACppReduceComplex16DimRef +! CHECK-KIND16: fir.call @_FortranACppReduceComplex16DimValue subroutine logical1dim(a, id) logical(1), intent(in) :: a(:,:) diff --git flang/test/Lower/Intrinsics/rrspacing.f90 flang/test/Lower/Intrinsics/rrspacing.f90 index 7125f6bf319b..5cfd7c7befdc 100644 --- flang/test/Lower/Intrinsics/rrspacing.f90 +++ flang/test/Lower/Intrinsics/rrspacing.f90 @@ -1,3 +1,4 @@ +! REQUIRES: flang-supports-f128-math ! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s ! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir %s -o - | FileCheck %s diff --git flang/test/Lower/Intrinsics/scale.f90 flang/test/Lower/Intrinsics/scale.f90 index 91892838ea51..9c97349d1dd5 100644 --- flang/test/Lower/Intrinsics/scale.f90 +++ flang/test/Lower/Intrinsics/scale.f90 @@ -1,53 +1,42 @@ -! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s +! RUN: bbc -emit-hlfir %s -o - | FileCheck %s --check-prefixes=CHECK%if target=x86_64{{.*}} %{,CHECK-KIND10%}%if flang-supports-f128-math %{,CHECK-KIND16%} ! CHECK-LABEL: scale_test1 subroutine scale_test1(x, i) real :: x, res - ! CHECK: %[[res:.*]] = fir.alloca f32 {bindc_name = "res", uniq_name = "_QFscale_test1Eres"} - ! CHECK: %[[x:.*]] = fir.load %arg0 : !fir.ref<f32> + ! CHECK: %[[i:.*]]:2 = hlfir.declare{{.*}}i" + ! CHECK: %[[res:.*]]:2 = hlfir.declare{{.*}}res" + ! CHECK: %[[x:.*]]:2 = hlfir.declare{{.*}}x" + ! CHECK: %[[x_val:.*]] = fir.load %[[x]]#0 : !fir.ref<f32> integer :: i - ! CHECK: %[[i0:.*]] = fir.load %arg1 : !fir.ref<i32> + ! CHECK: %[[i_val:.*]] = fir.load %[[i]]#0 : !fir.ref<i32> res = scale(x, i) - ! CHECK: %[[i1:.*]] = fir.convert %[[i0]] : (i32) -> i64 - ! CHECK: %[[tmp:.*]] = fir.call @_FortranAScale4(%[[x]], %[[i1]]) {{.*}}: (f32, i64) -> f32 - ! CHECK: fir.store %[[tmp]] to %[[res]] : !fir.ref<f32> - end subroutine scale_test1 + ! CHECK: %[[i_cast:.*]] = fir.convert %[[i_val]] : (i32) -> i64 + ! 
CHECK: %[[tmp:.*]] = fir.call @_FortranAScale4(%[[x_val]], %[[i_cast]]) {{.*}}: (f32, i64) -> f32 + ! CHECK: hlfir.assign %[[tmp]] to %[[res]]#0 : f32, !fir.ref<f32> +end subroutine scale_test1 - ! CHECK-LABEL: scale_test2 - subroutine scale_test2(x, i) - real(kind=8) :: x, res - ! CHECK: %[[res:.*]] = fir.alloca f64 {bindc_name = "res", uniq_name = "_QFscale_test2Eres"} - ! CHECK: %[[x:.*]] = fir.load %arg0 : !fir.ref<f64> - integer :: i - ! CHECK: %[[i0:.*]] = fir.load %arg1 : !fir.ref<i32> - res = scale(x, i) - ! CHECK: %[[i1:.*]] = fir.convert %[[i0]] : (i32) -> i64 - ! CHECK: %[[tmp:.*]] = fir.call @_FortranAScale8(%[[x]], %[[i1]]) {{.*}}: (f64, i64) -> f64 - ! CHECK: fir.store %[[tmp]] to %[[res]] : !fir.ref<f64> - end subroutine scale_test2 - - ! CHECK-LABEL: scale_test3 - subroutine scale_test3(x, i) - real(kind=10) :: x, res - ! CHECK: %[[res:.*]] = fir.alloca f80 {bindc_name = "res", uniq_name = "_QFscale_test3Eres"} - ! CHECK: %[[x:.*]] = fir.load %arg0 : !fir.ref<f80> - integer :: i - ! CHECK: %[[i0:.*]] = fir.load %arg1 : !fir.ref<i32> - res = scale(x, i) - ! CHECK: %[[i1:.*]] = fir.convert %[[i0]] : (i32) -> i64 - ! CHECK: %[[tmp:.*]] = fir.call @_FortranAScale10(%[[x]], %[[i1]]) {{.*}}: (f80, i64) -> f80 - ! CHECK: fir.store %[[tmp]] to %[[res]] : !fir.ref<f80> - end subroutine scale_test3 - - ! CHECK-LABEL: scale_test4 - subroutine scale_test4(x, i) - real(kind=16) :: x, res - ! CHECK: %[[res:.*]] = fir.alloca f128 {bindc_name = "res", uniq_name = "_QFscale_test4Eres"} - ! CHECK: %[[x:.*]] = fir.load %arg0 : !fir.ref<f128> - integer :: i - ! CHECK: %[[i0:.*]] = fir.load %arg1 : !fir.ref<i32> - res = scale(x, i) - ! CHECK: %[[i1:.*]] = fir.convert %[[i0]] : (i32) -> i64 - ! CHECK: %[[tmp:.*]] = fir.call @_FortranAScale16(%[[x]], %[[i1]]) {{.*}}: (f128, i64) -> f128 - ! CHECK: fir.store %[[tmp]] to %[[res]] : !fir.ref<f128> - end subroutine scale_test4 +! CHECK-LABEL: scale_test2 +subroutine scale_test2(x, i) + real(kind=8) :: x, res + integer :: i + res = scale(x, i) +! CHECK: fir.call @_FortranAScale8(%{{.*}}, %{{.*}}) {{.*}}: (f64, i64) -> f64 +end subroutine scale_test2 + +! CHECK-KIND10-LABEL: scale_test3 +subroutine scale_test3(x, i) + integer, parameter :: kind10 = merge(10, 4, selected_real_kind(p=18).eq.10) + real(kind=kind10) :: x, res + integer :: i + res = scale(x, i) +! CHECK-KIND10: fir.call @_FortranAScale10(%{{.*}}, %{{.*}}) {{.*}}: (f80, i64) -> f80 +end subroutine scale_test3 + +! CHECK-KIND16-LABEL: scale_test4 +subroutine scale_test4(x, i) + integer, parameter :: kind16 = merge(16, 4, selected_real_kind(p=33).eq.16) + real(kind=kind16) :: x, res + integer :: i + res = scale(x, i) +! CHECK-KIND16: fir.call @_FortranAScale16(%{{.*}}, %{{.*}}) {{.*}}: (f128, i64) -> f128 +end subroutine scale_test4 diff --git flang/test/Lower/Intrinsics/set_exponent.f90 flang/test/Lower/Intrinsics/set_exponent.f90 index fedbad78747a..a06be6aac441 100644 --- flang/test/Lower/Intrinsics/set_exponent.f90 +++ flang/test/Lower/Intrinsics/set_exponent.f90 @@ -1,47 +1,43 @@ -! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s -! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir %s -o - | FileCheck %s +! RUN: bbc -emit-hlfir %s -o - | FileCheck %s --check-prefixes=CHECK%if target=x86_64{{.*}} %{,CHECK-KIND10%}%if flang-supports-f128-math %{,CHECK-KIND16%} ! SET_EXPONENT -! CHECK-LABEL: set_exponent_test -subroutine set_exponent_test +! CHECK-LABEL: set_exponent_test_4 +subroutine set_exponent_test_4(x, i) + real(kind = 4) :: x + integer :: i + x = set_exponent(x, i) +! 
CHECK: %[[VAL_3:.*]]:2 = hlfir.declare {{.*}}"_QFset_exponent_test_4Ei" +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare {{.*}}"_QFset_exponent_test_4Ex" +! CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<f32> +! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i32> +! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]] : (i32) -> i64 +! CHECK: %[[VAL_8:.*]] = fir.call @_FortranASetExponent4(%[[VAL_5]], %[[VAL_7]]) fastmath<contract> : (f32, i64) -> f32 +! CHECK: hlfir.assign %[[VAL_8]] to %[[VAL_4]]#0 : f32, !fir.ref<f32> +end subroutine - real(kind = 4) :: x1 = 178.1378e-4 - real(kind = 8) :: x2 = 178.1378e-4 - real(kind = 10) :: x3 = 178.1378e-4 - real(kind = 16) :: x4 = 178.1378e-4 - integer :: i = 17 -! CHECK: %[[addri:.*]] = fir.address_of(@_QFset_exponent_testEi) : !fir.ref<i32> -! CHECK: %[[addrx1:.*]] = fir.address_of(@_QFset_exponent_testEx1) : !fir.ref<f32> -! CHECK: %[[addrx2:.*]] = fir.address_of(@_QFset_exponent_testEx2) : !fir.ref<f64> -! CHECK: %[[addrx3:.*]] = fir.address_of(@_QFset_exponent_testEx3) : !fir.ref<f80> -! CHECK: %[[addrx4:.*]] = fir.address_of(@_QFset_exponent_testEx4) : !fir.ref<f128> - x1 = set_exponent(x1, i) -! CHECK: %[[x1:.*]] = fir.load %[[addrx1:.*]] : !fir.ref<f32> -! CHECK: %[[i1:.*]] = fir.load %[[addri:.*]] : !fir.ref<i32> -! CHECK: %[[i64v1:.*]] = fir.convert %[[i1:.*]] : (i32) -> i64 -! CHECK: %[[result1:.*]] = fir.call @_FortranASetExponent4(%[[x1:.*]], %[[i64v1:.*]]) {{.*}}: (f32, i64) -> f32 -! CHECK: fir.store %[[result1:.*]] to %[[addrx1:.*]] : !fir.ref<f32> +! CHECK-LABEL: set_exponent_test_8 +subroutine set_exponent_test_8(x, i) + real(kind = 8) :: x + integer :: i + x = set_exponent(x, i) +! CHECK: fir.call @_FortranASetExponent8(%{{.*}}, %{{.*}}) {{.*}}: (f64, i64) -> f64 +end subroutine - x2 = set_exponent(x2, i) -! CHECK: %[[x2:.*]] = fir.load %[[addrx2:.*]] : !fir.ref<f64> -! CHECK: %[[i2:.*]] = fir.load %[[addri:.*]] : !fir.ref<i32> -! CHECK: %[[i64v2:.*]] = fir.convert %[[i2:.*]] : (i32) -> i64 -! CHECK: %[[result2:.*]] = fir.call @_FortranASetExponent8(%[[x2:.*]], %[[i64v2:.*]]) {{.*}}: (f64, i64) -> f64 -! CHECK: fir.store %[[result2:.*]] to %[[addrx2:.*]] : !fir.ref<f64> - - x3 = set_exponent(x3, i) -! CHECK: %[[x3:.*]] = fir.load %[[addrx3:.*]] : !fir.ref<f80> -! CHECK: %[[i3:.*]] = fir.load %[[addri:.*]] : !fir.ref<i32> -! CHECK: %[[i64v3:.*]] = fir.convert %[[i3:.*]] : (i32) -> i64 -! CHECK: %[[result3:.*]] = fir.call @_FortranASetExponent10(%[[x3:.*]], %[[i64v3:.*]]) {{.*}}: (f80, i64) -> f80 -! CHECK: fir.store %[[result3:.*]] to %[[addrx3:.*]] : !fir.ref<f80> - - x4 = set_exponent(x4, i) -! CHECK: %[[x4:.*]] = fir.load %[[addrx4:.*]] : !fir.ref<f128> -! CHECK: %[[i4:.*]] = fir.load %[[addri:.*]] : !fir.ref<i32> -! CHECK: %[[i64v4:.*]] = fir.convert %18 : (i32) -> i64 -! CHECK: %[[result4:.*]] = fir.call @_FortranASetExponent16(%[[x4:.*]], %[[i64v4:.*]]) {{.*}}: (f128, i64) -> f128 -! CHECK: fir.store %[[result4:.*]] to %[[addrx4:.*]] : !fir.ref<f128> -end subroutine set_exponent_test +! CHECK-KIND10-LABEL: set_exponent_test_10 +subroutine set_exponent_test_10(x, i) + integer, parameter :: kind10 = merge(10, 4, selected_real_kind(p=18).eq.10) + real(kind = kind10) :: x + integer :: i + x = set_exponent(x, i) +! CHECK-KIND10: fir.call @_FortranASetExponent10(%{{.*}}, %{{.*}}) {{.*}}: (f80, i64) -> f80 +end subroutine +! 
CHECK-KIND16-LABEL: set_exponent_test_16 +subroutine set_exponent_test_16(x, i) + integer, parameter :: kind16 = merge(16, 4, selected_real_kind(p=33).eq.16) + real(kind = kind16) :: x + integer :: i + x = set_exponent(x, i) +! CHECK-KIND16: fir.call @_FortranASetExponent16(%{{.*}}, %{{.*}}) {{.*}}: (f128, i64) -> f128 +end subroutine diff --git flang/test/Lower/Intrinsics/sign.f90 flang/test/Lower/Intrinsics/sign.f90 index 218080f0d49b..965bda3d5b83 100644 --- flang/test/Lower/Intrinsics/sign.f90 +++ flang/test/Lower/Intrinsics/sign.f90 @@ -1,4 +1,4 @@ -! RUN: bbc %s -o - | FileCheck %s +! RUN: bbc %s -o - | FileCheck %s --check-prefixes=CHECK,%if flang-supports-f128-math %{F128%} %else %{F64%} ! CHECK-LABEL: sign_testi subroutine sign_testi(a, b, c) @@ -22,8 +22,10 @@ end subroutine ! CHECK-LABEL: sign_testr2 subroutine sign_testr2(a, b, c) - real(KIND=16) a, b, c + integer, parameter :: rk = merge(16, 8, selected_real_kind(33, 4931)==16) + real(KIND=rk) a, b, c ! CHECK-NOT: fir.call @{{.*}}fabs - ! CHECK: math.copysign{{.*}} : f128 + ! F128: math.copysign{{.*}} : f128 + ! F64: math.copysign{{.*}} : f64 c = sign(a, b) end subroutine diff --git flang/test/Lower/Intrinsics/spacing.f90 flang/test/Lower/Intrinsics/spacing.f90 index 151f4e2a6d23..9f597a214979 100644 --- flang/test/Lower/Intrinsics/spacing.f90 +++ flang/test/Lower/Intrinsics/spacing.f90 @@ -1,5 +1,4 @@ -! RUN: bbc -emit-fir %s -o - | FileCheck %s -! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s +! RUN: %flang_fc1 -emit-hlfir %s -o - | FileCheck %s --check-prefixes=CHECK%if target=x86_64{{.*}} %{,CHECK-KIND10%} ! CHECK-LABEL: func @_QPspacing_test( real*4 function spacing_test(x) @@ -9,12 +8,13 @@ real*4 function spacing_test(x) ! CHECK: %{{.*}} = fir.call @_FortranASpacing4(%[[a1]]) {{.*}}: (f32) -> f32 end function -! CHECK-LABEL: func @_QPspacing_test2( -real*10 function spacing_test2(x) - real*10 :: x +! CHECK-KIND10-LABEL: func @_QPspacing_test2( +function spacing_test2(x) + integer, parameter :: kind10 = merge(10, 4, selected_real_kind(p=18).eq.10) + real(kind10) :: x, spacing_test2 spacing_test2 = spacing(x) -! CHECK: %[[a1:.*]] = fir.load %{{.*}} : !fir.ref<f80> -! CHECK: %{{.*}} = fir.call @_FortranASpacing10(%[[a1]]) {{.*}}: (f80) -> f80 +! CHECK-KIND10: %[[a1:.*]] = fir.load %{{.*}} : !fir.ref<f80> +! CHECK-KIND10: %{{.*}} = fir.call @_FortranASpacing10(%[[a1]]) {{.*}}: (f80) -> f80 end function ! CHECK-LABEL: test_real2 diff --git flang/test/Lower/OpenMP/DelayedPrivatization/distribute-standalone-private.f90 flang/test/Lower/OpenMP/DelayedPrivatization/distribute-standalone-private.f90 index 9c2ff8b52848..8098cd53e9d2 100644 --- flang/test/Lower/OpenMP/DelayedPrivatization/distribute-standalone-private.f90 +++ flang/test/Lower/OpenMP/DelayedPrivatization/distribute-standalone-private.f90 @@ -16,8 +16,8 @@ subroutine standalone_distribute !$omp end teams end subroutine standalone_distribute -! CHECK: omp.private {type = private} @[[I_PRIVATIZER_SYM:.*]] : !fir.ref<i32> -! CHECK: omp.private {type = private} @[[VAR_PRIVATIZER_SYM:.*]] : !fir.ref<i32> +! CHECK: omp.private {type = private} @[[I_PRIVATIZER_SYM:.*]] : i32 +! CHECK: omp.private {type = private} @[[VAR_PRIVATIZER_SYM:.*]] : i32 ! 
CHECK-LABEL: func.func @_QPstandalone_distribute() { diff --git flang/test/Lower/OpenMP/DelayedPrivatization/equivalence.f90 flang/test/Lower/OpenMP/DelayedPrivatization/equivalence.f90 index 2307c0951379..721bfff012f1 100644 --- flang/test/Lower/OpenMP/DelayedPrivatization/equivalence.f90 +++ flang/test/Lower/OpenMP/DelayedPrivatization/equivalence.f90 @@ -13,13 +13,15 @@ subroutine private_common !$omp end parallel end subroutine -! CHECK: omp.private {type = firstprivate} @[[X_PRIVATIZER:.*]] : ![[X_TYPE:fir.ptr<f32>]] alloc { -! CHECK: ^bb0(%{{.*}}: ![[X_TYPE]]): -! CHECK: %[[PRIV_ALLOC:.*]] = fir.alloca f32 {bindc_name = "x", {{.*}}} -! CHECK: %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOC]] {{{.*}}} : (![[PRIV_TYPE:fir.ref<f32>]]) -> ({{.*}}) -! CHECK: %[[PRIV_CONV:.*]] = fir.convert %[[PRIV_DECL]]#0 : (![[PRIV_TYPE]]) -> ![[X_TYPE]] -! CHECK: omp.yield(%[[PRIV_CONV]] : ![[X_TYPE]]) -! CHECK: } copy { +! TODO: the copy region for pointers is incorrect. OpenMP 5.2 says +! +! > If the original list item has the POINTER attribute, the new list items +! > receive the same association status as the original list item +! +! Currently the original pointer is unconditionally loaded, which is undefined +! behavior if that pointer is not associated. + +! CHECK: omp.private {type = firstprivate} @[[X_PRIVATIZER:.*]] : ![[X_TYPE:fir.ptr<f32>]] copy { ! CHECK: ^bb0(%[[ORIG_PTR:.*]]: ![[X_TYPE]], %[[PRIV_REF:.*]]: ![[X_TYPE]]): ! CHECK: %[[ORIG_VAL:.*]] = fir.load %[[ORIG_PTR]] : !fir.ptr<f32> ! CHECK: hlfir.assign %[[ORIG_VAL]] to %[[PRIV_REF]] : f32, ![[X_TYPE]] diff --git flang/test/Lower/OpenMP/DelayedPrivatization/target-private-allocatable.f90 flang/test/Lower/OpenMP/DelayedPrivatization/target-private-allocatable.f90 index e11525c569ff..3e2b3e59018b 100644 --- flang/test/Lower/OpenMP/DelayedPrivatization/target-private-allocatable.f90 +++ flang/test/Lower/OpenMP/DelayedPrivatization/target-private-allocatable.f90 @@ -16,28 +16,25 @@ end subroutine target_allocatable ! CHECK-LABEL: omp.private {type = private} ! CHECK-SAME: @[[VAR_PRIVATIZER_SYM:.*]] : -! CHECK-SAME: [[TYPE:!fir.ref<!fir.box<!fir.heap<i32>>>]] alloc { -! CHECK: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]): -! CHECK: %[[PRIV_ALLOC:.*]] = fir.alloca [[DESC_TYPE:!fir.box<!fir.heap<i32>>]] {bindc_name = "alloc_var", {{.*}}} +! CHECK-SAME: [[DESC_TYPE:!fir.box<!fir.heap<i32>>]] init { +! CHECK: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE:!fir.ref<!fir.box<!fir.heap<i32>>>]], %[[PRIV_ALLOC:.*]]: [[TYPE]]): ! CHECK-NEXT: %[[PRIV_ARG_VAL:.*]] = fir.load %[[PRIV_ARG]] : [[TYPE]] ! CHECK-NEXT: %[[PRIV_ARG_BOX:.*]] = fir.box_addr %[[PRIV_ARG_VAL]] : ([[DESC_TYPE]]) -> !fir.heap<i32> ! CHECK-NEXT: %[[PRIV_ARG_ADDR:.*]] = fir.convert %[[PRIV_ARG_BOX]] : (!fir.heap<i32>) -> i64 ! CHECK-NEXT: %[[C0:.*]] = arith.constant 0 : i64 -! CHECK-NEXT: %[[ALLOC_COND:.*]] = arith.cmpi ne, %[[PRIV_ARG_ADDR]], %[[C0]] : i64 +! CHECK-NEXT: %[[ALLOC_COND:.*]] = arith.cmpi eq, %[[PRIV_ARG_ADDR]], %[[C0]] : i64 ! CHECK-NEXT: fir.if %[[ALLOC_COND]] { -! CHECK: %[[PRIV_ALLOCMEM:.*]] = fir.allocmem i32 {fir.must_be_heap = true, {{.*}}} +! CHECK-NEXT: %[[ZERO_BOX:.*]] = fir.embox %[[PRIV_ARG_BOX]] : (!fir.heap<i32>) -> [[DESC_TYPE]] +! CHECK-NEXT: fir.store %[[ZERO_BOX]] to %[[PRIV_ALLOC]] : [[TYPE]] +! CHECK-NEXT: } else { +! CHECK-NEXT: %[[PRIV_ALLOCMEM:.*]] = fir.allocmem i32 ! CHECK-NEXT: %[[PRIV_ALLOCMEM_BOX:.*]] = fir.embox %[[PRIV_ALLOCMEM]] : (!fir.heap<i32>) -> [[DESC_TYPE]] ! CHECK-NEXT: fir.store %[[PRIV_ALLOCMEM_BOX]] to %[[PRIV_ALLOC]] : [[TYPE]] -! 
CHECK-NEXT: } else { -! CHECK-NEXT: %[[ZERO_BITS:.*]] = fir.zero_bits !fir.heap<i32> -! CHECK-NEXT: %[[ZERO_BOX:.*]] = fir.embox %[[ZERO_BITS]] : (!fir.heap<i32>) -> [[DESC_TYPE]] -! CHECK-NEXT: fir.store %[[ZERO_BOX]] to %[[PRIV_ALLOC]] : [[TYPE]] ! CHECK-NEXT: } -! CHECK-NEXT: %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOC]] -! CHECK-NEXT: omp.yield(%[[PRIV_DECL]]#0 : [[TYPE]]) +! CHECK-NEXT: omp.yield(%[[PRIV_ALLOC]] : [[TYPE]]) ! CHECK-NEXT: } dealloc { ! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]): @@ -49,12 +46,7 @@ end subroutine target_allocatable ! CHECK-NEXT: %[[PRIV_NULL_COND:.*]] = arith.cmpi ne, %[[PRIV_ADDR_I64]], %[[C0]] : i64 ! CHECK-NEXT: fir.if %[[PRIV_NULL_COND]] { -! CHECK: %[[PRIV_VAL_2:.*]] = fir.load %[[PRIV_ARG]] -! CHECK-NEXT: %[[PRIV_ADDR_2:.*]] = fir.box_addr %[[PRIV_VAL_2]] -! CHECK-NEXT: fir.freemem %[[PRIV_ADDR_2]] -! CHECK-NEXT: %[[ZEROS:.*]] = fir.zero_bits -! CHECK-NEXT: %[[ZEROS_BOX:.*]] = fir.embox %[[ZEROS]] -! CHECK-NEXT: fir.store %[[ZEROS_BOX]] to %[[PRIV_ARG]] +! CHECK-NEXT: fir.freemem %[[PRIV_ADDR]] ! CHECK-NEXT: } ! CHECK-NEXT: omp.yield diff --git flang/test/Lower/OpenMP/DelayedPrivatization/target-private-multiple-variables.f90 flang/test/Lower/OpenMP/DelayedPrivatization/target-private-multiple-variables.f90 index f3f9bbe4a76a..5d31de10d74f 100644 --- flang/test/Lower/OpenMP/DelayedPrivatization/target-private-multiple-variables.f90 +++ flang/test/Lower/OpenMP/DelayedPrivatization/target-private-multiple-variables.f90 @@ -38,95 +38,81 @@ end subroutine target_allocatable ! ! CHECK: omp.private {type = private} ! CHECK-SAME: @[[CHAR_PRIVATIZER_SYM:[^[:space:]]+char_var[^[:space:]]+]] -! CHECK-SAME: : [[CHAR_TYPE:!fir.boxchar<1>]] alloc { +! CHECK-SAME: : [[CHAR_TYPE:!fir.boxchar<1>]] init { ! -! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[CHAR_TYPE]]): +! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[CHAR_TYPE]], %[[UNUSED:.*]]: [[CHAR_TYPE]]): ! CHECK-NEXT: %[[UNBOX:.*]]:2 = fir.unboxchar %[[PRIV_ARG]] -! CHECK: %[[PRIV_ALLOC:.*]] = fir.alloca !fir.char<1,?>(%[[UNBOX]]#1 : index) -! CHECK-NEXT: %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOC]] typeparams %[[UNBOX]]#1 -! CHECK-NEXT: omp.yield(%[[PRIV_DECL]]#0 : [[CHAR_TYPE]]) -! CHECK-NEXT: } +! CHECK: %[[PRIV_ALLOC:.*]] = fir.allocmem !fir.char<1,?>(%[[UNBOX]]#1 : index) +! CHECK: %[[BOXCHAR:.*]] = fir.emboxchar %[[PRIV_ALLOC]], %[[UNBOX]]#1 +! CHECK-NEXT: omp.yield(%[[BOXCHAR]] : [[CHAR_TYPE]]) +! CHECK-NEXT: } dealloc { ! Test the privatizer for `complex` ! ! CHECK: omp.private {type = private} ! CHECK-SAME: @[[COMP_PRIVATIZER_SYM:[^[:space:]]+comp_var[^[:space:]]+]] -! CHECK-SAME: : [[COMP_TYPE:!fir.ref<complex<f32>>]] alloc { -! -! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[COMP_TYPE]]): -! CHECK-NEXT: %[[PRIV_ALLOC:.*]] = fir.alloca complex<f32> -! CHECK-NEXT: %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOC]] -! CHECK-NEXT: omp.yield(%[[PRIV_DECL]]#0 : [[COMP_TYPE]]) -! CHECK-NEXT: } +! CHECK-SAME: : [[COMP_TYPE:complex<f32>]]{{$}} ! Test the privatizer for `real(:)` ! ! CHECK: omp.private {type = private} ! CHECK-SAME: @[[ARR_PRIVATIZER_SYM:[^[:space:]]+real_arr[^[:space:]]+]] -! CHECK-SAME: : [[ARR_TYPE:!fir.box<!fir.array<\?xf32>>]] alloc { -! -! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[ARR_TYPE]]): -! CHECK: %[[C0:.*]] = arith.constant 0 : index -! CHECK-NEXT: %[[DIMS:.*]]:3 = fir.box_dims %[[PRIV_ARG]], %[[C0]] : ([[ARR_TYPE]], index) -! CHECK: %[[PRIV_ALLOCA:.*]] = fir.alloca !fir.array<{{\?}}xf32> -! CHECK-NEXT: %[[SHAPE_SHIFT:.*]] = fir.shape_shift %[[DIMS]]#0, %[[DIMS]]#1 -! 
CHECK-NEXT: %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOCA]](%[[SHAPE_SHIFT]]) -! CHECK-NEXT: omp.yield(%[[PRIV_DECL]]#0 : [[ARR_TYPE]]) +! CHECK-SAME: : [[ARR_TYPE:!fir.box<!fir.array<\?xf32>>]] init { +! +! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: !fir.ref<[[ARR_TYPE]]>, %[[PRIV_ALLOC:.*]]: !fir.ref<[[ARR_TYPE]]>): +! CHECK-NEXT: %[[MOLD:.*]] = fir.load %[[PRIV_ARG]] +! CHECK-NEXT: %[[C0:.*]] = arith.constant 0 : index +! CHECK-NEXT: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[MOLD]], %[[C0]] +! CHECK-NEXT: %[[SHAPE:.*]] = fir.shape %[[BOX_DIMS]]#1 +! CHECK-NEXT: %[[DATA_ALLOC:.*]] = fir.allocmem !fir.array<?xf32>, %[[BOX_DIMS]]#1 +! CHECK-NEXT: %[[TRUE:.*]] = arith.constant true +! CHECK-NEXT: %[[DECL:.*]]:2 = hlfir.declare %[[DATA_ALLOC:.*]](%[[SHAPE]]) +! CHECK-NEXT: %[[C0_2:.*]] = arith.constant 0 : index +! CHECK-NEXT: %[[BOX_DIMS_2:.*]]:3 = fir.box_dims %[[MOLD]], %[[C0_2]] +! CHECK-NEXT: %[[SHAPE_SHIFT:.*]] = fir.shape_shift %[[BOX_DIMS_2]]#0, %[[BOX_DIMS_2]]#1 +! CHECK-NEXT: %[[BOX:.*]] = fir.rebox %[[DECL]]#0(%[[SHAPE_SHIFT]]) +! CHECK-NEXT: fir.store %[[BOX]] to %[[PRIV_ALLOC]] +! CHECK-NEXT: omp.yield(%[[PRIV_ALLOC]] : !fir.ref<[[ARR_TYPE]]>) ! CHECK-NEXT: } ! Test the privatizer for `real(:)`'s lower bound ! ! CHECK: omp.private {type = private} ! CHECK-SAME: @[[LB_PRIVATIZER_SYM:[^[:space:]]+lb[^[:space:]]+]] -! CHECK-SAME: : [[LB_TYPE:!fir.ref<i64>]] alloc { - -! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[LB_TYPE]]): -! CHECK-NEXT: %[[PRIV_ALLOCA:.*]] = fir.alloca i64 -! CHECK-NEXT: %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOCA]] -! CHECK-NEXT: omp.yield(%[[PRIV_DECL]]#0 : [[LB_TYPE]]) -! CHECK-NEXT: } +! CHECK-SAME: : [[LB_TYPE:i64]]{{$}} ! Test the privatizer for `real` ! ! CHECK: omp.private {type = private} ! CHECK-SAME: @[[REAL_PRIVATIZER_SYM:[^[:space:]]+real_var[^[:space:]]+]] -! CHECK-SAME: : [[REAL_TYPE:!fir.ref<f32>]] alloc { - -! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[REAL_TYPE]]): -! CHECK-NEXT: %[[PRIV_ALLOCA:.*]] = fir.alloca f32 -! CHECK-NEXT: %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOCA]] -! CHECK-NEXT: omp.yield(%[[PRIV_DECL]]#0 : [[REAL_TYPE]]) -! CHECK-NEXT: } +! CHECK-SAME: : [[REAL_TYPE:f32]]{{$}} ! Test the privatizer for `allocatable` ! ! CHECK: omp.private {type = private} ! CHECK-SAME: @[[ALLOC_PRIVATIZER_SYM:[^[:space:]]+alloc_var[^[:space:]]+]] -! CHECK-SAME: : [[ALLOC_TYPE:!fir.ref<!fir.box<!fir.heap<i32>>>]] alloc { +! CHECK-SAME: : [[ALLOC_TYPE:!fir.box<!fir.heap<i32>>]] init { ! -! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[ALLOC_TYPE]]): -! CHECK: %[[PRIV_ALLOC:.*]] = fir.alloca !fir.box<!fir.heap<i32>> +! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: !fir.ref<[[ALLOC_TYPE]]>, %[[PRIV_ALLOC:.*]]: !fir.ref<[[ALLOC_TYPE]]>): ! CHECK-NEXT: %[[PRIV_ARG_VAL:.*]] = fir.load %[[PRIV_ARG]] : !fir.ref<!fir.box<!fir.heap<i32>>> ! CHECK-NEXT: %[[PRIV_ARG_BOX:.*]] = fir.box_addr %[[PRIV_ARG_VAL]] : (!fir.box<!fir.heap<i32>>) -> !fir.heap<i32> ! CHECK-NEXT: %[[PRIV_ARG_ADDR:.*]] = fir.convert %[[PRIV_ARG_BOX]] : (!fir.heap<i32>) -> i64 ! CHECK-NEXT: %[[C0:.*]] = arith.constant 0 : i64 -! CHECK-NEXT: %[[ALLOC_COND:.*]] = arith.cmpi ne, %[[PRIV_ARG_ADDR]], %[[C0]] : i64 +! CHECK-NEXT: %[[ALLOC_COND:.*]] = arith.cmpi eq, %[[PRIV_ARG_ADDR]], %[[C0]] : i64 ! ! CHECK-NEXT: fir.if %[[ALLOC_COND]] { -! CHECK: %[[PRIV_ALLOCMEM:.*]] = fir.allocmem i32 {fir.must_be_heap = true, {{.*}}} +! CHECK-NEXT: %[[ZERO_BOX:.*]] = fir.embox %[[PRIV_ARG_BOX]] : (!fir.heap<i32>) -> !fir.box<!fir.heap<i32>> +! CHECK-NEXT: fir.store %[[ZERO_BOX]] to %[[PRIV_ALLOC]] : !fir.ref<!fir.box<!fir.heap<i32>>> +! 
CHECK-NEXT: } else { +! CHECK: %[[PRIV_ALLOCMEM:.*]] = fir.allocmem i32 ! CHECK-NEXT: %[[PRIV_ALLOCMEM_BOX:.*]] = fir.embox %[[PRIV_ALLOCMEM]] : (!fir.heap<i32>) -> !fir.box<!fir.heap<i32>> ! CHECK-NEXT: fir.store %[[PRIV_ALLOCMEM_BOX]] to %[[PRIV_ALLOC]] : !fir.ref<!fir.box<!fir.heap<i32>>> -! CHECK-NEXT: } else { -! CHECK-NEXT: %[[ZERO_BITS:.*]] = fir.zero_bits !fir.heap<i32> -! CHECK-NEXT: %[[ZERO_BOX:.*]] = fir.embox %[[ZERO_BITS]] : (!fir.heap<i32>) -> !fir.box<!fir.heap<i32>> -! CHECK-NEXT: fir.store %[[ZERO_BOX]] to %[[PRIV_ALLOC]] : !fir.ref<!fir.box<!fir.heap<i32>>> ! CHECK-NEXT: } ! -! CHECK-NEXT: %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOC]] -! CHECK-NEXT: omp.yield(%[[PRIV_DECL]]#0 : [[ALLOC_TYPE]]) +! CHECK-NEXT: omp.yield(%[[PRIV_ALLOC]] : !fir.ref<[[ALLOC_TYPE]]>) ! ! CHECK-NEXT: } dealloc { -! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[ALLOC_TYPE]]): +! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: !fir.ref<[[ALLOC_TYPE]]>): ! ! CHECK-NEXT: %[[PRIV_VAL:.*]] = fir.load %[[PRIV_ARG]] ! CHECK-NEXT: %[[PRIV_ADDR:.*]] = fir.box_addr %[[PRIV_VAL]] @@ -135,12 +121,7 @@ end subroutine target_allocatable ! CHECK-NEXT: %[[PRIV_NULL_COND:.*]] = arith.cmpi ne, %[[PRIV_ADDR_I64]], %[[C0]] : i64 ! ! CHECK-NEXT: fir.if %[[PRIV_NULL_COND]] { -! CHECK: %[[PRIV_VAL_2:.*]] = fir.load %[[PRIV_ARG]] -! CHECK-NEXT: %[[PRIV_ADDR_2:.*]] = fir.box_addr %[[PRIV_VAL_2]] -! CHECK-NEXT: fir.freemem %[[PRIV_ADDR_2]] -! CHECK-NEXT: %[[ZEROS:.*]] = fir.zero_bits -! CHECK-NEXT: %[[ZEROS_BOX:.*]] = fir.embox %[[ZEROS]] -! CHECK-NEXT: fir.store %[[ZEROS_BOX]] to %[[PRIV_ARG]] +! CHECK-NEXT: fir.freemem %[[PRIV_ADDR]] ! CHECK-NEXT: } ! ! CHECK-NEXT: omp.yield @@ -157,9 +138,9 @@ end subroutine target_allocatable ! CHECK: %[[CHAR_VAR_DECL:.*]]:2 = hlfir.declare %[[CHAR_VAR_ALLOC]] typeparams ! CHECK: %[[REAL_ARR_ALLOC:.*]] = fir.alloca !fir.array<?xf32>, {{.*}} {bindc_name = "real_arr", {{.*}}} ! CHECK: %[[REAL_ARR_DECL:.*]]:2 = hlfir.declare %[[REAL_ARR_ALLOC]]({{.*}}) +! CHECK: fir.store %[[REAL_ARR_DECL]]#0 to %[[REAL_ARR_DESC_ALLOCA]] : !fir.ref<!fir.box<!fir.array<?xf32>>> ! CHECK: %[[MAPPED_MI0:.*]] = omp.map.info var_ptr(%[[MAPPED_DECL]]#1 : !fir.ref<i32>, i32) {{.*}} ! CHECK: %[[ALLOC_VAR_MAP:.*]] = omp.map.info var_ptr(%[[ALLOC_VAR_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<i32>>>, !fir.box<!fir.heap<i32>>) -! CHECK: fir.store %[[REAL_ARR_DECL]]#0 to %[[REAL_ARR_DESC_ALLOCA]] : !fir.ref<!fir.box<!fir.array<?xf32>>> ! CHECK: %[[REAL_ARR_DESC_MAP:.*]] = omp.map.info var_ptr(%[[REAL_ARR_DESC_ALLOCA]] : !fir.ref<!fir.box<!fir.array<?xf32>>>, !fir.box<!fir.array<?xf32>>) ! CHECK: fir.store %[[CHAR_VAR_DECL]]#0 to %[[CHAR_VAR_DESC_ALLOCA]] : !fir.ref<!fir.boxchar<1>> ! CHECK: %[[CHAR_VAR_DESC_MAP:.*]] = omp.map.info var_ptr(%[[CHAR_VAR_DESC_ALLOCA]] : !fir.ref<!fir.boxchar<1>>, !fir.boxchar<1>) @@ -174,16 +155,15 @@ end subroutine target_allocatable ! CHECK-SAME: @[[ALLOC_PRIVATIZER_SYM]] %{{[^[:space:]]+}}#0 -> %[[ALLOC_ARG:[^,]+]] [map_idx=1], ! CHECK-SAME: @[[REAL_PRIVATIZER_SYM]] %{{[^[:space:]]+}}#0 -> %[[REAL_ARG:[^,]+]], ! CHECK-SAME: @[[LB_PRIVATIZER_SYM]] %{{[^[:space:]]+}}#0 -> %[[LB_ARG:[^,]+]], -! CHECK-SAME: @[[ARR_PRIVATIZER_SYM]] %{{[^[:space:]]+}}#0 -> %[[ARR_ARG:[^,]+]] [map_idx=2], +! CHECK-SAME: @[[ARR_PRIVATIZER_SYM]] %{{[^[:space:]]+}} -> %[[ARR_ARG:[^,]+]] [map_idx=2], ! CHECK-SAME: @[[COMP_PRIVATIZER_SYM]] %{{[^[:space:]]+}}#0 -> %[[COMP_ARG:[^,]+]], ! CHECK-SAME: @[[CHAR_PRIVATIZER_SYM]] %{{[^[:space:]]+}}#0 -> %[[CHAR_ARG:[^,]+]] [map_idx=3] : -! 
CHECK-SAME: !fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<f32>, !fir.ref<i64>, !fir.box<!fir.array<?xf32>>, !fir.ref<complex<f32>>, !fir.boxchar<1>) { +! CHECK-SAME: !fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<f32>, !fir.ref<i64>, !fir.ref<!fir.box<!fir.array<?xf32>>>, !fir.ref<complex<f32>>, !fir.boxchar<1>) { ! CHECK-NOT: fir.alloca ! CHECK: hlfir.declare %[[ALLOC_ARG]] ! CHECK: hlfir.declare %[[REAL_ARG]] ! CHECK: hlfir.declare %[[LB_ARG]] -! CHECK: %[[ARR_ARG_ADDR:.*]] = fir.box_addr %[[ARR_ARG]] -! CHECK: hlfir.declare %[[ARR_ARG_ADDR]] +! CHECK: hlfir.declare %[[ARR_ARG]] ! CHECK: hlfir.declare %[[COMP_ARG]] ! CHECK: %[[CHAR_ARG_UNBOX:.*]]:2 = fir.unboxchar %[[CHAR_ARG]] ! CHECK: hlfir.declare %[[CHAR_ARG_UNBOX]] diff --git flang/test/Lower/OpenMP/DelayedPrivatization/target-private-simple.f90 flang/test/Lower/OpenMP/DelayedPrivatization/target-private-simple.f90 index 3c6836e81abe..5abf2cbb15c9 100644 --- flang/test/Lower/OpenMP/DelayedPrivatization/target-private-simple.f90 +++ flang/test/Lower/OpenMP/DelayedPrivatization/target-private-simple.f90 @@ -15,12 +15,7 @@ subroutine target_simple end subroutine target_simple ! CHECK-LABEL: omp.private {type = private} -! CHECK-SAME: @[[VAR_PRIVATIZER_SYM:.*]] : !fir.ref<i32> alloc { -! CHECK: ^bb0(%[[PRIV_ARG:.*]]: !fir.ref<i32>): -! CHECK: %[[PRIV_ALLOC:.*]] = fir.alloca i32 {bindc_name = "simple_var", {{.*}}} -! CHECK: %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOC]] -! CHECK: omp.yield(%[[PRIV_DECL]]#0 : !fir.ref<i32>) -! CHECK: } +! CHECK-SAME: @[[VAR_PRIVATIZER_SYM:.*]] : i32 ! CHECK-LABEL: func.func @_QPtarget_simple() { ! CHECK: %[[VAR_ALLOC:.*]] = fir.alloca i32 {bindc_name = "simple_var", {{.*}}} diff --git flang/test/Lower/OpenMP/DelayedPrivatization/wsloop.f90 flang/test/Lower/OpenMP/DelayedPrivatization/wsloop.f90 index 66fd120085c7..65c218fe9f77 100644 --- flang/test/Lower/OpenMP/DelayedPrivatization/wsloop.f90 +++ flang/test/Lower/OpenMP/DelayedPrivatization/wsloop.f90 @@ -13,8 +13,8 @@ subroutine wsloop_private end do end subroutine wsloop_private -! CHECK: omp.private {type = private} @[[I_PRIVATIZER:.*i_private_ref_i32]] -! CHECK: omp.private {type = firstprivate} @[[X_PRIVATIZER:.*x_firstprivate_ref_i32]] +! CHECK: omp.private {type = private} @[[I_PRIVATIZER:.*i_private_i32]] +! CHECK: omp.private {type = firstprivate} @[[X_PRIVATIZER:.*x_firstprivate_i32]] ! CHECK: func.func @{{.*}}() { ! 
CHECK: %[[I_DECL:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "{{.*}}i"} diff --git flang/test/Lower/OpenMP/Todo/metadirective-exec.f90 flang/test/Lower/OpenMP/Todo/metadirective-exec.f90 new file mode 100644 index 000000000000..2e160a189661 --- /dev/null +++ flang/test/Lower/OpenMP/Todo/metadirective-exec.f90 @@ -0,0 +1,9 @@ +!RUN: %not_todo_cmd bbc -emit-hlfir -fopenmp -fopenmp-version=52 -o - %s 2>&1 | FileCheck %s +!RUN: %not_todo_cmd %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=52 -o - %s 2>&1 | FileCheck %s + +!CHECK: not yet implemented: METADIRECTIVE +subroutine f00 + continue + !Executable + !$omp metadirective when(user={condition(.true.)}: nothing) +end diff --git flang/test/Lower/OpenMP/Todo/metadirective-spec.f90 flang/test/Lower/OpenMP/Todo/metadirective-spec.f90 new file mode 100644 index 000000000000..a00612a92218 --- /dev/null +++ flang/test/Lower/OpenMP/Todo/metadirective-spec.f90 @@ -0,0 +1,9 @@ +!RUN: %not_todo_cmd bbc -emit-hlfir -fopenmp -fopenmp-version=52 -o - %s 2>&1 | FileCheck %s +!RUN: %not_todo_cmd %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=52 -o - %s 2>&1 | FileCheck %s + +!CHECK: not yet implemented: METADIRECTIVE +subroutine f00 + !Specification + !$omp metadirective when(user={condition(.true.)}: nothing) + implicit none +end diff --git flang/test/Lower/OpenMP/cfg-conversion-omp.private.f90 flang/test/Lower/OpenMP/cfg-conversion-omp.private.f90 index 44036492f559..8b8adf2b140c 100644 --- flang/test/Lower/OpenMP/cfg-conversion-omp.private.f90 +++ flang/test/Lower/OpenMP/cfg-conversion-omp.private.f90 @@ -21,34 +21,27 @@ subroutine delayed_privatization_allocatable end subroutine ! CFGConv-LABEL: omp.private {type = private} -! CFGConv-SAME: @[[PRIVATIZER_SYM:.*]] : [[TYPE:!fir.ref<!fir.box<!fir.heap<i32>>>]] alloc { +! CFGConv-SAME: @[[PRIVATIZER_SYM:.*]] : [[BOX_TYPE:!fir.box<!fir.heap<i32>>]] init { -! CFGConv-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]): - -! CFGConv-NEXT: %[[PRIV_ALLOC:.*]] = fir.alloca !fir.box<!fir.heap<i32>> {bindc_name = "var1", pinned, uniq_name = "_QFdelayed_privatization_allocatableEvar1"} +! CFGConv-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE:!fir.ref<!fir.box<!fir.heap<i32>>>]], %[[PRIV_ALLOC:.*]]: [[TYPE]]): ! CFGConv-NEXT: %[[PRIV_ARG_VAL:.*]] = fir.load %[[PRIV_ARG]] : !fir.ref<!fir.box<!fir.heap<i32>>> ! CFGConv-NEXT: %[[PRIV_ARG_BOX:.*]] = fir.box_addr %[[PRIV_ARG_VAL]] : (!fir.box<!fir.heap<i32>>) -> !fir.heap<i32> ! CFGConv-NEXT: %[[PRIV_ARG_ADDR:.*]] = fir.convert %[[PRIV_ARG_BOX]] : (!fir.heap<i32>) -> i64 ! CFGConv-NEXT: %[[C0:.*]] = arith.constant 0 : i64 -! CFGConv-NEXT: %[[ALLOC_COND:.*]] = arith.cmpi ne, %[[PRIV_ARG_ADDR]], %[[C0]] : i64 -! CFGConv-NEXT: cf.cond_br %[[ALLOC_COND]], ^[[ALLOC_MEM_BB:.*]], ^[[ZERO_MEM_BB:.*]] -! CFGConv-NEXT: ^[[ALLOC_MEM_BB]]: -! CFGConv: fir.allocmem +! CFGConv-NEXT: %[[ALLOC_COND:.*]] = arith.cmpi eq, %[[PRIV_ARG_ADDR]], %[[C0]] : i64 +! CFGConv-NEXT: cf.cond_br %[[ALLOC_COND]], ^[[ZERO_MEM_BB:.*]], ^[[ALLOC_MEM_BB:.*]] +! CFGConv-NEXT: ^[[ZERO_MEM_BB]]: ! CFGConv: cf.br ^[[DECL_BB:.*]] -! CFGConv: ^[[ZERO_MEM_BB]]: -! CFGConv-NEXT: fir.zero_bits +! CFGConv: ^[[ALLOC_MEM_BB]]: +! CFGConv: fir.allocmem ! CFGConv: cf.br ^[[DECL_BB:.*]] ! CFGConv-NEXT: ^[[DECL_BB]]: -! CFGConv-NEXT: hlfir.declare ! CFGConv-NEXT: omp.yield ! LLVMDialect-LABEL: omp.private {type = private} -! LLVMDialect-SAME: @[[PRIVATIZER_SYM:.*]] : [[TYPE:!llvm.ptr]] alloc { - -! LLVMDialect-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]): -! LLVMDialect: llvm.alloca +! 
LLVMDialect-SAME: @[[PRIVATIZER_SYM:.*]] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> init { +! LLVMDialect-NEXT: ^bb0(%[[PRIV_ARG:.*]]: !llvm.ptr, %[[PRIV_ALLOC:.*]]: !llvm.ptr): ! LLVMDialect: llvm.call @malloc - ! LLVMDialect-NOT: hlfir.declare diff --git flang/test/Lower/OpenMP/copyprivate.f90 flang/test/Lower/OpenMP/copyprivate.f90 index 761e6190ed6e..4c3ed9389369 100644 --- flang/test/Lower/OpenMP/copyprivate.f90 +++ flang/test/Lower/OpenMP/copyprivate.f90 @@ -14,13 +14,13 @@ !CHECK-DAG: func private @_copy_c16x8(%{{.*}}: !fir.ref<!fir.char<2,8>>, %{{.*}}: !fir.ref<!fir.char<2,8>>) !CHECK-DAG: func private @_copy_box_Uxi32(%{{.*}}: !fir.ref<!fir.box<!fir.array<?xi32>>>, %{{.*}}: !fir.ref<!fir.box<!fir.array<?xi32>>>) -!CHECK-DAG: func private @_copy_10xi32(%{{.*}}: !fir.ref<!fir.array<10xi32>>, %{{.*}}: !fir.ref<!fir.array<10xi32>>) -!CHECK-DAG: func private @_copy_3x4xi32(%{{.*}}: !fir.ref<!fir.array<3x4xi32>>, %{{.*}}: !fir.ref<!fir.array<3x4xi32>>) -!CHECK-DAG: func private @_copy_10xf32(%{{.*}}: !fir.ref<!fir.array<10xf32>>, %{{.*}}: !fir.ref<!fir.array<10xf32>>) -!CHECK-DAG: func private @_copy_3x4xz32(%{{.*}}: !fir.ref<!fir.array<3x4xcomplex<f32>>>, %{{.*}}: !fir.ref<!fir.array<3x4xcomplex<f32>>>) -!CHECK-DAG: func private @_copy_10xl32(%{{.*}}: !fir.ref<!fir.array<10x!fir.logical<4>>>, %{{.*}}: !fir.ref<!fir.array<10x!fir.logical<4>>>) -!CHECK-DAG: func private @_copy_3xc8x8(%{{.*}}: !fir.ref<!fir.array<3x!fir.char<1,8>>>, %{{.*}}: !fir.ref<!fir.array<3x!fir.char<1,8>>>) -!CHECK-DAG: func private @_copy_3xc16x5(%{{.*}}: !fir.ref<!fir.array<3x!fir.char<2,5>>>, %{{.*}}: !fir.ref<!fir.array<3x!fir.char<2,5>>>) +!CHECK-DAG: func private @_copy_box_10xi32(%{{.*}}: !fir.ref<!fir.box<!fir.array<10xi32>>>, %{{.*}}: !fir.ref<!fir.box<!fir.array<10xi32>>>) +!CHECK-DAG: func private @_copy_box_3x4xi32(%{{.*}}: !fir.ref<!fir.box<!fir.array<3x4xi32>>>, %{{.*}}: !fir.ref<!fir.box<!fir.array<3x4xi32>>>) +!CHECK-DAG: func private @_copy_box_10xf32(%{{.*}}: !fir.ref<!fir.box<!fir.array<10xf32>>>, %{{.*}}: !fir.ref<!fir.box<!fir.array<10xf32>>>) +!CHECK-DAG: func private @_copy_box_3x4xz32(%{{.*}}: !fir.ref<!fir.box<!fir.array<3x4xcomplex<f32>>>>, %{{.*}}: !fir.ref<!fir.box<!fir.array<3x4xcomplex<f32>>>>) +!CHECK-DAG: func private @_copy_box_10xl32(%{{.*}}: !fir.ref<!fir.box<!fir.array<10x!fir.logical<4>>>>, %{{.*}}: !fir.ref<!fir.box<!fir.array<10x!fir.logical<4>>>>) +!CHECK-DAG: func private @_copy_box_3xc8x8(%{{.*}}: !fir.ref<!fir.box<!fir.array<3x!fir.char<1,8>>>>, %{{.*}}: !fir.ref<!fir.box<!fir.array<3x!fir.char<1,8>>>>) +!CHECK-DAG: func private @_copy_box_3xc16x5(%{{.*}}: !fir.ref<!fir.box<!fir.array<3x!fir.char<2,5>>>>, %{{.*}}: !fir.ref<!fir.box<!fir.array<3x!fir.char<2,5>>>>) !CHECK-DAG: func private @_copy_rec__QFtest_dtTdt(%{{.*}}: !fir.ref<!fir.type<_QFtest_dtTdt{i:i32,r:f32}>>, %{{.*}}: !fir.ref<!fir.type<_QFtest_dtTdt{i:i32,r:f32}>>) !CHECK-DAG: func private @_copy_box_heap_Uxi32(%{{.*}}: !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, %{{.*}}: !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) @@ -95,20 +95,16 @@ end subroutine !CHECK-LABEL: func @_QPtest_array !CHECK: omp.parallel -!CHECK: %[[A:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFtest_arrayEa"} : (!fir.box<!fir.array<?xi32>>, !fir.shift<1>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) -!CHECK: %[[I1:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFtest_arrayEi1"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>) 
-!CHECK: %[[I2:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFtest_arrayEi2"} : (!fir.ref<!fir.array<3x4xi32>>, !fir.shape<2>) -> (!fir.ref<!fir.array<3x4xi32>>, !fir.ref<!fir.array<3x4xi32>>)
-!CHECK: %[[I3:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFtest_arrayEi3"} : (!fir.ref<!fir.array<?xi32>>, !fir.shapeshift<1>) -> (!fir.box<!fir.array<?xi32>>, !fir.ref<!fir.array<?xi32>>)
-!CHECK: %[[R1:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFtest_arrayEr1"} : (!fir.ref<!fir.array<10xf32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>)
-!CHECK: %[[C1:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFtest_arrayEc1"} : (!fir.ref<!fir.array<3x4xcomplex<f32>>>, !fir.shape<2>) -> (!fir.ref<!fir.array<3x4xcomplex<f32>>>, !fir.ref<!fir.array<3x4xcomplex<f32>>>)
-!CHECK: %[[L1:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFtest_arrayEl1"} : (!fir.ref<!fir.array<10x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10x!fir.logical<4>>>, !fir.ref<!fir.array<10x!fir.logical<4>>>)
-!CHECK: %[[S1:.*]]:2 = hlfir.declare {{.*}} {uniq_name = "_QFtest_arrayEs1"} : (!fir.ref<!fir.array<3x!fir.char<1,8>>>, !fir.shape<1>, index) -> (!fir.ref<!fir.array<3x!fir.char<1,8>>>, !fir.ref<!fir.array<3x!fir.char<1,8>>>)
-!CHECK: %[[S2:.*]]:2 = hlfir.declare {{.*}} {uniq_name = "_QFtest_arrayEs2"} : (!fir.ref<!fir.array<3x!fir.char<2,5>>>, !fir.shape<1>, index) -> (!fir.ref<!fir.array<3x!fir.char<2,5>>>, !fir.ref<!fir.array<3x!fir.char<2,5>>>)
-!CHECK: %[[A_REF:.*]] = fir.alloca !fir.box<!fir.array<?xi32>>
-!CHECK: fir.store %[[A]]#0 to %[[A_REF]] : !fir.ref<!fir.box<!fir.array<?xi32>>>
-!CHECK: %[[I3_REF:.*]] = fir.alloca !fir.box<!fir.array<?xi32>>
-!CHECK: fir.store %[[I3]]#0 to %[[I3_REF]] : !fir.ref<!fir.box<!fir.array<?xi32>>>
-!CHECK: omp.single copyprivate(%[[A_REF]] -> @_copy_box_Uxi32 : !fir.ref<!fir.box<!fir.array<?xi32>>>, %[[I1]]#0 -> @_copy_10xi32 : !fir.ref<!fir.array<10xi32>>, %[[I2]]#0 -> @_copy_3x4xi32 : !fir.ref<!fir.array<3x4xi32>>, %[[I3_REF]] -> @_copy_box_Uxi32 : !fir.ref<!fir.box<!fir.array<?xi32>>>, %[[R1]]#0 -> @_copy_10xf32 : !fir.ref<!fir.array<10xf32>>, %[[C1]]#0 -> @_copy_3x4xz32 : !fir.ref<!fir.array<3x4xcomplex<f32>>>, %[[L1]]#0 -> @_copy_10xl32 : !fir.ref<!fir.array<10x!fir.logical<4>>>, %[[S1]]#0 -> @_copy_3xc8x8 : !fir.ref<!fir.array<3x!fir.char<1,8>>>, %[[S2]]#0 -> @_copy_3xc16x5 : !fir.ref<!fir.array<3x!fir.char<2,5>>>)
+!CHECK: %[[A:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtest_arrayEa"}
+!CHECK: %[[I1:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtest_arrayEi1"}
+!CHECK: %[[I2:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtest_arrayEi2"}
+!CHECK: %[[I3:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtest_arrayEi3"}
+!CHECK: %[[R1:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtest_arrayEr1"}
+!CHECK: %[[C1:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtest_arrayEc1"}
+!CHECK: %[[L1:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtest_arrayEl1"}
+!CHECK: %[[S1:.*]]:2 = hlfir.declare {{.*}} {uniq_name = "_QFtest_arrayEs1"}
+!CHECK: %[[S2:.*]]:2 = hlfir.declare {{.*}} {uniq_name = "_QFtest_arrayEs2"}
+!CHECK: omp.single copyprivate(%[[A]]#0 -> @_copy_box_Uxi32 : {{.*}}, %[[I1]]#0 -> @_copy_box_10xi32 : {{.*}}, %[[I2]]#0 -> @_copy_box_3x4xi32 : {{.*}}, %[[I3]]#0 -> @_copy_box_Uxi32 : {{.*}}, %[[R1]]#0 -> @_copy_box_10xf32 : {{.*}}, %[[C1]]#0 -> @_copy_box_3x4xz32 : {{.*}}, %[[L1]]#0 -> @_copy_box_10xl32 : {{.*}}, %[[S1]]#0 -> @_copy_box_3xc8x8 : {{.*}}, %[[S2]]#0 -> @_copy_box_3xc16x5 : {{.*}})
 subroutine test_array(a, n)
 integer :: a(:), n
 integer :: i1(10), i2(3, 4), i3(n)
diff --git flang/test/Lower/OpenMP/default-clause-byref.f90 flang/test/Lower/OpenMP/default-clause-byref.f90
index 654c13ada9e3..168aa1f5394a 100644
--- flang/test/Lower/OpenMP/default-clause-byref.f90
+++ flang/test/Lower/OpenMP/default-clause-byref.f90
@@ -7,57 +7,27 @@
 ! RUN: bbc -fopenmp -emit-hlfir --force-byref-reduction %s -o - \
 ! RUN: | FileCheck %s
-!CHECK: omp.private {type = firstprivate} @[[W_FIRSTPRIVATIZER:_QFEw_firstprivate_ref_i32]] : !fir.ref<i32> alloc {
-!CHECK: ^bb0(%{{.*}}: !fir.ref<i32>):
-!CHECK: %[[PRIV_W_ALLOC:.*]] = fir.alloca i32 {bindc_name = "w", {{.*}}}
-!CHECK: %[[PRIV_W_DECL:.*]]:2 = hlfir.declare %[[PRIV_W_ALLOC]] {uniq_name = "_QFEw"}
-!CHECK: omp.yield(%[[PRIV_W_DECL]]#0 : !fir.ref<i32>)
-!CHECK: } copy {
+!CHECK: omp.private {type = firstprivate} @[[W_FIRSTPRIVATIZER:_QFEw_firstprivate_i32]] : i32 copy {
 !CHECK: ^bb0(%[[ORIG_W:.*]]: !fir.ref<i32>, %[[PRIV_W:.*]]: !fir.ref<i32>):
 !CHECK: %[[ORIG_W_VAL:.*]] = fir.load %[[ORIG_W]]
 !CHECK: hlfir.assign %[[ORIG_W_VAL]] to %[[PRIV_W]]
 !CHECK: omp.yield(%[[PRIV_W]] : !fir.ref<i32>)
 !CHECK: }
-!CHECK: omp.private {type = firstprivate} @[[Y_FIRSTPRIVATIZER:_QFEy_firstprivate_ref_i32]] : !fir.ref<i32> alloc {
-!CHECK: ^bb0(%{{.*}}: !fir.ref<i32>):
-!CHECK: %[[PRIV_Y_ALLOC:.*]] = fir.alloca i32 {bindc_name = "y", {{.*}}}
-!CHECK: %[[PRIV_Y_DECL:.*]]:2 = hlfir.declare %[[PRIV_Y_ALLOC]] {uniq_name = "_QFEy"}
-!CHECK: omp.yield(%[[PRIV_Y_DECL]]#0 : !fir.ref<i32>)
-!CHECK: } copy {
+!CHECK: omp.private {type = firstprivate} @[[Y_FIRSTPRIVATIZER:_QFEy_firstprivate_i32]] : i32 copy {
 !CHECK: ^bb0(%[[ORIG_Y:.*]]: !fir.ref<i32>, %[[PRIV_Y:.*]]: !fir.ref<i32>):
 !CHECK: %[[ORIG_Y_VAL:.*]] = fir.load %[[ORIG_Y]]
 !CHECK: hlfir.assign %[[ORIG_Y_VAL]] to %[[PRIV_Y]]
 !CHECK: omp.yield(%[[PRIV_Y]] : !fir.ref<i32>)
 !CHECK: }
-!CHECK: omp.private {type = private} @[[X_PRIVATIZER:_QFEx_private_ref_i32]] : !fir.ref<i32> alloc {
-!CHECK: ^bb0(%{{.*}}: !fir.ref<i32>):
-!CHECK: %[[PRIV_X_ALLOC:.*]] = fir.alloca i32 {bindc_name = "x", {{.*}}}
-!CHECK: %[[PRIV_X_DECL:.*]]:2 = hlfir.declare %[[PRIV_X_ALLOC]] {uniq_name = "_QFEx"}
-!CHECK: omp.yield(%[[PRIV_X_DECL]]#0 : !fir.ref<i32>)
-!CHECK: }
+!CHECK: omp.private {type = private} @[[X_PRIVATIZER:_QFEx_private_i32]] : i32
-!CHECK: omp.private {type = private} @[[W_PRIVATIZER:_QFEw_private_ref_i32]] : !fir.ref<i32> alloc {
-!CHECK: ^bb0(%{{.*}}: !fir.ref<i32>):
-!CHECK: %[[PRIV_W_ALLOC:.*]] = fir.alloca i32 {bindc_name = "w", {{.*}}}
-!CHECK: %[[PRIV_W_DECL:.*]]:2 = hlfir.declare %[[PRIV_W_ALLOC]] {uniq_name = "_QFEw"}
-!CHECK: omp.yield(%[[PRIV_W_DECL]]#0 : !fir.ref<i32>)
-!CHECK: }
+!CHECK: omp.private {type = private} @[[W_PRIVATIZER:_QFEw_private_i32]] : i32
-!CHECK: omp.private {type = private} @[[Y_PRIVATIZER:_QFEy_private_ref_i32]] : !fir.ref<i32> alloc {
-!CHECK: ^bb0(%{{.*}}: !fir.ref<i32>):
-!CHECK: %[[PRIV_Y_ALLOC:.*]] = fir.alloca i32 {bindc_name = "y", {{.*}}}
-!CHECK: %[[PRIV_Y_DECL:.*]]:2 = hlfir.declare %[[PRIV_Y_ALLOC]] {uniq_name = "_QFEy"}
-!CHECK: omp.yield(%[[PRIV_Y_DECL]]#0 : !fir.ref<i32>)
-!CHECK: }
+!CHECK: omp.private {type = private} @[[Y_PRIVATIZER:_QFEy_private_i32]] : i32
-!CHECK: omp.private {type = firstprivate} @[[X_FIRSTPRIVATIZER:_QFEx_firstprivate_ref_i32]] : !fir.ref<i32> alloc {
-!CHECK: ^bb0(%{{.*}}: !fir.ref<i32>):
-!CHECK: %[[PRIV_X_ALLOC:.*]] = fir.alloca i32 {bindc_name = "x", {{.*}}}
-!CHECK: %[[PRIV_X_DECL:.*]]:2 = hlfir.declare %[[PRIV_X_ALLOC]] {uniq_name = "_QFEx"}
-!CHECK: omp.yield(%[[PRIV_X_DECL]]#0 : !fir.ref<i32>)
-!CHECK: } copy {
+!CHECK: omp.private {type = firstprivate} @[[X_FIRSTPRIVATIZER:_QFEx_firstprivate_i32]] : i32 copy {
 !CHECK: ^bb0(%[[ORIG_X:.*]]: !fir.ref<i32>, %[[PRIV_X:.*]]: !fir.ref<i32>):
 !CHECK: %[[ORIG_X_VAL:.*]] = fir.load %[[ORIG_X]]
 !CHECK: hlfir.assign %[[ORIG_X_VAL]] to %[[PRIV_X]]
diff --git flang/test/Lower/OpenMP/delayed-privatization-allocatable-array.f90 flang/test/Lower/OpenMP/delayed-privatization-allocatable-array.f90
index 759d80cf45b2..9b6dbabf0c6f 100644
--- flang/test/Lower/OpenMP/delayed-privatization-allocatable-array.f90
+++ flang/test/Lower/OpenMP/delayed-privatization-allocatable-array.f90
@@ -16,38 +16,35 @@ subroutine delayed_privatization_private(var1, l1)
 end subroutine
 ! CHECK-LABEL: omp.private {type = firstprivate}
-! CHECK-SAME: @[[PRIVATIZER_SYM:.*]] : [[TYPE:!fir.ref<!fir.box<!fir.heap<!fir.array<\?xi32>>>>]] alloc {
+! CHECK-SAME: @[[PRIVATIZER_SYM:.*]] : [[BOX_TYPE:!fir.box<!fir.heap<!fir.array<\?xi32>>>]] init {
-! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
-! CHECK-NEXT: %[[PRIV_ALLOC:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<{{\?}}xi32>>> {bindc_name = "var1", pinned, uniq_name = "_QFdelayed_privatization_privateEvar1"}
+! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE:!fir.ref<!fir.box<!fir.heap<!fir.array<\?xi32>>>>]], %[[PRIV_ALLOC:.*]]: [[TYPE]]):
 ! CHECK-NEXT: %[[PRIV_ARG_VAL:.*]] = fir.load %[[PRIV_ARG]]
 ! CHECK-NEXT: %[[PRIV_ARG_BOX:.*]] = fir.box_addr %[[PRIV_ARG_VAL]]
 ! CHECK-NEXT: %[[PRIV_ARG_ADDR:.*]] = fir.convert %[[PRIV_ARG_BOX]]
 ! CHECK-NEXT: %[[C0:.*]] = arith.constant 0 : i64
-! CHECK-NEXT: %[[ALLOC_COND:.*]] = arith.cmpi ne, %[[PRIV_ARG_ADDR]], %[[C0]] : i64
+! CHECK-NEXT: %[[ALLOC_COND:.*]] = arith.cmpi eq, %[[PRIV_ARG_ADDR]], %[[C0]] : i64
 ! CHECK-NEXT: fir.if %[[ALLOC_COND]] {
-! CHECK-NEXT: %[[PRIV_ARG_VAL:.*]] = fir.load %[[PRIV_ARG]] : [[TYPE]]
+! CHECK-NEXT: %[[C0_2:.*]] = arith.constant 0 : index
+! CHECK-NEXT: %[[SHAPE:.*]] = fir.shape %[[C0_2]]
+! CHECK-NEXT: %[[EMBOX_2:.*]] = fir.embox %[[PRIV_ARG_BOX]](%[[SHAPE]])
+! CHECK-NEXT: fir.store %[[EMBOX_2]] to %[[PRIV_ALLOC]]
+! CHECK-NEXT: } else {
 ! CHECK-NEXT: %[[C0:.*]] = arith.constant 0 : index
 ! CHECK-NEXT: %[[DIMS:.*]]:3 = fir.box_dims %[[PRIV_ARG_VAL]], %[[C0]]
-! CHECK-NEXT: fir.box_addr %[[PRIV_ARG_VAL]]
-! CHECK-NEXT: %[[C0_2:.*]] = arith.constant 0 : index
-! CHECK-NEXT: %[[CMP:.*]] = arith.cmpi sgt, %[[DIMS]]#1, %[[C0_2]] : index
-! CHECK-NEXT: %[[SELECT:.*]] = arith.select %[[CMP]], %[[DIMS]]#1, %[[C0_2]] : index
-! CHECK-NEXT: %[[MEM:.*]] = fir.allocmem !fir.array<?xi32>, %[[SELECT]]
-! CHECK-NEXT: %[[SHAPE_SHIFT:.*]] = fir.shape_shift %[[DIMS]]#0, %[[SELECT]] : (index, index) -> !fir.shapeshift<1>
-! CHECK-NEXT: %[[EMBOX:.*]] = fir.embox %[[MEM]](%[[SHAPE_SHIFT]])
+! CHECK-NEXT: %[[SHAPE:.*]] = fir.shape %[[DIMS]]#1
+! CHECK-NEXT: %[[MEM:.*]] = fir.allocmem !fir.array<?xi32>, %[[DIMS]]#1
+! CHECK-NEXT: %[[TRUE:.*]] = arith.constant true
+! CHECK-NEXT: %[[DECL:.*]]:2 = hlfir.declare %[[MEM]](%[[SHAPE]])
+! CHECK-NEXT: %[[C0_2:.*]] = arith.constant 0 : index
+! CHECK-NEXT: %[[DIMS_2:.*]]:3 = fir.box_dims %[[PRIV_ARG_VAL]], %[[C0_2]]
+! CHECK-NEXT: %[[SHAPE_SHIFT:.*]] = fir.shape_shift %[[DIMS_2]]#0, %[[DIMS_2]]#1
+! CHECK-NEXT: %[[EMBOX:.*]] = fir.rebox %[[DECL]]#0(%[[SHAPE_SHIFT]])
 ! CHECK-NEXT: fir.store %[[EMBOX]] to %[[PRIV_ALLOC]]
-! CHECK-NEXT: } else {
-! CHECK-NEXT: %[[ZEROS:.*]] = fir.zero_bits
-! CHECK-NEXT: %[[C0_3:.*]] = arith.constant 0 : index
-! CHECK-NEXT: %[[SHAPE:.*]] = fir.shape %[[C0_3]] : (index) -> !fir.shape<1>
-! CHECK-NEXT: %[[EMBOX_2:.*]] = fir.embox %[[ZEROS]](%[[SHAPE]])
-! CHECK-NEXT: fir.store %[[EMBOX_2]] to %[[PRIV_ALLOC]]
 ! CHECK-NEXT: }
-! CHECK-NEXT: hlfir.declare
 ! CHECK-NEXT: omp.yield
 ! CHECK-NEXT: } copy {
diff --git flang/test/Lower/OpenMP/delayed-privatization-allocatable-firstprivate.f90 flang/test/Lower/OpenMP/delayed-privatization-allocatable-firstprivate.f90
index b3a668018df1..01ca1073ae84 100644
--- flang/test/Lower/OpenMP/delayed-privatization-allocatable-firstprivate.f90
+++ flang/test/Lower/OpenMP/delayed-privatization-allocatable-firstprivate.f90
@@ -18,9 +18,9 @@ subroutine delayed_privatization_allocatable
 end subroutine
 ! CHECK-LABEL: omp.private {type = firstprivate}
-! CHECK-SAME: @[[PRIVATIZER_SYM:.*]] : [[TYPE:!fir.ref<!fir.box<!fir.heap<i32>>>]] alloc {
+! CHECK-SAME: @[[PRIVATIZER_SYM:.*]] : [[BOX_TYPE:!fir.box<!fir.heap<i32>>]] init {
-! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
+! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE:!fir.ref<!fir.box<!fir.heap<i32>>>]], %[[PRIV_ALLOC:.*]]: [[TYPE]]):
 ! CHECK: } copy {
 ! CHECK: ^bb0(%[[PRIV_ORIG_ARG:.*]]: [[TYPE]], %[[PRIV_PRIV_ARG:.*]]: [[TYPE]]):
diff --git flang/test/Lower/OpenMP/delayed-privatization-allocatable-private.f90 flang/test/Lower/OpenMP/delayed-privatization-allocatable-private.f90
index f1fae2540aa4..4ce66f52110e 100644
--- flang/test/Lower/OpenMP/delayed-privatization-allocatable-private.f90
+++ flang/test/Lower/OpenMP/delayed-privatization-allocatable-private.f90
@@ -15,30 +15,26 @@ subroutine delayed_privatization_allocatable
 end subroutine
 ! CHECK-LABEL: omp.private {type = private}
-! CHECK-SAME: @[[PRIVATIZER_SYM:.*]] : [[TYPE:!fir.ref<!fir.box<!fir.heap<i32>>>]] alloc {
+! CHECK-SAME: @[[PRIVATIZER_SYM:.*]] : [[BOX_TYPE:!fir.box<!fir.heap<i32>>]] init {
-! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
-
-! CHECK-NEXT: %[[PRIV_ALLOC:.*]] = fir.alloca !fir.box<!fir.heap<i32>> {bindc_name = "var1", pinned, uniq_name = "_QFdelayed_privatization_allocatableEvar1"}
+! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE:!fir.ref<!fir.box<!fir.heap<i32>>>]], %[[PRIV_ALLOC:.*]]: [[TYPE]]):
 ! CHECK-NEXT: %[[PRIV_ARG_VAL:.*]] = fir.load %[[PRIV_ARG]] : !fir.ref<!fir.box<!fir.heap<i32>>>
 ! CHECK-NEXT: %[[PRIV_ARG_BOX:.*]] = fir.box_addr %[[PRIV_ARG_VAL]] : (!fir.box<!fir.heap<i32>>) -> !fir.heap<i32>
 ! CHECK-NEXT: %[[PRIV_ARG_ADDR:.*]] = fir.convert %[[PRIV_ARG_BOX]] : (!fir.heap<i32>) -> i64
 ! CHECK-NEXT: %[[C0:.*]] = arith.constant 0 : i64
-! CHECK-NEXT: %[[ALLOC_COND:.*]] = arith.cmpi ne, %[[PRIV_ARG_ADDR]], %[[C0]] : i64
+! CHECK-NEXT: %[[ALLOC_COND:.*]] = arith.cmpi eq, %[[PRIV_ARG_ADDR]], %[[C0]] : i64
 ! CHECK-NEXT: fir.if %[[ALLOC_COND]] {
-! CHECK: %[[PRIV_ALLOCMEM:.*]] = fir.allocmem i32 {fir.must_be_heap = true, uniq_name = "_QFdelayed_privatization_allocatableEvar1.alloc"}
+! CHECK-NEXT: %[[ZERO_BOX:.*]] = fir.embox %[[PRIV_ARG_BOX]] : (!fir.heap<i32>) -> !fir.box<!fir.heap<i32>>
+! CHECK-NEXT: fir.store %[[ZERO_BOX]] to %[[PRIV_ALLOC]] : !fir.ref<!fir.box<!fir.heap<i32>>>
+! CHECK-NEXT: } else {
+! CHECK-NEXT: %[[PRIV_ALLOCMEM:.*]] = fir.allocmem i32
 ! CHECK-NEXT: %[[PRIV_ALLOCMEM_BOX:.*]] = fir.embox %[[PRIV_ALLOCMEM]] : (!fir.heap<i32>) -> !fir.box<!fir.heap<i32>>
 ! CHECK-NEXT: fir.store %[[PRIV_ALLOCMEM_BOX]] to %[[PRIV_ALLOC]] : !fir.ref<!fir.box<!fir.heap<i32>>>
-! CHECK-NEXT: } else {
-! CHECK-NEXT: %[[ZERO_BITS:.*]] = fir.zero_bits !fir.heap<i32>
-! CHECK-NEXT: %[[ZERO_BOX:.*]] = fir.embox %[[ZERO_BITS]] : (!fir.heap<i32>) -> !fir.box<!fir.heap<i32>>
-! CHECK-NEXT: fir.store %[[ZERO_BOX]] to %[[PRIV_ALLOC]] : !fir.ref<!fir.box<!fir.heap<i32>>>
 ! CHECK-NEXT: }
-! CHECK-NEXT: %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOC]]
-! CHECK-NEXT: omp.yield(%[[PRIV_DECL]]#0 : [[TYPE]])
+! CHECK-NEXT: omp.yield(%[[PRIV_ALLOC]] : [[TYPE]])
 ! CHECK-NEXT: } dealloc {
 ! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
@@ -50,12 +46,7 @@ end subroutine
 ! CHECK-NEXT: %[[PRIV_NULL_COND:.*]] = arith.cmpi ne, %[[PRIV_ADDR_I64]], %[[C0]] : i64
 ! CHECK-NEXT: fir.if %[[PRIV_NULL_COND]] {
-! CHECK: %[[PRIV_VAL_2:.*]] = fir.load %[[PRIV_ARG]]
-! CHECK-NEXT: %[[PRIV_ADDR_2:.*]] = fir.box_addr %[[PRIV_VAL_2]]
-! CHECK-NEXT: fir.freemem %[[PRIV_ADDR_2]]
-! CHECK-NEXT: %[[ZEROS:.*]] = fir.zero_bits
-! CHECK-NEXT: %[[ZEROS_BOX:.*]] = fir.embox %[[ZEROS]]
-! CHECK-NEXT: fir.store %[[ZEROS_BOX]] to %[[PRIV_ARG]]
+! CHECK-NEXT: fir.freemem %[[PRIV_ADDR]]
 ! CHECK-NEXT: }
 ! CHECK-NEXT: omp.yield
diff --git flang/test/Lower/OpenMP/delayed-privatization-array.f90 flang/test/Lower/OpenMP/delayed-privatization-array.f90
index 3d641a0d6968..95fa3f9e0305 100644
--- flang/test/Lower/OpenMP/delayed-privatization-array.f90
+++ flang/test/Lower/OpenMP/delayed-privatization-array.f90
@@ -29,20 +29,28 @@ subroutine delayed_privatization_private_1d(var1, l1, u1)
 end subroutine
 ! ONE_DIM-LABEL: omp.private {type = firstprivate}
-! ONE_DIM-SAME: @[[PRIVATIZER_SYM:.*]] : [[TYPE:!fir.box<!fir.array<\?xi32>>]] alloc {
-
-! ONE_DIM-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
-
-! ONE_DIM: %[[C0:.*]] = arith.constant 0 : index
-! ONE_DIM-NEXT: %[[DIMS:.*]]:3 = fir.box_dims %[[PRIV_ARG]], %[[C0]] : ([[TYPE]], index) -> (index, index, index)
-! ONE_DIM: %[[PRIV_ALLOCA:.*]] = fir.alloca !fir.array<{{\?}}xi32>
-! ONE_DIM-NEXT: %[[SHAPE_SHIFT:.*]] = fir.shape_shift %[[DIMS]]#0, %[[DIMS]]#1 : (index, index) -> !fir.shapeshift<1>
-! ONE_DIM-NEXT: %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOCA]](%[[SHAPE_SHIFT]]) {uniq_name = "_QFdelayed_privatization_private_1dEvar1"}
-! ONE_DIM-NEXT: omp.yield(%[[PRIV_DECL]]#0 : [[TYPE]])
+! ONE_DIM-SAME: @[[PRIVATIZER_SYM:.*]] : [[BOX_TYPE:!fir.box<!fir.array<\?xi32>>]] init {
+
+! ONE_DIM-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE:!fir.ref<!fir.box<!fir.array<\?xi32>>>]], %[[PRIV_BOX_ALLOC:.*]]: [[TYPE]]):
+
+! ONE_DIM-NEXT: %[[PRIV_ARG_VAL:.*]] = fir.load %[[PRIV_ARG]]
+! ONE_DIM-NEXT: %[[C0:.*]] = arith.constant 0 : index
+! ONE_DIM-NEXT: %[[DIMS:.*]]:3 = fir.box_dims %[[PRIV_ARG_VAL]], %[[C0]]
+! ONE_DIM-NEXT: %[[SHAPE:.*]] = fir.shape %[[DIMS]]#1
+! ONE_DIM-NEXT: %[[ARRAY_ALLOC:.*]] = fir.allocmem !fir.array<?xi32>, %[[DIMS]]#1
+! ONE_DIM-NEXT: %[[TRUE:.*]] = arith.constant true
+! ONE_DIM-NEXT: %[[DECL:.*]]:2 = hlfir.declare %[[ARRAY_ALLOC]](%[[SHAPE]])
+! ONE_DIM-NEXT: %[[C0_0:.*]] = arith.constant 0
+! ONE_DIM-NEXT: %[[DIMS2:.*]]:3 = fir.box_dims %[[PRIV_ARG_VAL]], %[[C0_0]]
+! ONE_DIM-NEXT: %[[SHAPE_SHIFT:.*]] = fir.shape_shift %[[DIMS2]]#0, %[[DIMS2]]#1
+! ONE_DIM-NEXT: %[[REBOX:.*]] = fir.rebox %[[DECL]]#0(%[[SHAPE_SHIFT]])
+! ONE_DIM-NEXT: fir.store %[[REBOX]] to %[[PRIV_BOX_ALLOC]]
+! ONE_DIM-NEXT: omp.yield(%[[PRIV_BOX_ALLOC]] : [[TYPE]])
 ! ONE_DIM-NEXT: } copy {
 ! ONE_DIM-NEXT: ^bb0(%[[PRIV_ORIG_ARG:.*]]: [[TYPE]], %[[PRIV_PRIV_ARG:.*]]: [[TYPE]]):
-! ONE_DIM-NEXT: hlfir.assign %[[PRIV_ORIG_ARG]] to %[[PRIV_PRIV_ARG]]
+! ONE_DIM-NEXT: %[[PRIV_ORIG_ARG_VAL:.*]] = fir.load %[[PRIV_ORIG_ARG:.*]] : [[TYPE]]
+! ONE_DIM-NEXT: hlfir.assign %[[PRIV_ORIG_ARG_VAL]] to %[[PRIV_PRIV_ARG]]
 ! ONE_DIM-NEXT: omp.yield(%[[PRIV_PRIV_ARG]] : [[TYPE]])
 ! ONE_DIM-NEXT: }
@@ -58,24 +66,31 @@ subroutine delayed_privatization_private_2d(var1, l1, u1, l2, u2)
 end subroutine
 ! TWO_DIM-LABEL: omp.private {type = firstprivate}
-! TWO_DIM-SAME: @[[PRIVATIZER_SYM:.*]] : [[TYPE:!fir.box<!fir.array<\?x\?xi32>>]] alloc {
-
-! TWO_DIM-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
-! TWO_DIM: %[[C0:.*]] = arith.constant 0 : index
-! TWO_DIM-NEXT: %[[DIMS0:.*]]:3 = fir.box_dims %[[PRIV_ARG]], %[[C0]] : ([[TYPE]], index) -> (index, index, index)
+! TWO_DIM-SAME: @[[PRIVATIZER_SYM:.*]] : [[BOX_TYPE:!fir.box<!fir.array<\?x\?xi32>>]] init {
+! TWO_DIM-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE:!fir.ref<!fir.box<!fir.array<\?x\?xi32>>>]], %[[PRIV_BOX_ALLOC:.*]]: [[TYPE]]):
+! TWO_DIM-NEXT: %[[PRIV_ARG_VAL:.*]] = fir.load %[[PRIV_ARG]]
+! TWO_DIM-NEXT: %[[C0:.*]] = arith.constant 0 : index
+! TWO_DIM-NEXT: %[[DIMS_0:.*]]:3 = fir.box_dims %[[PRIV_ARG_VAL]], %[[C0]]
 ! TWO_DIM-NEXT: %[[C1:.*]] = arith.constant 1 : index
-! TWO_DIM-NEXT: %[[DIMS1:.*]]:3 = fir.box_dims %[[PRIV_ARG]], %[[C1]] : ([[TYPE]], index) -> (index, index, index)
-
-! TWO_DIM-NEXT: %[[PRIV_ALLOCA:.*]] = fir.alloca !fir.array<{{\?}}x{{\?}}xi32>, %[[DIMS0]]#1, %[[DIMS1]]#1 {bindc_name = "var1", pinned, uniq_name = "_QFdelayed_privatization_private_2dEvar1"}
-! TWO_DIM-NEXT: %[[SHAPE_SHIFT:.*]] = fir.shape_shift %[[DIMS0]]#0, %[[DIMS0]]#1, %[[DIMS1]]#0, %[[DIMS1]]#1 : (index, index, index, index) -> !fir.shapeshift<2>
-
-! TWO_DIM-NEXT: %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOCA]](%[[SHAPE_SHIFT]]) {uniq_name = "_QFdelayed_privatization_private_2dEvar1"}
-! TWO_DIM-NEXT: omp.yield(%[[PRIV_DECL]]#0 : [[TYPE]])
+! TWO_DIM-NEXT: %[[DIMS_1:.*]]:3 = fir.box_dims %[[PRIV_ARG_VAL]], %[[C1]]
+! TWO_DIM-NEXT: %[[SHAPE:.*]] = fir.shape %[[DIMS_0]]#1, %[[DIMS_1]]#1
+! TWO_DIM-NEXT: %[[ARRAY_ALLOC:.*]] = fir.allocmem !fir.array<?x?xi32>, %[[DIMS_0]]#1, %[[DIMS_1]]#1
+! TWO_DIM-NEXT: %[[TRUE:.*]] = arith.constant true
+! TWO_DIM-NEXT: %[[DECL:.*]]:2 = hlfir.declare %[[ARRAY_ALLOC]](%[[SHAPE]])
+! TWO_DIM-NEXT: %[[C0_0:.*]] = arith.constant 0
+! TWO_DIM-NEXT: %[[DIMS2_0:.*]]:3 = fir.box_dims %[[PRIV_ARG_VAL]], %[[C0_0]]
+! TWO_DIM-NEXT: %[[C1_0:.*]] = arith.constant 1
+! TWO_DIM-NEXT: %[[DIMS2_1:.*]]:3 = fir.box_dims %[[PRIV_ARG_VAL]], %[[C1_0]]
+! TWO_DIM-NEXT: %[[SHAPE_SHIFT:.*]] = fir.shape_shift %[[DIMS2_0]]#0, %[[DIMS2_0]]#1, %[[DIMS2_1]]#0, %[[DIMS2_1]]#1
+! TWO_DIM-NEXT: %[[REBOX:.*]] = fir.rebox %[[DECL]]#0(%[[SHAPE_SHIFT]])
+! TWO_DIM-NEXT: fir.store %[[REBOX]] to %[[PRIV_BOX_ALLOC]]
+! TWO_DIM-NEXT: omp.yield(%[[PRIV_BOX_ALLOC]] : [[TYPE]])
 ! TWO_DIM-NEXT: } copy {
 ! TWO_DIM-NEXT: ^bb0(%[[PRIV_ORIG_ARG:.*]]: [[TYPE]], %[[PRIV_PRIV_ARG:.*]]: [[TYPE]]):
-! TWO_DIM-NEXT: hlfir.assign %[[PRIV_ORIG_ARG]] to %[[PRIV_PRIV_ARG]]
+! TWO_DIM-NEXT: %[[PRIV_ORIG_ARG_VAL:.*]] = fir.load %[[PRIV_ORIG_ARG:.*]] : [[TYPE]]
+! TWO_DIM-NEXT: hlfir.assign %[[PRIV_ORIG_ARG_VAL]] to %[[PRIV_PRIV_ARG]]
 ! TWO_DIM-NEXT: omp.yield(%[[PRIV_PRIV_ARG]] : [[TYPE]])
 ! TWO_DIM-NEXT: }
@@ -90,11 +105,18 @@ program main
 end program
 ! ONE_DIM_DEFAULT_LB-LABEL: omp.private {type = private}
-! ONE_DIM_DEFAULT_LB-SAME: @[[PRIVATIZER_SYM:.*]] : [[TYPE:!fir.ref<!fir.array<10xi32>>]] alloc {
-
-! ONE_DIM_DEFAULT_LB-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
-
-! ONE_DIM_DEFAULT_LB: %[[C10:.*]] = arith.constant 10 : index
-! ONE_DIM_DEFAULT_LB: %[[PRIV_ALLOCA:.*]] = fir.alloca !fir.array<10xi32>
-! ONE_DIM_DEFAULT_LB: %[[SHAPE:.*]] = fir.shape %[[C10]] : (index) -> !fir.shape<1>
-! ONE_DIM_DEFAULT_LB: hlfir.declare %[[PRIV_ALLOCA]](%[[SHAPE]])
+! ONE_DIM_DEFAULT_LB-SAME: @[[PRIVATIZER_SYM:.*]] : [[BOX_TYPE:!fir.box<!fir.array<10xi32>>]] init {
+
+! ONE_DIM_DEFAULT_LB-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE:!fir.ref<!fir.box<!fir.array<10xi32>>>]], %[[PRIV_BOX_ALLOC:.*]]: [[TYPE]]):
+! ONE_DIM_DEFAULT_LB-NEXT: %[[PRIV_ARG_VAL:.*]] = fir.load %[[PRIV_ARG]]
+! ONE_DIM_DEFAULT_LB-NEXT: %[[C10:.*]] = arith.constant 10 : index
+! ONE_DIM_DEFAULT_LB-NEXT: %[[SHAPE:.*]] = fir.shape %[[C10]]
+! ONE_DIM_DEFAULT_LB-NEXT: %[[ARRAY_ALLOC:.*]] = fir.allocmem !fir.array<10xi32>
+! ONE_DIM_DEFAULT_LB-NEXT: %[[TRUE:.*]] = arith.constant true
+! ONE_DIM_DEFAULT_LB-NEXT: %[[DECL:.*]]:2 = hlfir.declare %[[ARRAY_ALLOC]](%[[SHAPE]])
+! ONE_DIM_DEFAULT_LB-NEXT: %[[C0_0:.*]] = arith.constant 0
+! ONE_DIM_DEFAULT_LB-NEXT: %[[DIMS2:.*]]:3 = fir.box_dims %[[PRIV_ARG_VAL]], %[[C0_0]]
+! ONE_DIM_DEFAULT_LB-NEXT: %[[SHAPE_SHIFT:.*]] = fir.shape_shift %[[DIMS2]]#0, %[[DIMS2]]#1
+! ONE_DIM_DEFAULT_LB-NEXT: %[[EMBOX:.*]] = fir.embox %[[DECL]]#0(%[[SHAPE_SHIFT]])
+! ONE_DIM_DEFAULT_LB-NEXT: fir.store %[[EMBOX]] to %[[PRIV_BOX_ALLOC]]
+! ONE_DIM_DEFAULT_LB-NEXT: omp.yield(%[[PRIV_BOX_ALLOC]] : [[TYPE]])
diff --git flang/test/Lower/OpenMP/delayed-privatization-character-array.f90 flang/test/Lower/OpenMP/delayed-privatization-character-array.f90
index 9a9d0c01212c..4c7287283c7a 100644
--- flang/test/Lower/OpenMP/delayed-privatization-character-array.f90
+++ flang/test/Lower/OpenMP/delayed-privatization-character-array.f90
@@ -23,19 +23,14 @@ subroutine delayed_privatization_character_array_static_len(var1)
 end subroutine
 ! STATIC_LEN-LABEL: omp.private {type = firstprivate}
-! STATIC_LEN-SAME: @[[PRIVATIZER_SYM:.*]] : [[TYPE:!fir.ref<!fir.array<5x!fir.char<1,10>>>]] alloc {
+! STATIC_LEN-SAME: @[[PRIVATIZER_SYM:.*]] : [[TYPE:!fir.box<!fir.array<5x!fir.char<1,10>>>]] init {
-! STATIC_LEN-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
-! STATIC_LEN-DAG: %[[C5:.*]] = arith.constant 5 : index
-! STATIC_LEN-DAG: %[[C10:.*]] = arith.constant 10 : index
-! STATIC_LEN-NEXT: %[[PRIV_ALLOC:.*]] = fir.alloca !fir.array<5x!fir.char<1,10>>
-! STATIC_LEN-NEXT: %[[ARRAY_SHAPE:.*]] = fir.shape %[[C5]]
-! STATIC_LEN-NEXT: %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOC]](%[[ARRAY_SHAPE]]) typeparams %[[C10]]
-! STATIC_LEN-NEXT: omp.yield(%[[PRIV_DECL]]#0
-
-! STATIC_LEN-NEXT: } copy {
-! STATIC_LEN-NEXT: ^bb0(%[[PRIV_ORIG_ARG:.*]]: [[TYPE]], %[[PRIV_PRIV_ARG:.*]]: [[TYPE]]):
-! STATIC_LEN-NEXT: hlfir.assign %[[PRIV_ORIG_ARG]] to %[[PRIV_PRIV_ARG]]
+! STATIC_LEN-NEXT: ^bb0(%[[MOLD_REF:.*]]: !fir.ref<[[TYPE]]>, %[[ALLOC:.*]]: !fir.ref<[[TYPE]]>):
+! [init region]
+! STATIC_LEN: } copy {
+! STATIC_LEN-NEXT: ^bb0(%[[PRIV_ORIG_ARG:.*]]: !fir.ref<[[TYPE]]>, %[[PRIV_PRIV_ARG:.*]]: !fir.ref<[[TYPE]]>):
+! STATIC_LEN-NEXT: %[[ORIG:.*]] = fir.load %[[PRIV_ORIG_ARG]] : !fir.ref<[[TYPE]]>
+! STATIC_LEN-NEXT: hlfir.assign %[[ORIG]] to %[[PRIV_PRIV_ARG]]
 ! STATIC_LEN-NEXT: omp.yield(%[[PRIV_PRIV_ARG]]
 ! STATIC_LEN-NEXT: }
@@ -53,15 +48,5 @@ subroutine delayed_privatization_character_array_dynamic_len(var1, char_len, arr
 end subroutine
 ! DYN_LEN-LABEL: omp.private {type = private}
-! DYN_LEN-SAME: @[[PRIVATIZER_SYM:.*]] : [[TYPE:!fir.box<!fir.array<\?x!fir.char<1,\?>>>]] alloc {
-
-! DYN_LEN-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
-
-! DYN_LEN: %[[C0:.*]] = arith.constant 0 : index
-! DYN_LEN-NEXT: %[[BOX_DIM:.*]]:3 = fir.box_dims %[[PRIV_ARG]], %[[C0]]
-! DYN_LEN: %[[CHAR_LEN:.*]] = fir.box_elesize %[[PRIV_ARG]]
-! DYN_LEN-NEXT: %[[PRIV_ALLOC:.*]] = fir.alloca !fir.array<?x!fir.char<1,?>>(%[[CHAR_LEN]] : index)
-! DYN_LEN-NEXT: %[[ARRAY_SHAPE:.*]] = fir.shape
-! DYN_LEN-NEXT: %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOC]](%[[ARRAY_SHAPE]]) typeparams %[[CHAR_LEN]]
-
-! DYN_LEN-NEXT: omp.yield(%[[PRIV_DECL]]#0
+! DYN_LEN-SAME: @[[PRIVATIZER_SYM:.*]] : [[TYPE:!fir.box<!fir.array<\?x!fir.char<1,\?>>>]] init {
+! DYN_LEN-NEXT: ^bb0(%[[MOLD_ARG:.*]]: !fir.ref<!fir.box<!fir.array<?x!fir.char<1,?>>>>, %[[ALLOC_ARG:.*]]: !fir.ref<!fir.box<!fir.array<?x!fir.char<1,?>>>>)
diff --git flang/test/Lower/OpenMP/delayed-privatization-character.f90 flang/test/Lower/OpenMP/delayed-privatization-character.f90
index db678ab13bbe..3d1a31296337 100644
--- flang/test/Lower/OpenMP/delayed-privatization-character.f90
+++ flang/test/Lower/OpenMP/delayed-privatization-character.f90
@@ -24,13 +24,13 @@ subroutine delayed_privatization_character(var1, l)
 end subroutine
 ! DYN_LEN-LABEL: omp.private {type = firstprivate}
-! DYN_LEN-SAME: @[[PRIVATIZER_SYM:.*]] : [[TYPE:!fir.boxchar<1>]] alloc {
+! DYN_LEN-SAME: @[[PRIVATIZER_SYM:.*]] : [[TYPE:!fir.boxchar<1>]] init {
-! DYN_LEN-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
+! DYN_LEN-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]], %[[ALLOC_ARG:.*]]: [[TYPE]]):
 ! DYN_LEN-NEXT: %[[UNBOX:.*]]:2 = fir.unboxchar %[[PRIV_ARG]]
-! DYN_LEN: %[[PRIV_ALLOC:.*]] = fir.alloca !fir.char<1,?>(%[[UNBOX]]#1 : index)
-! DYN_LEN-NEXT: %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOC]] typeparams %[[UNBOX]]#1
-! DYN_LEN-NEXT: omp.yield(%[[PRIV_DECL]]#0 : !fir.boxchar<1>)
+! DYN_LEN-NEXT: %[[PRIV_ALLOC:.*]] = fir.allocmem !fir.char<1,?>(%[[UNBOX]]#1 : index)
+! DYN_LEN-NEXT: %[[EMBOXCHAR:.*]] = fir.emboxchar %[[PRIV_ALLOC]], %[[UNBOX]]#1
+! DYN_LEN: omp.yield(%[[EMBOXCHAR]] : !fir.boxchar<1>)
 ! DYN_LEN-NEXT: } copy {
 ! DYN_LEN-NEXT: ^bb0(%[[PRIV_ORIG_ARG:.*]]: [[TYPE]], %[[PRIV_PRIV_ARG:.*]]: [[TYPE]]):
@@ -51,9 +51,4 @@ subroutine delayed_privatization_character_static_len(var1)
 end subroutine
 ! STATIC_LEN-LABEL: omp.private {type = private}
-! STATIC_LEN-SAME: @[[PRIVATIZER_SYM:.*]] : [[TYPE:!fir.ref<!fir.char<1,10>>]] alloc {
-
-! STATIC_LEN-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
-! STATIC_LEN-NEXT: %[[C10:.*]] = arith.constant 10 : index
-! STATIC_LEN-NEXT: %[[PRIV_ALLOC:.*]] = fir.alloca !fir.char<1,10>
-! STATIC_LEN-NEXT: %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOC]] typeparams %[[C10]]
+! STATIC_LEN-SAME: @[[PRIVATIZER_SYM:.*]] : [[TYPE:!fir.char<1,10>]]
diff --git flang/test/Lower/OpenMP/delayed-privatization-default-init.f90 flang/test/Lower/OpenMP/delayed-privatization-default-init.f90
index 022b592db74b..87d4605217a8 100644
--- flang/test/Lower/OpenMP/delayed-privatization-default-init.f90
+++ flang/test/Lower/OpenMP/delayed-privatization-default-init.f90
@@ -29,19 +29,16 @@ subroutine delayed_privatization_default_init_firstprivate
 !$omp end parallel
 end subroutine
-! CHECK-LABEL: omp.private {type = firstprivate} @_QFdelayed_privatization_default_init_firstprivateEa_firstprivate_ref_rec__QFdelayed_privatization_default_init_firstprivateTt : !fir.ref<!fir.type<_QFdelayed_privatization_default_init_firstprivateTt{i:i32}>> alloc {
-! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.type<_QFdelayed_privatization_default_init_firstprivateTt{i:i32}>>):
-! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.type<_QFdelayed_privatization_default_init_firstprivateTt{i:i32}> {bindc_name = "a", pinned, uniq_name = "_QFdelayed_privatization_default_init_firstprivateEa"}
-! CHECK-NEXT: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFdelayed_privatization_default_init_firstprivateEa"} : (!fir.ref<!fir.type<_QFdelayed_privatization_default_init_firstprivateTt{i:i32}>>) -> (!fir.ref<!fir.type<_QFdelayed_privatization_default_init_firstprivateTt{i:i32}>>, !fir.ref<!fir.type<_QFdelayed_privatization_default_init_firstprivateTt{i:i32}>>)
-! CHECK: omp.yield(%[[VAL_9]]#0 : !fir.ref<!fir.type<_QFdelayed_privatization_default_init_firstprivateTt{i:i32}>>)
-! CHECK: }
+! CHECK-LABEL: omp.private {type = firstprivate}
+! CHECK-SAME: @_QFdelayed_privatization_default_init_firstprivateEa_firstprivate_rec__QFdelayed_privatization_default_init_firstprivateTt :
+! CHECK-SAME: [[TYPE:!fir.type<_QFdelayed_privatization_default_init_firstprivateTt{i:i32}>]] copy {
-! CHECK-LABEL: omp.private {type = private} @_QFdelayed_privatization_default_initEa_private_ref_rec__QFdelayed_privatization_default_initTt : !fir.ref<!fir.type<_QFdelayed_privatization_default_initTt{i:i32}>> alloc {
-! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.type<_QFdelayed_privatization_default_initTt{i:i32}>>):
-! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.type<_QFdelayed_privatization_default_initTt{i:i32}> {bindc_name = "a", pinned, uniq_name = "_QFdelayed_privatization_default_initEa"}
+! CHECK-LABEL: omp.private {type = private}
+! CHECK-SAME: @_QFdelayed_privatization_default_initEa_private_rec__QFdelayed_privatization_default_initTt :
+! CHECK-SAME: [[TYPE:!fir.type<_QFdelayed_privatization_default_initTt{i:i32}>]] init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<[[TYPE]]>, %[[VAL_1:.*]]: !fir.ref<[[TYPE]]>):
 ! CHECK: %[[VAL_2:.*]] = fir.embox %[[VAL_1]] : (!fir.ref<!fir.type<_QFdelayed_privatization_default_initTt{i:i32}>>) -> !fir.box<!fir.type<_QFdelayed_privatization_default_initTt{i:i32}>>
 ! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_2]] : (!fir.box<!fir.type<_QFdelayed_privatization_default_initTt{i:i32}>>) -> !fir.box<none>
 ! CHECK: fir.call @_FortranAInitialize(%[[VAL_6]],{{.*}}
-! CHECK-NEXT: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFdelayed_privatization_default_initEa"} : (!fir.ref<!fir.type<_QFdelayed_privatization_default_initTt{i:i32}>>) -> (!fir.ref<!fir.type<_QFdelayed_privatization_default_initTt{i:i32}>>, !fir.ref<!fir.type<_QFdelayed_privatization_default_initTt{i:i32}>>)
-! CHECK: omp.yield(%[[VAL_9]]#0 : !fir.ref<!fir.type<_QFdelayed_privatization_default_initTt{i:i32}>>)
+! CHECK: omp.yield(%[[VAL_1]] : !fir.ref<!fir.type<_QFdelayed_privatization_default_initTt{i:i32}>>)
 ! CHECK: }
diff --git flang/test/Lower/OpenMP/delayed-privatization-firstprivate.f90 flang/test/Lower/OpenMP/delayed-privatization-firstprivate.f90
index 119f77ea2662..904ea783ad5b 100644
--- flang/test/Lower/OpenMP/delayed-privatization-firstprivate.f90
+++ flang/test/Lower/OpenMP/delayed-privatization-firstprivate.f90
@@ -15,12 +15,7 @@ subroutine delayed_privatization_firstprivate
 end subroutine
 ! CHECK-LABEL: omp.private {type = firstprivate}
-! CHECK-SAME: @[[VAR1_PRIVATIZER_SYM:.*]] : !fir.ref<i32> alloc {
-! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: !fir.ref<i32>):
-! CHECK-NEXT: %[[PRIV_ALLOC:.*]] = fir.alloca i32 {bindc_name = "var1", pinned, uniq_name = "_QFdelayed_privatization_firstprivateEvar1"}
-! CHECK-NEXT: %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOC]] {uniq_name = "_QFdelayed_privatization_firstprivateEvar1"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK-NEXT: omp.yield(%[[PRIV_DECL]]#0 : !fir.ref<i32>)
-! CHECK: } copy {
+! CHECK-SAME: @[[VAR1_PRIVATIZER_SYM:.*]] : i32 copy {
 ! CHECK: ^bb0(%[[PRIV_ORIG_ARG:.*]]: !fir.ref<i32>, %[[PRIV_PRIV_ARG:.*]]: !fir.ref<i32>):
 ! CHECK: %[[ORIG_VAL:.*]] = fir.load %[[PRIV_ORIG_ARG]] : !fir.ref<i32>
 ! CHECK: hlfir.assign %[[ORIG_VAL]] to %[[PRIV_PRIV_ARG]] : i32, !fir.ref<i32>
diff --git flang/test/Lower/OpenMP/delayed-privatization-lastprivate-of-private.f90 flang/test/Lower/OpenMP/delayed-privatization-lastprivate-of-private.f90
new file mode 100644
index 000000000000..be075825c5bd
--- /dev/null
+++ flang/test/Lower/OpenMP/delayed-privatization-lastprivate-of-private.f90
@@ -0,0 +1,22 @@
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s | FileCheck %s
+! RUN: bbc -emit-hlfir -fopenmp -o - %s | FileCheck %s
+
+! Check that we can lower this without crashing
+
+! CHECK: func.func @_QPlastprivate_of_private
+subroutine lastprivate_of_private(a)
+ real :: a(100)
+ integer i
+ ! CHECK: omp.parallel private({{.*}}) {
+ !$omp parallel private(a)
+ ! CHECK: omp.parallel {
+ !$omp parallel shared(a)
+ ! CHECK: omp.wsloop {
+ !$omp do lastprivate(a)
+ ! CHECK: omp.loop_nest
+ do i=1,100
+ a(i) = 1.0
+ end do
+ !$omp end parallel
+ !$omp end parallel
+end subroutine
diff --git flang/test/Lower/OpenMP/delayed-privatization-pointer.f90 flang/test/Lower/OpenMP/delayed-privatization-pointer.f90
index c96b0b49fd53..1dc345c11568 100644
--- flang/test/Lower/OpenMP/delayed-privatization-pointer.f90
+++ flang/test/Lower/OpenMP/delayed-privatization-pointer.f90
@@ -14,21 +14,42 @@ subroutine delayed_privatization_pointer
 !$omp end parallel
 end subroutine
-! CHECK-LABEL: omp.private {type = firstprivate}
-! CHECK-SAME: @[[PRIVATIZER_SYM:.*]] : [[TYPE:!fir.ref<!fir.box<!fir.ptr<i32>>>]] alloc {
+subroutine delayed_privatization_lenparams(length)
+ integer, intent(in) :: length
+ character(length), pointer :: var
-! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
+ !$omp parallel firstprivate(var)
+ var = 'a'
+ !$omp end parallel
+end subroutine
-! CHECK-NEXT: %[[PRIV_ALLOC:.*]] = fir.alloca !fir.box<!fir.ptr<i32>> {bindc_name = "var1", pinned, uniq_name = "_QFdelayed_privatization_pointerEvar1"}
+! CHECK-LABEL: omp.private {type = firstprivate}
+! CHECK-SAME: @[[PRIVATIZER_SYM2:.*]] : [[TYPE:!fir.box<!fir.ptr<!fir.char<1,\?>>>]] init {
+! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: !fir.ref<[[TYPE]]>, %[[PRIV_ALLOC:.*]]: !fir.ref<[[TYPE]]>):
+! CHECK-NEXT: %[[ARG:.*]] = fir.load %[[PRIV_ARG]]
+! CHECK-NEXT: %[[SIZE:.*]] = fir.box_elesize %[[ARG]]
+! CHECK-NEXT: %[[NULL:.*]] = fir.zero_bits !fir.ptr<!fir.char<1,?>>
+! CHECK-NEXT: %[[INIT:.*]] = fir.embox %[[NULL]] typeparams %[[SIZE]]
+! CHECK-NEXT: fir.store %[[INIT]] to %[[PRIV_ALLOC]]
+! CHECK-NEXT: omp.yield(%[[PRIV_ALLOC]] : !fir.ref<[[TYPE]]>)
+! CHECK-NEXT: } copy {
+! CHECK: ^bb0(%[[PRIV_ORIG_ARG:.*]]: !fir.ref<[[TYPE]]>, %[[PRIV_PRIV_ARG:.*]]: !fir.ref<[[TYPE]]>):
+! CHECK-NEXT: %[[ORIG_BASE_VAL:.*]] = fir.load %[[PRIV_ORIG_ARG]]
+! CHECK-NEXT: fir.store %[[ORIG_BASE_VAL]] to %[[PRIV_PRIV_ARG]]
+! CHECK-NEXT: omp.yield(%[[PRIV_PRIV_ARG]] : !fir.ref<[[TYPE]]>)
+! CHECK-NEXT: }
+
+! CHECK-LABEL: omp.private {type = firstprivate}
+! CHECK-SAME: @[[PRIVATIZER_SYM:.*]] : [[TYPE:!fir.box<!fir.ptr<i32>>]] init {
+! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: !fir.ref<[[TYPE]]>, %[[PRIV_ALLOC:.*]]: !fir.ref<[[TYPE]]>):
+! CHECK-NEXT: %[[ARG:.*]] = fir.load %[[PRIV_ARG]]
 ! CHECK-NEXT: %[[NULL:.*]] = fir.zero_bits !fir.ptr<i32>
 ! CHECK-NEXT: %[[INIT:.*]] = fir.embox %[[NULL]] : (!fir.ptr<i32>) -> !fir.box<!fir.ptr<i32>>
 ! CHECK-NEXT: fir.store %[[INIT]] to %[[PRIV_ALLOC]] : !fir.ref<!fir.box<!fir.ptr<i32>>>
-! CHECK-NEXT: %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOC]]
-! CHECK-NEXT: omp.yield(%[[PRIV_DECL]]#0 : [[TYPE]])
-
+! CHECK-NEXT: omp.yield(%[[PRIV_ALLOC]] : !fir.ref<[[TYPE]]>)
 ! CHECK-NEXT: } copy {
-! CHECK: ^bb0(%[[PRIV_ORIG_ARG:.*]]: [[TYPE]], %[[PRIV_PRIV_ARG:.*]]: [[TYPE]]):
+! CHECK: ^bb0(%[[PRIV_ORIG_ARG:.*]]: !fir.ref<[[TYPE]]>, %[[PRIV_PRIV_ARG:.*]]: !fir.ref<[[TYPE]]>):
 ! CHECK-NEXT: %[[ORIG_BASE_VAL:.*]] = fir.load %[[PRIV_ORIG_ARG]]
- ! CHECK-NEXT: fir.store %[[ORIG_BASE_VAL]] to %[[PRIV_PRIV_ARG]] : !fir.ref<!fir.box<!fir.ptr<i32>>>
-! CHECK-NEXT: omp.yield(%[[PRIV_PRIV_ARG]] : [[TYPE]])
+! CHECK-NEXT: fir.store %[[ORIG_BASE_VAL]] to %[[PRIV_PRIV_ARG]] : !fir.ref<!fir.box<!fir.ptr<i32>>>
+! CHECK-NEXT: omp.yield(%[[PRIV_PRIV_ARG]] : !fir.ref<[[TYPE]]>)
 ! CHECK-NEXT: }
diff --git flang/test/Lower/OpenMP/delayed-privatization-private-firstprivate.f90 flang/test/Lower/OpenMP/delayed-privatization-private-firstprivate.f90
index 7d202f46c09d..d961210dcbc3 100644
--- flang/test/Lower/OpenMP/delayed-privatization-private-firstprivate.f90
+++ flang/test/Lower/OpenMP/delayed-privatization-private-firstprivate.f90
@@ -17,13 +17,11 @@ subroutine delayed_privatization_private_firstprivate
 end subroutine
 ! CHECK-LABEL: omp.private {type = firstprivate}
-! CHECK-SAME: @[[VAR2_PRIVATIZER_SYM:.*]] : !fir.ref<i32> alloc {
-! CHECK: } copy {
+! CHECK-SAME: @[[VAR2_PRIVATIZER_SYM:.*]] : i32 copy {
 ! CHECK: }
 ! CHECK-LABEL: omp.private {type = private}
-! CHECK-SAME: @[[VAR1_PRIVATIZER_SYM:.*]] : !fir.ref<i32> alloc {
-! CHECK: }
+! CHECK-SAME: @[[VAR1_PRIVATIZER_SYM:.*]] : i32
 ! CHECK-LABEL: func.func @_QPdelayed_privatization_private_firstprivate() {
 ! CHECK: %[[VAR1_ALLOC:.*]] = fir.alloca i32 {bindc_name = "var1"
diff --git flang/test/Lower/OpenMP/delayed-privatization-private.f90 flang/test/Lower/OpenMP/delayed-privatization-private.f90
index 7208521bcd77..69c362e4828b 100644
--- flang/test/Lower/OpenMP/delayed-privatization-private.f90
+++ flang/test/Lower/OpenMP/delayed-privatization-private.f90
@@ -15,12 +15,8 @@ subroutine delayed_privatization_private
 end subroutine
 ! CHECK-LABEL: omp.private {type = private}
-! CHECK-SAME: @[[PRIVATIZER_SYM:.*]] : !fir.ref<i32> alloc {
-! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: !fir.ref<i32>):
-! CHECK-NEXT: %[[PRIV_ALLOC:.*]] = fir.alloca i32 {bindc_name = "var1", pinned, uniq_name = "_QFdelayed_privatization_privateEvar1"}
-! CHECK-NEXT: %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOC]] {uniq_name = "_QFdelayed_privatization_privateEvar1"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK-NEXT: omp.yield(%[[PRIV_DECL]]#0 : !fir.ref<i32>)
-! CHECK-NOT: } copy {
+! CHECK-SAME: @[[PRIVATIZER_SYM:.*]] : i32
+! CHECK-NOT: copy {
 ! CHECK-LABEL: @_QPdelayed_privatization_private
 ! CHECK: %[[ORIG_ALLOC:.*]] = fir.alloca i32 {bindc_name = "var1", uniq_name = "_QFdelayed_privatization_privateEvar1"}
diff --git flang/test/Lower/OpenMP/delayed-privatization-reduction-byref.f90 flang/test/Lower/OpenMP/delayed-privatization-reduction-byref.f90
index 6c00bb23f15b..f463f2b4630a 100644
--- flang/test/Lower/OpenMP/delayed-privatization-reduction-byref.f90
+++ flang/test/Lower/OpenMP/delayed-privatization-reduction-byref.f90
@@ -19,7 +19,7 @@ subroutine red_and_delayed_private
 end subroutine
 ! CHECK-LABEL: omp.private {type = private}
-! CHECK-SAME: @[[PRIVATIZER_SYM:.*]] : !fir.ref<i32> alloc {
+! CHECK-SAME: @[[PRIVATIZER_SYM:.*]] : i32
 ! CHECK-LABEL: omp.declare_reduction
 ! CHECK-SAME: @[[REDUCTION_SYM:.*]] : !fir.ref<i32> alloc
diff --git flang/test/Lower/OpenMP/delayed-privatization-reduction.f90 flang/test/Lower/OpenMP/delayed-privatization-reduction.f90
index 38139e52ce95..a1ddbc30d6e4 100644
--- flang/test/Lower/OpenMP/delayed-privatization-reduction.f90
+++ flang/test/Lower/OpenMP/delayed-privatization-reduction.f90
@@ -22,7 +22,7 @@ subroutine red_and_delayed_private
 end subroutine
 ! CHECK-LABEL: omp.private {type = private}
-! CHECK-SAME: @[[PRIVATIZER_SYM:.*]] : !fir.ref<i32> alloc {
+! CHECK-SAME: @[[PRIVATIZER_SYM:.*]] : i32
 ! CHECK-LABEL: omp.declare_reduction
 ! CHECK-SAME: @[[REDUCTION_SYM:.*]] : i32 init
diff --git flang/test/Lower/OpenMP/derived-type-allocatable.f90 flang/test/Lower/OpenMP/derived-type-allocatable.f90
index 1d6e22212eed..81ede56d7f7e 100644
--- flang/test/Lower/OpenMP/derived-type-allocatable.f90
+++ flang/test/Lower/OpenMP/derived-type-allocatable.f90
@@ -13,32 +13,55 @@ module m1
 contains
+!CHECK-LABEL: omp.private {type = private} @_QMm1Ftest_class_allocatable_array
+!CHECK: fir.call @_FortranAInitialize
+!CHECK-NOT: omp.barrier
+!CHECK: omp.yield
+
+!CHECK-LABEL: omp.private {type = private} @_QMm1Ftest_class_allocatable
+!CHECK: fir.call @_FortranAInitialize
+!CHECK-NOT: omp.barrier
+!CHECK: omp.yield
+
+!CHECK-LABEL: omp.private {type = private} @_QMm1Ftest_allocatable
+!CHECK: fir.call @_FortranAInitialize
+!CHECK-NOT: omp.barrier
+!CHECK: omp.yield
+
 !CHECK-LABEL: omp.private {type = private} @_QMm1Ftest_pointer
 !CHECK-NOT: fir.call @_FortranAInitializeClone
+!CHECK-NOT: omp.barrier
 !CHECK: omp.yield
 !CHECK-LABEL: omp.private {type = private} @_QMm1Ftest_nested
 !CHECK: fir.call @_FortranAInitializeClone
-!CHECK-NEXT: omp.yield
+!CHECK-NOT: omp.barrier
+!CHECK: omp.yield
 !CHECK-LABEL: omp.private {type = private} @_QMm1Ftest_array_of_allocs
 !CHECK: fir.call @_FortranAInitializeClone
-!CHECK-NEXT: omp.yield
+!CHECK-NOT: omp.barrier
+!CHECK: omp.yield
 !CHECK: } dealloc {
 !CHECK: fir.call @_FortranAAllocatableDeallocate
 !CHECK: omp.yield
 !CHECK-LABEL: omp.private {type = firstprivate} @_QMm1Ftest_array
+!CHECK: fir.call @_FortranAInitialize(
 !CHECK-NOT: fir.call @_FortranAInitializeClone
+!CHECK-NOT: omp.barrier
 !CHECK: omp.yield
 !CHECK-LABEL: omp.private {type = private} @_QMm1Ftest_array
+!CHECK: fir.call @_FortranAInitialize(
 !CHECK: fir.call @_FortranAInitializeClone
-!CHECK-NEXT: omp.yield
+!CHECK-NOT: omp.barrier
+!CHECK: omp.yield
 !CHECK-LABEL: omp.private {type = private} @_QMm1Ftest_scalar
 !CHECK: fir.call @_FortranAInitializeClone
-!CHECK-NEXT: omp.yield
+!CHECK-NOT: omp.barrier
+!CHECK: omp.yield
 subroutine test_scalar()
 type(x) :: v
@@ -105,4 +128,34 @@ contains
 !$omp parallel private(ptr)
 !$omp end parallel
 end subroutine
+
+ subroutine test_allocatable()
+ type needs_init
+ integer :: i = 1
+ end type
+ type(needs_init), allocatable :: a
+
+ !$omp parallel private(a)
+ !$omp end parallel
+ end subroutine
+
+ subroutine test_class_allocatable()
+ type needs_init
+ integer :: i = 1
+ end type
+ class(needs_init), allocatable :: a
+
+ !$omp parallel private(a)
+ !$omp end parallel
+ end subroutine
+
+ subroutine test_class_allocatable_array()
+ type needs_init
+ integer :: i = 1
+ end type
+ class(needs_init), allocatable :: a(:)
+
+ !$omp parallel private(a)
+ !$omp end parallel
+ end subroutine
 end module
diff --git flang/test/Lower/OpenMP/firstprivate-alloc-comp.f90 flang/test/Lower/OpenMP/firstprivate-alloc-comp.f90
index 2453fe2c5208..4d0a2a0b9024 100644
--- flang/test/Lower/OpenMP/firstprivate-alloc-comp.f90
+++ flang/test/Lower/OpenMP/firstprivate-alloc-comp.f90
@@ -13,7 +13,7 @@ end
 call firstprivate_alloc_comp()
 end
-! CHECK-LABEL: omp.private {type = firstprivate} @_QFfirstprivate_alloc_compEx_firstprivate_ref_rec__QFfirstprivate_alloc_compTt1 : !fir.ref<!fir.type<_QFfirstprivate_alloc_compTt1{c:!fir.box<!fir.heap<!fir.array<?xi32>>>}>> alloc {
+! CHECK-LABEL: omp.private {type = firstprivate} @_QFfirstprivate_alloc_compEx_firstprivate_rec__QFfirstprivate_alloc_compTt1 : !fir.type<_QFfirstprivate_alloc_compTt1{c:!fir.box<!fir.heap<!fir.array<?xi32>>>}> init {
 ! CHECK: fir.call @_FortranAInitialize(
 ! CHECK: } copy {
 ! ...
diff --git flang/test/Lower/OpenMP/implicit-dsa.f90 flang/test/Lower/OpenMP/implicit-dsa.f90
index a1912a46f9ae..f0f149bb415b 100644
--- flang/test/Lower/OpenMP/implicit-dsa.f90
+++ flang/test/Lower/OpenMP/implicit-dsa.f90
@@ -6,99 +6,82 @@
 ! Privatizers
 ! CHECK-LABEL: omp.private
-! CHECK-SAME: {type = private} @[[TEST6_Y_PRIV:.*]] : !fir.ref<i32>
-! CHECK: fir.alloca i32 {bindc_name = "y"
-! CHECK-NOT: } copy {
+! CHECK-SAME: {type = private} @[[TEST6_Y_PRIV:.*]] : i32
+! CHECK-NOT: copy {
 ! CHECK-LABEL: omp.private
-! CHECK-SAME: {type = private} @[[TEST6_X_PRIV:.*]] : !fir.ref<i32>
-! CHECK: fir.alloca i32 {bindc_name = "x"
-! CHECK-NOT: } copy {
+! CHECK-SAME: {type = private} @[[TEST6_X_PRIV:.*]] : i32
+! CHECK-NOT: copy {
 ! CHECK-LABEL: omp.private
-! CHECK-SAME: {type = firstprivate} @[[TEST6_Z_FIRSTPRIV:.*]] : !fir.ref<i32>
-! CHECK: fir.alloca i32 {bindc_name = "z"
-! CHECK: } copy {
+! CHECK-SAME: {type = firstprivate} @[[TEST6_Z_FIRSTPRIV:.*]] : i32
+! CHECK-SAME: copy {
 ! CHECK: hlfir.assign
 ! CHECK-LABEL: omp.private
-! CHECK-SAME: {type = firstprivate} @[[TEST6_Y_FIRSTPRIV:.*]] : !fir.ref<i32>
-! CHECK: fir.alloca i32 {bindc_name = "y"
-! CHECK: } copy {
+! CHECK-SAME: {type = firstprivate} @[[TEST6_Y_FIRSTPRIV:.*]] : i32
+! CHECK-SAME: copy {
 ! CHECK: hlfir.assign
 ! CHECK-LABEL: omp.private
-! CHECK-SAME: {type = firstprivate} @[[TEST6_X_FIRSTPRIV:.*]] : !fir.ref<i32>
-! CHECK: fir.alloca i32 {bindc_name = "x"
-! CHECK: } copy {
+! CHECK-SAME: {type = firstprivate} @[[TEST6_X_FIRSTPRIV:.*]] : i32
+! CHECK-SAME: copy {
 ! CHECK: hlfir.assign
 ! CHECK-LABEL: omp.private
-! CHECK-SAME: {type = firstprivate} @[[TEST5_X_FIRSTPRIV:.*]] : !fir.ref<i32>
-! CHECK: fir.alloca i32 {bindc_name = "x"
-! CHECK: } copy {
+! CHECK-SAME: {type = firstprivate} @[[TEST5_X_FIRSTPRIV:.*]] : i32
+! CHECK-SAME: copy {
 ! CHECK: hlfir.assign
 ! CHECK-LABEL: omp.private
-! CHECK-SAME: {type = private} @[[TEST5_X_PRIV:.*]] : !fir.ref<i32>
-! CHECK: fir.alloca i32 {bindc_name = "x"
-! CHECK-NOT: } copy {
+! CHECK-SAME: {type = private} @[[TEST5_X_PRIV:.*]] : i32
+! CHECK-NOT: copy {
 ! CHECK-LABEL: omp.private
-! CHECK-SAME: {type = firstprivate} @[[TEST4_Y_FIRSTPRIV:.*]] : !fir.ref<i32>
-! CHECK: fir.alloca i32 {bindc_name = "y"
-! CHECK: } copy {
+! CHECK-SAME: {type = firstprivate} @[[TEST4_Y_FIRSTPRIV:.*]] : i32
+! CHECK-SAME: copy {
 ! CHECK: hlfir.assign
 ! CHECK-LABEL: omp.private
-! CHECK-SAME: {type = firstprivate} @[[TEST4_Z_FIRSTPRIV:.*]] : !fir.ref<i32>
-! CHECK: fir.alloca i32 {bindc_name = "z"
-! CHECK: } copy {
+! CHECK-SAME: {type = firstprivate} @[[TEST4_Z_FIRSTPRIV:.*]] : i32
+! CHECK-SAME: copy {
 ! CHECK: hlfir.assign
 ! CHECK-LABEL: omp.private
-! CHECK-SAME: {type = firstprivate} @[[TEST4_X_FIRSTPRIV:.*]] : !fir.ref<i32>
-! CHECK: fir.alloca i32 {bindc_name = "x"
-! CHECK: } copy {
+! CHECK-SAME: {type = firstprivate} @[[TEST4_X_FIRSTPRIV:.*]] : i32
+! CHECK-SAME: copy {
 ! CHECK: hlfir.assign
 ! CHECK-LABEL: omp.private
-! CHECK-SAME: {type = private} @[[TEST4_Y_PRIV:.*]] : !fir.ref<i32>
-! CHECK: fir.alloca i32 {bindc_name = "y"
-! CHECK-NOT: } copy {
+! CHECK-SAME: {type = private} @[[TEST4_Y_PRIV:.*]] : i32
+! CHECK-NOT: copy {
 ! CHECK-LABEL: omp.private
-! CHECK-SAME: {type = private} @[[TEST4_Z_PRIV:.*]] : !fir.ref<i32>
-! CHECK: fir.alloca i32 {bindc_name = "z"
-! CHECK-NOT: } copy {
+! CHECK-SAME: {type = private} @[[TEST4_Z_PRIV:.*]] : i32
+! CHECK-NOT: copy {
 ! CHECK-LABEL: omp.private
-! CHECK-SAME: {type = private} @[[TEST4_X_PRIV:.*]] : !fir.ref<i32>
-! CHECK: fir.alloca i32 {bindc_name = "x"
-! CHECK-NOT: } copy {
+! CHECK-SAME: {type = private} @[[TEST4_X_PRIV:.*]] : i32
+! CHECK-NOT: copy {
 ! CHECK-LABEL: omp.private
-! CHECK-SAME: {type = firstprivate} @[[TEST3_X_FIRSTPRIV:.*]] : !fir.ref<i32>
-! CHECK: fir.alloca i32 {bindc_name = "x"
-! CHECK: } copy {
+! CHECK-SAME: {type = firstprivate} @[[TEST3_X_FIRSTPRIV:.*]] : i32
+! CHECK: copy {
 ! CHECK: hlfir.assign
 ! CHECK-LABEL: omp.private
-! CHECK-SAME: {type = firstprivate} @[[TEST2_X_FIRSTPRIV:.*]] : !fir.ref<i32>
-! CHECK: fir.alloca i32 {bindc_name = "x"
-! CHECK: } copy {
+! CHECK-SAME: {type = firstprivate} @[[TEST2_X_FIRSTPRIV:.*]] : i32
+! CHECK: copy {
 ! CHECK: hlfir.assign
 ! CHECK-LABEL: omp.private
-! CHECK-SAME: {type = firstprivate} @[[TEST1_X_FIRSTPRIV:.*]] : !fir.ref<i32>
-! CHECK: fir.alloca i32 {bindc_name = "x"
-! CHECK: } copy {
+! CHECK-SAME: {type = firstprivate} @[[TEST1_X_FIRSTPRIV:.*]] : i32
+! CHECK: copy {
 ! CHECK: hlfir.assign
 ! CHECK-LABEL: omp.private
-! CHECK-SAME: {type = private} @[[TEST1_Y_PRIV:.*]] : !fir.ref<i32>
-! CHECK: fir.alloca i32 {bindc_name = "y"
-! CHECK-NOT: } copy {
+! CHECK-SAME: {type = private} @[[TEST1_Y_PRIV:.*]] : i32
+! CHECK-NOT: copy {
 ! Basic cases.
 !CHECK-LABEL: func @_QPimplicit_dsa_test1
diff --git flang/test/Lower/OpenMP/loop-directive.f90 flang/test/Lower/OpenMP/loop-directive.f90
index 845905da0fcb..785f732e1b4f 100644
--- flang/test/Lower/OpenMP/loop-directive.f90
+++ flang/test/Lower/OpenMP/loop-directive.f90
@@ -4,8 +4,8 @@
 ! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
 ! CHECK: omp.declare_reduction @[[RED:add_reduction_i32]] : i32
-! CHECK: omp.private {type = private} @[[DUMMY_PRIV:.*test_privateEdummy_private.*]] : !fir.ref<i32>
-! CHECK: omp.private {type = private} @[[I_PRIV:.*test_no_clausesEi.*]] : !fir.ref<i32>
+! CHECK: omp.private {type = private} @[[DUMMY_PRIV:.*test_privateEdummy_private.*]] : i32
+! CHECK: omp.private {type = private} @[[I_PRIV:.*test_no_clausesEi.*]] : i32
 ! CHECK-LABEL: func.func @_QPtest_no_clauses
 subroutine test_no_clauses()
@@ -149,8 +149,8 @@ subroutine test_standalone_bind_teams
 num = N
 ! CHECK: omp.distribute
- ! CHECK-SAME: private(@{{.*}}Ea_private_ref_100000xi32 {{[^,]*}},
- ! CHECK-SAME: @{{.*}}Ei_private_ref_i32 {{.*}} : {{.*}}) {
+ ! CHECK-SAME: private(@{{.*}}Ea_private_box_100000xi32 {{[^,]*}},
+ ! CHECK-SAME: @{{.*}}Ei_private_i32 {{.*}} : {{.*}}) {
 ! CHECK: omp.loop_nest {{.*}} {
 ! CHECK: }
 ! CHECK: }
@@ -169,8 +169,8 @@ subroutine test_standalone_bind_parallel
 num = N
 ! CHECK: omp.wsloop
- ! CHECK-SAME: private(@{{.*}}Ea_private_ref_100000xi32 {{[^,]*}},
- ! CHECK-SAME: @{{.*}}Ei_private_ref_i32 {{.*}} : {{.*}}) {
+ ! CHECK-SAME: private(@{{.*}}Ea_private_box_100000xi32 {{[^,]*}},
+ ! CHECK-SAME: @{{.*}}Ei_private_i32 {{.*}} : {{.*}}) {
 ! CHECK: omp.loop_nest {{.*}} {
 ! CHECK: }
 ! CHECK: }
diff --git flang/test/Lower/OpenMP/parallel-firstprivate-clause-scalar.f90 flang/test/Lower/OpenMP/parallel-firstprivate-clause-scalar.f90
index f0bee355543a..017b2a3f2edd 100644
--- flang/test/Lower/OpenMP/parallel-firstprivate-clause-scalar.f90
+++ flang/test/Lower/OpenMP/parallel-firstprivate-clause-scalar.f90
@@ -1,31 +1,22 @@
 ! This test checks lowering of `FIRSTPRIVATE` clause for scalar types.
+! REQUIRES: x86_64-registered-target
 ! REQUIRES: shell
 ! RUN: bbc -fopenmp -emit-hlfir %s -o - \
-! RUN: | FileCheck %s --check-prefix=CHECK
+! RUN: | FileCheck %s --check-prefixes=CHECK%if target=x86_64{{.*}} %{,CHECK-KIND10%}%if flang-supports-f128-math %{,CHECK-KIND16%}
-!CHECK: omp.private {type = firstprivate} @[[ARG2_LOGICAL_PRIVATIZER:_QFfirstprivate_logicalEarg2_firstprivate_ref_l8]] : !fir.ref<!fir.logical<1>> alloc
+!CHECK: omp.private {type = firstprivate} @[[ARG2_LOGICAL_PRIVATIZER:_QFfirstprivate_logicalEarg2_firstprivate_l8]] : !fir.logical<1>
-!CHECK: omp.private {type = firstprivate} @[[ARG1_LOGICAL_PRIVATIZER:_QFfirstprivate_logicalEarg1_firstprivate_ref_l32]] : !fir.ref<!fir.logical<4>> alloc {
-!CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.logical<4>>):
-!CHECK: %[[PVT_ALLOC:.*]] = fir.alloca !fir.logical<4> {{.*}}
-!CHECK: %[[PVT_DECL:.*]]:2 = hlfir.declare %[[PVT_ALLOC]] {{.*}}
-!CHECK: omp.yield(%[[PVT_DECL]]#0 : !fir.ref<!fir.logical<4>>)
-!CHECK: } copy {
+!CHECK: omp.private {type = firstprivate} @[[ARG1_LOGICAL_PRIVATIZER:_QFfirstprivate_logicalEarg1_firstprivate_l32]] : !fir.logical<4> copy {
 !CHECK: ^bb0(%[[ORIG_REF:.*]]: !fir.ref<!fir.logical<4>>, %[[PVT_REF:.*]]: !fir.ref<!fir.logical<4>>):
 !CHECK: %[[ORIG_VAL:.*]] = fir.load %[[ORIG_REF]] : {{.*}}
 !CHECK: hlfir.assign %[[ORIG_VAL]] to %[[PVT_REF]] {{.*}}
 !CHECK: omp.yield(%[[PVT_REF]] : !fir.ref<!fir.logical<4>>)
 !CHECK: }
-!CHECK: omp.private {type = firstprivate} @[[ARG2_COMPLEX_PRIVATIZER:_QFfirstprivate_complexEarg2_firstprivate_ref_z64]] : !fir.ref<complex<f64>> alloc
+!CHECK: omp.private {type = firstprivate} @[[ARG2_COMPLEX_PRIVATIZER:_QFfirstprivate_complexEarg2_firstprivate_z64]] : complex<f64>
-!CHECK: omp.private {type = firstprivate} @[[ARG1_COMPLEX_PRIVATIZER:_QFfirstprivate_complexEarg1_firstprivate_ref_z32]] : !fir.ref<complex<f32>> alloc {
-!CHECK: ^bb0(%{{.*}}: !fir.ref<complex<f32>>):
-!CHECK: %[[PVT_ALLOC:.*]] = fir.alloca complex<f32> {bindc_name = "arg1", {{.*}}}
-!CHECK: %[[PVT_DECL:.*]]:2 = hlfir.declare %[[PVT_ALLOC]] {{.*}}
-!CHECK: omp.yield(%[[PVT_DECL]]#0 : !fir.ref<complex<f32>>)
-!CHECK: } copy {
+!CHECK: omp.private {type = firstprivate} @[[ARG1_COMPLEX_PRIVATIZER:_QFfirstprivate_complexEarg1_firstprivate_z32]] : complex<f32> copy {
 !CHECK: ^bb0(%[[ORIG_REF:.*]]: !fir.ref<complex<f32>>, %[[PVT_REF:.*]]: !fir.ref<complex<f32>>):
 !CHECK: %[[ORIG_VAL:.*]] = fir.load %[[ORIG_REF]] : {{.*}}
 !CHECK: hlfir.assign %[[ORIG_VAL]] to %[[PVT_REF]] {{.*}}
@@ -114,38 +105,64 @@ subroutine firstprivate_logical(arg1, arg2, arg3, arg4, arg5)
 end subroutine
-!CHECK-DAG: func @_QPfirstprivate_real(%[[ARG1:.*]]: !fir.ref<f32>{{.*}}, %[[ARG2:.*]]: !fir.ref<f16>{{.*}}, %[[ARG3:.*]]: !fir.ref<f32>{{.*}}, %[[ARG4:.*]]: !fir.ref<f64>{{.*}}, %[[ARG5:.*]]: !fir.ref<f80>{{.*}}, %[[ARG6:.*]]: !fir.ref<f128>{{.*}}) {
+!CHECK-LABEL: func @_QPfirstprivate_real(
+!CHECK-SAME: %[[ARG1:.*]]: !fir.ref<f32>{{.*}}, %[[ARG2:.*]]: !fir.ref<f16>{{.*}}, %[[ARG3:.*]]: !fir.ref<f32>{{.*}}, %[[ARG4:.*]]: !fir.ref<f64>{{.*}}) {
 !CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_realEarg1"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
 !CHECK: %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_realEarg2"} : (!fir.ref<f16>, !fir.dscope) -> (!fir.ref<f16>, !fir.ref<f16>)
 !CHECK: %[[ARG3_DECL:.*]]:2 = hlfir.declare %[[ARG3]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_realEarg3"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
 !CHECK: %[[ARG4_DECL:.*]]:2 = hlfir.declare %[[ARG4]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_realEarg4"} : (!fir.ref<f64>, !fir.dscope) -> (!fir.ref<f64>, !fir.ref<f64>)
-!CHECK: %[[ARG5_DECL:.*]]:2 = hlfir.declare %[[ARG5]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_realEarg5"} : (!fir.ref<f80>, !fir.dscope) -> (!fir.ref<f80>, !fir.ref<f80>)
-!CHECK: %[[ARG6_DECL:.*]]:2 = hlfir.declare %[[ARG6]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_realEarg6"} : (!fir.ref<f128>, !fir.dscope) -> (!fir.ref<f128>, !fir.ref<f128>)
-!CHECK: omp.parallel private({{.*firstprivate.*}} {{.*}}#0 -> %[[ARG1_PVT:.*]], {{.*firstprivate.*}} {{.*}}#0 -> %[[ARG2_PVT:.*]], {{.*firstprivate.*}} {{.*}}#0 -> %[[ARG3_PVT:.*]], {{.*firstprivate.*}} {{.*}}#0 -> %[[ARG4_PVT:.*]], {{.*firstprivate.*}} {{.*}}#0 -> %[[ARG5_PVT:.*]], {{.*firstprivate.*}} {{.*}}#0 -> %[[ARG6_PVT:.*]] : {{.*}}) {
+!CHECK: omp.parallel private({{.*firstprivate.*}} {{.*}}#0 -> %[[ARG1_PVT:.*]], {{.*firstprivate.*}} {{.*}}#0 -> %[[ARG2_PVT:.*]], {{.*firstprivate.*}} {{.*}}#0 -> %[[ARG3_PVT:.*]], {{.*firstprivate.*}} {{.*}}#0 -> %[[ARG4_PVT:.*]] : {{.*}}) {
 !CHECK: %[[ARG1_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG1_PVT]] {uniq_name = "_QFfirstprivate_realEarg1"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
 !CHECK: %[[ARG2_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG2_PVT]] {uniq_name = "_QFfirstprivate_realEarg2"} : (!fir.ref<f16>) -> (!fir.ref<f16>, !fir.ref<f16>)
 !CHECK: %[[ARG3_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG3_PVT]] {uniq_name = "_QFfirstprivate_realEarg3"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
 !CHECK: %[[ARG4_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG4_PVT]] {uniq_name = "_QFfirstprivate_realEarg4"} : (!fir.ref<f64>) -> (!fir.ref<f64>, !fir.ref<f64>)
-!CHECK: %[[ARG5_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG5_PVT]] {uniq_name = "_QFfirstprivate_realEarg5"} : (!fir.ref<f80>) -> (!fir.ref<f80>, !fir.ref<f80>)
-!CHECK: %[[ARG6_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG6_PVT]] {uniq_name = "_QFfirstprivate_realEarg6"} : (!fir.ref<f128>) -> (!fir.ref<f128>, !fir.ref<f128>)
-!CHECK: fir.call @_QPqux(%[[ARG1_PVT_DECL]]#1, %[[ARG2_PVT_DECL]]#1, %[[ARG3_PVT_DECL]]#1, %[[ARG4_PVT_DECL]]#1, %[[ARG5_PVT_DECL]]#1, %[[ARG6_PVT_DECL]]#1) {{.*}}: (!fir.ref<f32>, !fir.ref<f16>, !fir.ref<f32>, !fir.ref<f64>, !fir.ref<f80>, !fir.ref<f128>) -> ()
+!CHECK: fir.call @_QPqux(%[[ARG1_PVT_DECL]]#1, %[[ARG2_PVT_DECL]]#1, %[[ARG3_PVT_DECL]]#1, %[[ARG4_PVT_DECL]]#1) {{.*}}: (!fir.ref<f32>, !fir.ref<f16>, !fir.ref<f32>, !fir.ref<f64>) -> ()
 !CHECK: omp.terminator
 !CHECK: }
-
 subroutine firstprivate_real(arg1, arg2, arg3, arg4, arg5, arg6)
 real :: arg1
 real(kind=2) :: arg2
 real(kind=4) :: arg3
 real(kind=8) :: arg4
- real(kind=10) :: arg5
- real(kind=16) :: arg6
-!$OMP PARALLEL FIRSTPRIVATE(arg1, arg2, arg3, arg4, arg5, arg6)
- call qux(arg1, arg2, arg3, arg4, arg5, arg6)
+!$OMP PARALLEL FIRSTPRIVATE(arg1, arg2, arg3, arg4)
+ call qux(arg1, arg2, arg3, arg4)
 !$OMP END PARALLEL
 end subroutine
+!CHECK-KIND10-LABEL: func @_QPfirstprivate_real10(
+!CHECK-KIND10-SAME: %[[ARG1:.*]]: !fir.ref<f80>{{.*}}) {
+!CHECK-KIND10: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_real10Earg1"} : (!fir.ref<f80>, !fir.dscope) -> (!fir.ref<f80>, !fir.ref<f80>)
+!CHECK-KIND10: omp.parallel private({{.*firstprivate.*}} {{.*}}#0 -> %[[ARG1_PVT:.*]] : {{.*}}) {
+!CHECK-KIND10: %[[ARG1_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG1_PVT]] {uniq_name = "_QFfirstprivate_real10Earg1"} : (!fir.ref<f80>) -> (!fir.ref<f80>, !fir.ref<f80>)
+!CHECK-KIND10: fir.call @_QPqux10(%[[ARG1_PVT_DECL]]#1) {{.*}} : (!fir.ref<f80>) -> ()
+!CHECK-KIND10: omp.terminator
+!CHECK-KIND10: }
+subroutine firstprivate_real10(arg1)
+ integer, parameter :: kind10 = merge(10, 4, selected_real_kind(p=18).eq.10)
+ real(kind=kind10) :: arg1
+!$OMP PARALLEL FIRSTPRIVATE(arg1)
+ call qux10(arg1)
+!$OMP END PARALLEL
+end subroutine
+
+!CHECK-KIND16-LABEL: func @_QPfirstprivate_real16(
+!CHECK-KIND16-SAME: %[[ARG1:.*]]: !fir.ref<f128>{{.*}}) {
+!CHECK-KIND16: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_real16Earg1"} : (!fir.ref<f128>, !fir.dscope) -> (!fir.ref<f128>, !fir.ref<f128>)
+!CHECK-KIND16: omp.parallel private({{.*firstprivate.*}} {{.*}}#0 -> %[[ARG1_PVT:.*]] : {{.*}}) {
+!CHECK-KIND16: %[[ARG1_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG1_PVT]] {uniq_name = "_QFfirstprivate_real16Earg1"} : (!fir.ref<f128>) -> (!fir.ref<f128>, !fir.ref<f128>)
+!CHECK-KIND16: fir.call @_QPqux16(%[[ARG1_PVT_DECL]]#1) {{.*}} : (!fir.ref<f128>) -> ()
+!CHECK-KIND16: omp.terminator
+!CHECK-KIND16: }
+subroutine firstprivate_real16(arg1)
+ integer, parameter :: kind16 = merge(16, 4, selected_real_kind(p=33).eq.16)
+ real(kind=kind16) :: arg1
+!$OMP PARALLEL FIRSTPRIVATE(arg1)
+ call qux16(arg1)
+!$OMP END PARALLEL
+end subroutine
+
 !CHECK-LABEL: func.func @_QPmultiple_firstprivate(
 !CHECK-SAME: %[[A_ADDR:.*]]: !fir.ref<i32> {fir.bindc_name = "a"},
 !CHECK-SAME: %[[B_ADDR:.*]]: !fir.ref<i32> {fir.bindc_name = "b"}) {
diff --git flang/test/Lower/OpenMP/parallel-private-clause-fixes.f90 flang/test/Lower/OpenMP/parallel-private-clause-fixes.f90
index 99323e69113b..2c1b4d9e5d77 100644
--- flang/test/Lower/OpenMP/parallel-private-clause-fixes.f90
+++ flang/test/Lower/OpenMP/parallel-private-clause-fixes.f90
@@ -3,30 +3,23 @@
 ! RUN: bbc -fopenmp -emit-hlfir %s -o - \
 ! RUN: | FileCheck %s
-! CHECK: omp.private {type = private} @[[BOX_HEAP_CHAR_PRIVATIZER:_QFsub01Eaaa_private_ref_box_heap_c8xU]] : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>> alloc {
-! CHECK: ^bb0(%[[ORIG_REF:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>):
-! CHECK: %[[VAL_4:.*]] = fir.alloca !fir.box<!fir.heap<!fir.char<1,?>>> {bindc_name = "aaa", pinned, uniq_name = "_QFsub01Eaaa"}
+! CHECK: omp.private {type = private} @[[BOX_HEAP_CHAR_PRIVATIZER:_QFsub01Eaaa_private_box_heap_c8xU]] : !fir.box<!fir.heap<!fir.char<1,?>>> init {
+! CHECK: ^bb0(%[[ORIG_REF:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, %[[VAL_4:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>):
 ! CHECK: %[[VAL_5:.*]] = fir.load %[[ORIG_REF]] : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>
+! CHECK: %[[ELESIZE:.*]] = fir.box_elesize %[[VAL_5]]
 ! CHECK: %[[VAL_6:.*]] = fir.box_addr %[[VAL_5]] : (!fir.box<!fir.heap<!fir.char<1,?>>>) -> !fir.heap<!fir.char<1,?>>
 ! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]] : (!fir.heap<!fir.char<1,?>>) -> i64
 ! CHECK: %[[VAL_8:.*]] = arith.constant 0 : i64
-! CHECK: %[[VAL_9:.*]] = arith.cmpi ne, %[[VAL_7]], %[[VAL_8]] : i64
+! CHECK: %[[VAL_9:.*]] = arith.cmpi eq, %[[VAL_7]], %[[VAL_8]] : i64
 ! CHECK: fir.if %[[VAL_9]] {
-! CHECK: %[[ELEM_SIZE:.*]] = fir.box_elesize %{{.*}} : (!fir.box<!fir.heap<!fir.char<1,?>>>) -> index
-! CHECK: %[[VAL_10:.*]] = arith.constant 0 : index
-! CHECK: %[[VAL_11:.*]] = arith.cmpi sgt, %[[ELEM_SIZE]], %[[VAL_10]] : index
-! CHECK: %[[VAL_12:.*]] = arith.select %[[VAL_11]], %[[ELEM_SIZE]], %[[VAL_10]] : index
-! CHECK: %[[VAL_13:.*]] = fir.allocmem !fir.char<1,?>(%[[VAL_12]] : index) {fir.must_be_heap = true, uniq_name = "_QFsub01Eaaa.alloc"}
-! CHECK: %[[VAL_14:.*]] = fir.embox %[[VAL_13]] typeparams %[[VAL_12]] : (!fir.heap<!fir.char<1,?>>, index) -> !fir.box<!fir.heap<!fir.char<1,?>>>
-! CHECK: fir.store %[[VAL_14]] to %[[VAL_4]] : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>
-! CHECK: } else {
-! CHECK: %[[VAL_15:.*]] = fir.zero_bits !fir.heap<!fir.char<1,?>>
-! CHECK: %[[VAL_16:.*]] = arith.constant 0 : index
-! CHECK: %[[VAL_17:.*]] = fir.embox %[[VAL_15]] typeparams %[[VAL_16]] : (!fir.heap<!fir.char<1,?>>, index) -> !fir.box<!fir.heap<!fir.char<1,?>>>
+! CHECK: %[[VAL_17:.*]] = fir.embox %[[VAL_6]] typeparams %[[ELESIZE]] : (!fir.heap<!fir.char<1,?>>, index) -> !fir.box<!fir.heap<!fir.char<1,?>>>
 ! CHECK: fir.store %[[VAL_17]] to %[[VAL_4]] : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>
+! CHECK: } else {
+! CHECK: %[[VAL_13:.*]] = fir.allocmem !fir.char<1,?>(%[[ELESIZE]] : index)
+! CHECK: %[[VAL_14:.*]] = fir.embox %[[VAL_13]] typeparams %[[ELESIZE]] : (!fir.heap<!fir.char<1,?>>, index) -> !fir.box<!fir.heap<!fir.char<1,?>>>
+! CHECK: fir.store %[[VAL_14]] to %[[VAL_4]] : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>
 ! CHECK: }
-! CHECK: %[[VAL_18:.*]]:2 = hlfir.declare %[[VAL_4]] {fortran_attrs = #{{.*}}<allocatable>, uniq_name = "_QFsub01Eaaa"} : (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>)
-!CHECK: omp.yield(%[[VAL_18]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>)
+!CHECK: omp.yield(%[[VAL_4]] : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>)
 !CHECK: } dealloc {
 !CHECK: ^bb0(%[[ORIG_REF:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>):
 ! CHECK: %[[VAL_19:.*]] = fir.load %[[ORIG_REF]] : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>
@@ -35,13 +28,7 @@
 ! CHECK: %[[VAL_22:.*]] = arith.constant 0 : i64
 ! CHECK: %[[VAL_23:.*]] = arith.cmpi ne, %[[VAL_21]], %[[VAL_22]] : i64
 ! CHECK: fir.if %[[VAL_23]] {
-! CHECK: %[[VAL_24:.*]] = fir.load %[[ORIG_REF]] : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>
-! CHECK: %[[VAL_25:.*]] = fir.box_addr %[[VAL_24]] : (!fir.box<!fir.heap<!fir.char<1,?>>>) -> !fir.heap<!fir.char<1,?>>
-! CHECK: fir.freemem %[[VAL_25]] : !fir.heap<!fir.char<1,?>>
-! CHECK: %[[VAL_26:.*]] = fir.zero_bits !fir.heap<!fir.char<1,?>>
-! CHECK: %[[VAL_27:.*]] = arith.constant 0 : index
-! CHECK: %[[VAL_28:.*]] = fir.embox %[[VAL_26]] typeparams %[[VAL_27]] : (!fir.heap<!fir.char<1,?>>, index) -> !fir.box<!fir.heap<!fir.char<1,?>>>
-! CHECK: fir.store %[[VAL_28]] to %[[ORIG_REF]] : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>
+! CHECK: fir.freemem %[[VAL_20]] : !fir.heap<!fir.char<1,?>>
 !CHECK: }
 !CHECK: omp.yield
 !CHECK: }
diff --git flang/test/Lower/OpenMP/parallel-private-clause-str.f90 flang/test/Lower/OpenMP/parallel-private-clause-str.f90
index 70cb4a9fde3b..44cb08e029aa 100644
--- flang/test/Lower/OpenMP/parallel-private-clause-str.f90
+++ flang/test/Lower/OpenMP/parallel-private-clause-str.f90
@@ -8,45 +8,38 @@
 ! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 \
 ! RUN: | FileCheck %s
-!CHECK: omp.private {type = private} @[[STR_ARR_PRIVATIZER:_QFtest_allocatable_string_arrayEc_private_ref_box_heap_Uxc8xU]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>> alloc {
-!CHECK: ^bb0(%[[ORIG_REF:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>):
-!CHECK: %[[C_PVT_BOX_REF:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>> {bindc_name = "c", pinned, uniq_name = "_QFtest_allocatable_string_arrayEc"}
+!CHECK: omp.private {type = private} @[[STR_ARR_PRIVATIZER:_QFtest_allocatable_string_arrayEc_private_box_heap_Uxc8xU]] : [[TYPE:.*]] init {
+!CHECK: ^bb0(%[[ORIG_REF:.*]]: !fir.ref<[[TYPE]]>, %[[C_PVT_BOX_REF:.*]]: !fir.ref<[[TYPE]]>):
 !CHECK: %{{.*}} = fir.load %[[ORIG_REF]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>
 !CHECK: fir.if %{{.*}} {
-!CHECK: %[[C_PVT_ALLOC:.*]] = fir.allocmem !fir.array<?x!fir.char<1,?>>(%{{.*}} : index), %{{.*}} {fir.must_be_heap = true, uniq_name = "_QFtest_allocatable_string_arrayEc.alloc"}
-!CHECK: %[[C_PVT_BOX:.*]] = fir.embox %[[C_PVT_ALLOC]](%{{.*}}) typeparams %{{.*}} : (!fir.heap<!fir.array<?x!fir.char<1,?>>>, !fir.shapeshift<1>, index) -> !fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>
+!CHECK: } else {
+!CHECK: %[[C_PVT_ALLOC:.*]] = fir.allocmem !fir.array<?x!fir.char<1,?>>(%{{.*}} : index), %{{.*}}
+!CHECK: %[[C_PVT_BOX:.*]] = fir.rebox
 !CHECK: fir.store %[[C_PVT_BOX]] to %[[C_PVT_BOX_REF]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>
 !CHECK: }
-!CHECK: %[[C_PVT_DECL:.*]]:2 = hlfir.declare %[[C_PVT_BOX_REF]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_allocatable_string_arrayEc"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>)
-!CHECK: omp.yield(%[[C_PVT_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>)
+!CHECK: omp.yield(%[[C_PVT_BOX_REF]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>)
 !CHECK: } dealloc {
 !CHECK: ^bb0(%[[ORIG_REF:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>):
 !CHECK: %{{.*}} = fir.load %[[ORIG_REF]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>
 !CHECK: fir.if %{{.*}} {
-!CHECK: %[[C_PVT_BOX:.*]] = fir.load %[[ORIG_REF]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>
-!CHECK: %[[C_PVT_ADDR:.*]] = fir.box_addr %[[C_PVT_BOX]] : (!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>) -> !fir.heap<!fir.array<?x!fir.char<1,?>>>
-!CHECK: fir.freemem %[[C_PVT_ADDR]] : !fir.heap<!fir.array<?x!fir.char<1,?>>>
+!CHECK: fir.freemem %{{.*}} : !fir.heap<!fir.array<?x!fir.char<1,?>>>
!CHECK: } !CHECK: omp.yield !CHECK: } -!CHECK: omp.private {type = private} @[[STR_PRIVATIZER:_QFtest_allocatable_stringEc_private_ref_box_heap_c8xU]] : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>> alloc { -!CHECK: ^bb0(%[[ORIG_REF:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>): -!CHECK: %[[C_PVT_BOX_REF:.*]] = fir.alloca !fir.box<!fir.heap<!fir.char<1,?>>> {bindc_name = "c", pinned, uniq_name = "_QFtest_allocatable_stringEc"} +!CHECK: omp.private {type = private} @[[STR_PRIVATIZER:_QFtest_allocatable_stringEc_private_box_heap_c8xU]] : [[TYPE:!fir.box<!fir.heap<!fir.char<1,\?>>>]] init { +!CHECK: ^bb0(%[[ORIG_REF:.*]]: !fir.ref<[[TYPE]]>, %[[C_PVT_BOX_REF:.*]]: !fir.ref<[[TYPE]]>): !CHECK: %[[C_BOX:.*]] = fir.load %[[ORIG_REF]] : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>> !CHECK: fir.if %{{.*}} { -!CHECK: %[[C_PVT_MEM:.*]] = fir.allocmem !fir.char<1,?>(%{{.*}} : index) {fir.must_be_heap = true, uniq_name = "_QFtest_allocatable_stringEc.alloc"} +!CHECK: %[[C_PVT_MEM:.*]] = fir.allocmem !fir.char<1,?>(%{{.*}} : index) !CHECK: %[[C_PVT_BOX:.*]] = fir.embox %[[C_PVT_MEM]] typeparams %{{.*}} : (!fir.heap<!fir.char<1,?>>, index) -> !fir.box<!fir.heap<!fir.char<1,?>>> !CHECK: fir.store %[[C_PVT_BOX]] to %[[C_PVT_BOX_REF]] : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>> !CHECK: } -!CHECK: %[[C_PVT_DECL:.*]]:2 = hlfir.declare %[[C_PVT_BOX_REF]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_allocatable_stringEc"} : (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>) -!CHECK: omp.yield(%[[C_PVT_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>) +!CHECK: omp.yield(%[[C_PVT_BOX_REF]] : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>) !CHECK: } dealloc { !CHECK: ^bb0(%[[ORIG_REF:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>): !CHECK: fir.if %{{.*}} { -!CHECK: %[[C_PVT_BOX:.*]] = fir.load %[[ORIG_REF]] : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>> -!CHECK: %[[C_PVT_BOX_ADDR:.*]] = fir.box_addr %[[C_PVT_BOX]] : (!fir.box<!fir.heap<!fir.char<1,?>>>) -> !fir.heap<!fir.char<1,?>> -!CHECK: fir.freemem %[[C_PVT_BOX_ADDR]] : !fir.heap<!fir.char<1,?>> +!CHECK: fir.freemem %{{.*}} : !fir.heap<!fir.char<1,?>> !CHECK: } !CHECK: omp.yield !CHECK: } diff --git flang/test/Lower/OpenMP/parallel-private-clause.f90 flang/test/Lower/OpenMP/parallel-private-clause.f90 index 7114314df05d..3ed2efb2b592 100644 --- flang/test/Lower/OpenMP/parallel-private-clause.f90 +++ flang/test/Lower/OpenMP/parallel-private-clause.f90 @@ -5,12 +5,10 @@ ! RUN: bbc --use-desc-for-alloc=false -fopenmp -emit-hlfir %s -o - \ ! RUN: | FileCheck %s --check-prefix=FIRDialect -! FIRDialect: omp.private {type = private} @_QFsimd_loop_1Er_private_ref_box_heap_f32 {{.*}} alloc { -! FIRDialect: [[R:%.*]] = fir.alloca !fir.box<!fir.heap<f32>> {bindc_name = "r", pinned, uniq_name = "{{.*}}Er"} +! FIRDialect: omp.private {type = private} @_QFsimd_loop_1Er_private_box_heap_f32 : !fir.box<!fir.heap<f32>> init { +! FIRDialect: fir.store {{%.*}} to [[R:.*]] : !fir.ref<!fir.box<!fir.heap<f32>>> ! FIRDialect: fir.store {{%.*}} to [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>> -! FIRDialect: fir.store {{%.*}} to [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>> -! FIRDialect: [[R_DECL:%.*]]:2 = hlfir.declare [[R]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "{{.*}}r"} : (!fir.ref<!fir.box<!fir.heap<f32>>>) -> (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>) -! 
FIRDialect: omp.yield([[R_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<f32>>>) +! FIRDialect: omp.yield([[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>) ! FIRDialect: } dealloc { ! FIRDialect: ^bb0([[R_DECL:%.*]]: !fir.ref<!fir.box<!fir.heap<f32>>>): ! FIRDialect: {{%.*}} = fir.load [[R_DECL]] : !fir.ref<!fir.box<!fir.heap<f32>>> @@ -18,36 +16,31 @@ ! FIRDialect: [[LD:%.*]] = fir.load [[R_DECL]] : !fir.ref<!fir.box<!fir.heap<f32>>> ! FIRDialect: [[AD:%.*]] = fir.box_addr [[LD]] : (!fir.box<!fir.heap<f32>>) -> !fir.heap<f32> ! FIRDialect: fir.freemem [[AD]] : !fir.heap<f32> -! FIRDialect: fir.store {{%.*}} to [[R_DECL]] : !fir.ref<!fir.box<!fir.heap<f32>>> ! FIRDialect: omp.yield ! FIRDialect: } -!FIRDialect: omp.private {type = private} @[[DERIVED_PRIVATIZER:_QFprivate_clause_derived_typeEt_private_ref_rec__QFprivate_clause_derived_typeTmy_type]] : !fir.ref<!fir.type<_QFprivate_clause_derived_typeTmy_type{t_i:i32,t_arr:!fir.array<5xi32>}>> alloc { -!FIRDialect: ^bb0(%{{.*}}: !fir.ref<!fir.type<_QFprivate_clause_derived_typeTmy_type{t_i:i32,t_arr:!fir.array<5xi32>}>>): -!FIRDialect: %[[PRIV_ALLOC:.*]] = fir.alloca !fir.type<_QFprivate_clause_derived_typeTmy_type{t_i:i32,t_arr:!fir.array<5xi32>}> {bindc_name = "t", pinned, uniq_name = "_QFprivate_clause_derived_typeEt"} -!FIRDialect: %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOC]] {uniq_name = "_QFprivate_clause_derived_typeEt"} : (!fir.ref<!fir.type<_QFprivate_clause_derived_typeTmy_type{t_i:i32,t_arr:!fir.array<5xi32>}>>) -> (!fir.ref<!fir.type<_QFprivate_clause_derived_typeTmy_type{t_i:i32,t_arr:!fir.array<5xi32>}>>, !fir.ref<!fir.type<_QFprivate_clause_derived_typeTmy_type{t_i:i32,t_arr:!fir.array<5xi32>}>>) -!FIRDialect: omp.yield(%[[PRIV_DECL]]#0 : !fir.ref<!fir.type<_QFprivate_clause_derived_typeTmy_type{t_i:i32,t_arr:!fir.array<5xi32>}>>) -!FIRDialect: } +!FIRDialect: omp.private {type = private} @[[DERIVED_PRIVATIZER:_QFprivate_clause_derived_typeEt_private_rec__QFprivate_clause_derived_typeTmy_type]] : !fir.type<_QFprivate_clause_derived_typeTmy_type{t_i:i32,t_arr:!fir.array<5xi32>}> !FIRDialect: func @_QPprivate_clause(%[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "arg1"}, %[[ARG2:.*]]: !fir.ref<!fir.array<10xi32>> {fir.bindc_name = "arg2"}, %[[ARG3:.*]]: !fir.boxchar<1> {fir.bindc_name = "arg3"}, %[[ARG4:.*]]: !fir.boxchar<1> {fir.bindc_name = "arg4"}) { !FIRDialect-DAG: %[[ALPHA:.*]] = fir.alloca i32 {bindc_name = "alpha", uniq_name = "{{.*}}alpha"} -!FIRDialect-DAG: %[[ALPHA_DECL:.*]]:2 = hlfir.declare %[[ALPHA]] {uniq_name = "{{.*}}alpha"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) !FIRDialect-DAG: %[[ALPHA_ARRAY:.*]] = fir.alloca !fir.array<10xi32> {bindc_name = "alpha_array", uniq_name = "{{.*}}alpha_array"} -!FIRDialect-DAG: %[[ALPHA_ARRAY_DECL:.*]]:2 = hlfir.declare %[[ALPHA_ARRAY]]({{.*}}) {uniq_name = "{{.*}}alpha_array"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>) !FIRDialect-DAG: %[[BETA:.*]] = fir.alloca !fir.char<1,5> {bindc_name = "beta", uniq_name = "{{.*}}beta"} -!FIRDialect-DAG: %[[BETA_DECL:.*]]:2 = hlfir.declare %[[BETA]] typeparams {{.*}} {uniq_name = "{{.*}}beta"} : (!fir.ref<!fir.char<1,5>>, index) -> (!fir.ref<!fir.char<1,5>>, !fir.ref<!fir.char<1,5>>) !FIRDialect-DAG: %[[BETA_ARRAY:.*]] = fir.alloca !fir.array<10x!fir.char<1,5>> {bindc_name = "beta_array", uniq_name = "{{.*}}beta_array"} + +!FIRDialect-DAG: %[[ALPHA_DECL:.*]]:2 = hlfir.declare %[[ALPHA]] {uniq_name = "{{.*}}alpha"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) 
+!FIRDialect-DAG: %[[ALPHA_ARRAY_DECL:.*]]:2 = hlfir.declare %[[ALPHA_ARRAY]]({{.*}}) {uniq_name = "{{.*}}alpha_array"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>) +!FIRDialect-DAG: %[[BETA_DECL:.*]]:2 = hlfir.declare %[[BETA]] typeparams {{.*}} {uniq_name = "{{.*}}beta"} : (!fir.ref<!fir.char<1,5>>, index) -> (!fir.ref<!fir.char<1,5>>, !fir.ref<!fir.char<1,5>>) !FIRDialect-DAG: %[[BETA_ARRAY_DECL:.*]]:2 = hlfir.declare %[[BETA_ARRAY]]({{.*}}) typeparams {{.*}} {uniq_name = "{{.*}}beta_array"} : (!fir.ref<!fir.array<10x!fir.char<1,5>>>, !fir.shape<1>, index) -> (!fir.ref<!fir.array<10x!fir.char<1,5>>>, !fir.ref<!fir.array<10x!fir.char<1,5>>>) -!FIRDialect-DAG: omp.parallel private(@{{.*}} %{{.*}}#0 -> %[[ALPHA_PVT:.*]], @{{.*}} %{{.*}}#0 -> %[[ALPHA_ARRAY_PVT:.*]], @{{.*}} %{{.*}}#0 -> %[[BETA_PVT:.*]], @{{.*}} %{{.*}}#0 -> %[[BETA_ARRAY_PVT:.*]], @{{.*}} %{{.*}}#0 -> %[[ARG1_PVT:.*]], @{{.*}} %{{.*}}#0 -> %[[ARG2_PVT:.*]], @{{.*}} %{{.*}}#0 -> %[[ARG3_PVT:.*]], @{{.*}} %{{.*}}#0 -> %[[ARG4_PVT:.*]] : {{.*}}) { +!FIRDialect: omp.parallel private(@{{.*}} %[[ALPHA_DECL]]#0 -> %[[ALPHA_PVT:.*]], @{{.*}} %{{.*}} -> %[[ALPHA_ARRAY_PVT:.*]], @{{.*}} %{{.*}}#0 -> %[[BETA_PVT:.*]], @{{.*}} %{{.*}} -> %[[BETA_ARRAY_PVT:.*]], @{{.*}} %{{.*}}#0 -> %[[ARG1_PVT:.*]], @{{.*}} %{{.*}} -> %[[ARG2_PVT:.*]], @{{.*}} %{{.*}}#0 -> %[[ARG3_PVT:.*]], @{{.*}} %{{.*}} -> %[[ARG4_PVT:.*]] : {{.*}}) { !FIRDialect-DAG: %[[ALPHA_PVT_DECL:.*]]:2 = hlfir.declare %[[ALPHA_PVT]] {uniq_name = "{{.*}}alpha"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) -!FIRDialect-DAG: %[[ALPHA_ARRAY_PVT_DECL:.*]]:2 = hlfir.declare %[[ALPHA_ARRAY_PVT]]({{.*}}) {uniq_name = "{{.*}}alpha_array"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>) +!FIRDialect-DAG: %[[ALPHA_ARRAY_PVT_DECL:.*]]:2 = hlfir.declare %[[ALPHA_ARRAY_PVT]] {uniq_name = "{{.*}}alpha_array"} : !FIRDialect-DAG: %[[BETA_PVT_DECL:.*]]:2 = hlfir.declare %[[BETA_PVT]] typeparams {{.*}} {uniq_name = "{{.*}}beta"} : (!fir.ref<!fir.char<1,5>>, index) -> (!fir.ref<!fir.char<1,5>>, !fir.ref<!fir.char<1,5>>) -!FIRDialect-DAG: %[[BETA_ARRAY_PVT_DECL:.*]]:2 = hlfir.declare %[[BETA_ARRAY_PVT]]({{.*}}) typeparams {{.*}} {uniq_name = "{{.*}}beta_array"} : (!fir.ref<!fir.array<10x!fir.char<1,5>>>, !fir.shape<1>, index) -> (!fir.ref<!fir.array<10x!fir.char<1,5>>>, !fir.ref<!fir.array<10x!fir.char<1,5>>>) +!FIRDialect-DAG: %[[BETA_ARRAY_PVT_DECL:.*]]:2 = hlfir.declare %[[BETA_ARRAY_PVT]] {uniq_name = "{{.*}}beta_array"} : !FIRDialect-DAG: %[[ARG1_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG1_PVT]] {uniq_name = "{{.*}}arg1"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) -!FIRDialect-DAG: %[[ARG2_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG2_PVT]]({{.*}}) {uniq_name = "{{.*}}arg2"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>) +!FIRDialect-DAG: %[[ARG2_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG2_PVT]] {uniq_name = "{{.*}}arg2"} : !FIRDialect-DAG: %[[ARG3_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG3_PVT]] typeparams {{.*}} {uniq_name = "{{.*}}arg3"} : (!fir.ref<!fir.char<1,5>>, index) -> (!fir.ref<!fir.char<1,5>>, !fir.ref<!fir.char<1,5>>) -!FIRDialect-DAG: %[[ARG4_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG4_PVT]]({{.*}}) typeparams {{.*}} {uniq_name = "{{.*}}arg4"} : (!fir.ref<!fir.array<10x!fir.char<1,5>>>, !fir.shape<1>, index) -> (!fir.ref<!fir.array<10x!fir.char<1,5>>>, 
!fir.ref<!fir.array<10x!fir.char<1,5>>>)
+!FIRDialect-DAG: %[[ARG4_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG4_PVT]] {uniq_name = "{{.*}}arg4"} :
!FIRDialect: omp.terminator
!FIRDialect: }
diff --git flang/test/Lower/OpenMP/parallel-reduction-allocatable-array.f90 flang/test/Lower/OpenMP/parallel-reduction-allocatable-array.f90
index dabd495d733b..25dbb75c54a8 100644
--- flang/test/Lower/OpenMP/parallel-reduction-allocatable-array.f90
+++ flang/test/Lower/OpenMP/parallel-reduction-allocatable-array.f90
@@ -30,7 +30,9 @@ end program
! CHECK: %[[C0_I64:.*]] = arith.constant 0 : i64
! CHECK: %[[IS_NULL:.*]] = arith.cmpi eq, %[[ADDRI]], %[[C0_I64]] : i64
! CHECK: fir.if %[[IS_NULL]] {
-! CHECK: %[[NULL_BOX:.*]] = fir.embox %[[ADDR]] : (!fir.heap<!fir.array<?xi32>>) -> !fir.box<!fir.heap<!fir.array<?xi32>>>
+! CHECK: %[[C0_INDEX:.*]] = arith.constant 0 : index
+! CHECK: %[[SHAPE:.*]] = fir.shape %[[C0_INDEX]]
+! CHECK: %[[NULL_BOX:.*]] = fir.embox %[[ADDR]](%[[SHAPE]]) : (!fir.heap<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.heap<!fir.array<?xi32>>>
! CHECK: fir.store %[[NULL_BOX]] to %[[ALLOC]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
! CHECK: } else {
! CHECK: %[[VAL_3:.*]] = arith.constant 0 : index
diff --git flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90 flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90
index 1e07018a6887..a22a8f693d8a 100644
--- flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90
+++ flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90
@@ -31,7 +31,9 @@ end program
! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i64
! CHECK: %[[VAL_7:.*]] = arith.cmpi eq, %[[VAL_5]], %[[VAL_6]] : i64
! CHECK: fir.if %[[VAL_7]] {
-! CHECK: %[[VAL_8:.*]] = fir.embox %[[VAL_4]] : (!fir.ptr<!fir.array<?xi32>>) -> !fir.box<!fir.ptr<!fir.array<?xi32>>>
+! CHECK: %[[C0:.*]] = arith.constant 0 : index
+! CHECK: %[[SHAPE:.*]] = fir.shape %[[C0]]
+! CHECK: %[[VAL_8:.*]] = fir.embox %[[VAL_4]](%[[SHAPE]]) : (!fir.ptr<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.ptr<!fir.array<?xi32>>>
! CHECK: fir.store %[[VAL_8]] to %[[ALLOC]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
! CHECK: } else {
! CHECK: %[[VAL_9:.*]] = arith.constant 0 : index
diff --git flang/test/Lower/OpenMP/pointer-to-array.f90 flang/test/Lower/OpenMP/pointer-to-array.f90
new file mode 100644
index 000000000000..1861b3907bcf
--- /dev/null
+++ flang/test/Lower/OpenMP/pointer-to-array.f90
@@ -0,0 +1,43 @@
+! Regression test for a crash when compiling the privatizer for a pointer to
+! an array: the fir.embox was not given a shape, which it requires.
+
+!RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s
+
+! ALLOCATABLE case (2nd subroutine)
+!CHECK-LABEL: omp.private {type = firstprivate}
+!CHECK-SAME: @{{.*}} : !fir.box<!fir.heap<!fir.array<?x!fir.type<{{.*}}>>>> init {
+!CHECK: if %{{.*}} {
+!CHECK: %[[SHAPE:.*]] = fir.shape
+!CHECK: %[[BOX:.*]] = fir.embox %{{.*}}(%[[SHAPE]])
+!CHECK: } else {
+
+! 
POINTER case (1st subroutine) +!CHECK-LABEL: omp.private {type = firstprivate} +!CHECK-SAME: @{{.*}} : !fir.box<!fir.ptr<!fir.array<?x!fir.type<{{.*}}>>>> init { +!CHECK: %[[SHAPE:.*]] = fir.shape +!CHECK: %[[ADDR:.*]] = fir.zero_bits +!CHECK: %[[BOX:.*]] = fir.embox %[[ADDR]](%[[SHAPE]]) + +subroutine pointer_to_array_derived + type t + integer :: i + end type + type(t), pointer :: a(:) + allocate(a(1)) + a(1)%i = 2 + !$omp parallel firstprivate(a) + if (a(1)%i/=2) stop 2 + !$omp end parallel +end subroutine + +subroutine allocatable_array_derived + type t + integer :: i + end type + type(t), allocatable :: a(:) + allocate(a(1)) + a(1)%i = 2 + !$omp parallel firstprivate(a) + if (a(1)%i/=2) stop 2 + !$omp end parallel +end subroutine diff --git flang/test/Lower/OpenMP/private-commonblock.f90 flang/test/Lower/OpenMP/private-commonblock.f90 index f6d285a3b011..84a604cf1099 100644 --- flang/test/Lower/OpenMP/private-commonblock.f90 +++ flang/test/Lower/OpenMP/private-commonblock.f90 @@ -17,6 +17,8 @@ subroutine private_common !$omp end parallel end subroutine +!CHECK: %[[D_BOX_ADDR:.*]] = fir.alloca !fir.box<!fir.array<5x!fir.char<1,5>>> +!CHECK: %[[B_BOX_ADDR:.*]] = fir.alloca !fir.box<!fir.array<10xf32>> !CHECK: %[[BLK_ADDR:.*]] = fir.address_of(@blk_) : !fir.ref<!fir.array<74xi8>> !CHECK: %[[I8_ARR:.*]] = fir.convert %[[BLK_ADDR]] : (!fir.ref<!fir.array<74xi8>>) -> !fir.ref<!fir.array<?xi8>> !CHECK: %[[C0:.*]] = arith.constant 0 : index @@ -48,17 +50,24 @@ end subroutine !CHECK: %[[D_REF:.*]] = fir.convert %[[D_DECL]]#1 : (!fir.ref<!fir.array<5x!fir.char<1,5>>>) -> !fir.ref<!fir.char<1,5>> !CHECK: %[[D_BOX:.*]] = fir.emboxchar %[[D_REF]], %[[TP5]] : (!fir.ref<!fir.char<1,5>>, index) -> !fir.boxchar<1> !CHECK: fir.call @_QPsub1(%[[A_DECL]]#1, %[[B_DECL]]#1, %[[C_BOX]], %[[D_BOX]]) fastmath<contract> : (!fir.ref<i32>, !fir.ref<!fir.array<10xf32>>, !fir.boxchar<1>, !fir.boxchar<1>) -> () -!CHECK: omp.parallel private(@{{.*}} %{{.*}}#0 -> %[[A_PVT_REF:.*]], @{{.*}} %{{.*}}#0 -> %[[B_PVT_REF:.*]], @{{.*}} %{{.*}}#0 -> %[[C_PVT_REF:.*]], @{{.*}} %{{.*}}#0 -> %[[D_PVT_REF:.*]] : {{.*}}) { +!CHECK: %[[B_BOX:.*]] = fir.embox %[[B_DECL]]#0(%[[SH10]]) +!CHECK: fir.store %[[B_BOX]] to %[[B_BOX_ADDR]] +!CHECK: %[[D_BOX:.*]] = fir.embox %[[D_DECL]]#0(%[[SH5]]) +!CHECK: fir.store %[[D_BOX]] to %[[D_BOX_ADDR]] +!CHECK: omp.parallel private(@{{.*}} %{{.*}}#0 -> %[[A_PVT_REF:.*]], @{{.*}} %[[B_BOX_ADDR]] -> %[[B_PVT_REF:.*]], @{{.*}} %{{.*}}#0 -> %[[C_PVT_REF:.*]], @{{.*}} %[[D_BOX_ADDR]] -> %[[D_PVT_REF:.*]] : {{.*}}) { !CHECK: %[[A_PVT_DECL:.*]]:2 = hlfir.declare %[[A_PVT_REF]] {uniq_name = "_QFprivate_clause_commonblockEa"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) -!CHECK: %[[SH10:.*]] = fir.shape %c10{{.*}} : (index) -> !fir.shape<1> -!CHECK: %[[B_PVT_DECL:.*]]:2 = hlfir.declare %[[B_PVT_REF]](%[[SH10]]) {uniq_name = "_QFprivate_clause_commonblockEb"} : (!fir.ref<!fir.array<10xf32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>) +!CHECK: %[[B_PVT_DECL:.*]]:2 = hlfir.declare %[[B_PVT_REF]] {uniq_name = "_QFprivate_clause_commonblockEb"} : !CHECK: %[[C_PVT_DECL:.*]]:2 = hlfir.declare %[[C_PVT_REF]] typeparams %{{.*}} {uniq_name = "_QFprivate_clause_commonblockEc"} : (!fir.ref<!fir.char<1,5>>, index) -> (!fir.ref<!fir.char<1,5>>, !fir.ref<!fir.char<1,5>>) -!CHECK: %[[SH5:.*]] = fir.shape %c5{{.*}} : (index) -> !fir.shape<1> -!CHECK: %[[D_PVT_DECL:.*]]:2 = hlfir.declare %[[D_PVT_REF]](%[[SH5]]) typeparams %c5{{.*}} {uniq_name = "_QFprivate_clause_commonblockEd"} 
: (!fir.ref<!fir.array<5x!fir.char<1,5>>>, !fir.shape<1>, index) -> (!fir.ref<!fir.array<5x!fir.char<1,5>>>, !fir.ref<!fir.array<5x!fir.char<1,5>>>) +!CHECK: %[[D_PVT_DECL:.*]]:2 = hlfir.declare %[[D_PVT_REF]] +!CHECK: %[[B_LOADED:.*]] = fir.load %[[B_PVT_DECL]]#0 +!CHECK: %[[B_ADDR:.*]] = fir.box_addr %[[B_LOADED]] !CHECK: %[[C_PVT_BOX:.*]] = fir.emboxchar %[[C_PVT_DECL]]#1, %{{.*}} : (!fir.ref<!fir.char<1,5>>, index) -> !fir.boxchar<1> -!CHECK: %[[D_PVT_REF:.*]] = fir.convert %[[D_PVT_DECL]]#1 : (!fir.ref<!fir.array<5x!fir.char<1,5>>>) -> !fir.ref<!fir.char<1,5>> + +!CHECK: %[[D_LOADED:.*]] = fir.load %[[D_PVT_DECL]]#0 +!CHECK: %[[D_ADDR:.*]] = fir.box_addr %[[D_LOADED]] +!CHECK: %[[D_PVT_REF:.*]] = fir.convert %[[D_ADDR]] : (!fir.ref<!fir.array<5x!fir.char<1,5>>>) -> !fir.ref<!fir.char<1,5>> !CHECK: %[[D_PVT_BOX:.*]] = fir.emboxchar %[[D_PVT_REF]], %{{.*}} : (!fir.ref<!fir.char<1,5>>, index) -> !fir.boxchar<1> -!CHECK: fir.call @_QPsub2(%[[A_PVT_DECL]]#1, %[[B_PVT_DECL]]#1, %[[C_PVT_BOX]], %[[D_PVT_BOX]]) fastmath<contract> : (!fir.ref<i32>, !fir.ref<!fir.array<10xf32>>, !fir.boxchar<1>, !fir.boxchar<1>) -> () +!CHECK: fir.call @_QPsub2(%[[A_PVT_DECL]]#1, %[[B_ADDR]], %[[C_PVT_BOX]], %[[D_PVT_BOX]]) fastmath<contract> : (!fir.ref<i32>, !fir.ref<!fir.array<10xf32>>, !fir.boxchar<1>, !fir.boxchar<1>) -> () !CHECK: omp.terminator !CHECK: } !CHECK: %[[C_BOX:.*]] = fir.emboxchar %[[C_DECL]]#1, %{{.*}} : (!fir.ref<!fir.char<1,5>>, index) -> !fir.boxchar<1> diff --git flang/test/Lower/OpenMP/private-derived-type.f90 flang/test/Lower/OpenMP/private-derived-type.f90 index df1c7c3f9222..91d8fa753f2e 100644 --- flang/test/Lower/OpenMP/private-derived-type.f90 +++ flang/test/Lower/OpenMP/private-derived-type.f90 @@ -15,16 +15,14 @@ subroutine s4 !$omp end parallel end subroutine s4 -! CHECK: omp.private {type = private} @[[DERIVED_PRIV:.*]] : !fir.ref<!fir.type<{{.*}}y3{x:!fir.box<!fir.heap<i32>>}>> alloc { -! CHECK: %[[VAL_23:.*]] = fir.alloca !fir.type<_QFs4Ty3{x:!fir.box<!fir.heap<i32>>}> {bindc_name = "v", pinned, uniq_name = "_QFs4Ev"} -! CHECK: %[[VAL_25:.*]] = fir.embox %[[VAL_23]] : (!fir.ref<!fir.type<_QFs4Ty3{x:!fir.box<!fir.heap<i32>>}>>) -> !fir.box<!fir.type<_QFs4Ty3{x:!fir.box<!fir.heap<i32>>}>> +! CHECK: omp.private {type = private} @[[DERIVED_PRIV:.*]] : !fir.type<{{.*}}y3{x:!fir.box<!fir.heap<i32>>}> init { +! CHECK: %[[VAL_25:.*]] = fir.embox %[[VAL_23:.*]] : (!fir.ref<!fir.type<_QFs4Ty3{x:!fir.box<!fir.heap<i32>>}>>) -> !fir.box<!fir.type<_QFs4Ty3{x:!fir.box<!fir.heap<i32>>}>> ! CHECK: %[[VAL_26:.*]] = fir.address_of ! CHECK: %[[VAL_27:.*]] = arith.constant 8 : i32 ! CHECK: %[[VAL_28:.*]] = fir.convert %[[VAL_25]] : (!fir.box<!fir.type<_QFs4Ty3{x:!fir.box<!fir.heap<i32>>}>>) -> !fir.box<none> ! CHECK: %[[VAL_29:.*]] = fir.convert %[[VAL_26]] : (!fir.ref<!fir.char<1,{{.*}}>>) -> !fir.ref<i8> ! Check we do call FortranAInitialize on the derived type ! CHECK: fir.call @_FortranAInitialize(%[[VAL_28]], %[[VAL_29]], %[[VAL_27]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i8>, i32) -> () -! CHECK: %[[VAL_24:.*]]:2 = hlfir.declare %[[VAL_23]] {uniq_name = "_QFs4Ev"} : (!fir.ref<!fir.type<_QFs4Ty3{x:!fir.box<!fir.heap<i32>>}>>) -> (!fir.ref<!fir.type<_QFs4Ty3{x:!fir.box<!fir.heap<i32>>}>>, !fir.ref<!fir.type<_QFs4Ty3{x:!fir.box<!fir.heap<i32>>}>>) ! CHECK: } ! 
CHECK-LABEL: func.func @_QPs4() { diff --git flang/test/Lower/OpenMP/same_var_first_lastprivate.f90 flang/test/Lower/OpenMP/same_var_first_lastprivate.f90 index e8e4a0802e00..ee914f23aacf 100644 --- flang/test/Lower/OpenMP/same_var_first_lastprivate.f90 +++ flang/test/Lower/OpenMP/same_var_first_lastprivate.f90 @@ -10,11 +10,7 @@ subroutine first_and_lastprivate !$omp end parallel do end subroutine -! CHECK: omp.private {type = firstprivate} @{{.*}}Evar_firstprivate_ref_i32 : {{.*}} alloc { -! CHECK: %[[ALLOC:.*]] = fir.alloca i32 {{.*}} -! CHECK: %[[ALLOC_DECL:.*]]:2 = hlfir.declare %[[ALLOC]] -! CHECK: omp.yield(%[[ALLOC_DECL]]#0 : !fir.ref<i32>) -! CHECK: } copy { +! CHECK: omp.private {type = firstprivate} @{{.*}}Evar_firstprivate_i32 : {{.*}} copy { ! CHECK: ^{{.*}}(%[[ORIG_REF:.*]]: {{.*}}, %[[PRIV_REF:.*]]: {{.*}}): ! CHECK: %[[ORIG_VAL:.*]] = fir.load %[[ORIG_REF]] ! CHECK: hlfir.assign %[[ORIG_VAL]] to %[[PRIV_REF]] @@ -25,7 +21,7 @@ end subroutine ! CHECK: %[[ORIG_VAR_DECL:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "{{.*}}Evar"} ! CHECK: omp.parallel { ! CHECK: omp.barrier -! CHECK: omp.wsloop private(@{{.*}}var_firstprivate_ref_i32 {{.*}}) { +! CHECK: omp.wsloop private(@{{.*}}var_firstprivate_i32 {{.*}}) { ! CHECK: omp.loop_nest {{.*}} { ! CHECK: %[[PRIV_VAR_DECL:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "{{.*}}Evar"} ! CHECK: fir.if %{{.*}} { diff --git flang/test/Lower/OpenMP/simd.f90 flang/test/Lower/OpenMP/simd.f90 index 0345ace24aaa..fc3d908801ff 100644 --- flang/test/Lower/OpenMP/simd.f90 +++ flang/test/Lower/OpenMP/simd.f90 @@ -254,7 +254,7 @@ subroutine lastprivate_with_simd real :: sum -!CHECK: omp.simd private(@_QFlastprivate_with_simdEsum_private_ref_f32 %[[VAR_SUM_DECLARE]]#0 -> %[[VAR_SUM_PINNED:.*]], @{{.*}}) { +!CHECK: omp.simd private(@_QFlastprivate_with_simdEsum_private_f32 %[[VAR_SUM_DECLARE]]#0 -> %[[VAR_SUM_PINNED:.*]], @{{.*}}) { !CHECK: omp.loop_nest (%[[ARG:.*]]) : i32 = ({{.*}} to ({{.*}}) inclusive step ({{.*}}) { !CHECK: %[[VAR_SUM_PINNED_DECLARE:.*]]:2 = hlfir.declare %[[VAR_SUM_PINNED]] {{.*}} !CHECK: %[[ADD_RESULT:.*]] = arith.addi {{.*}} diff --git flang/test/Lower/OpenMP/task2.f90 flang/test/Lower/OpenMP/task2.f90 index 734e75c5bba0..85f934f109af 100644 --- flang/test/Lower/OpenMP/task2.f90 +++ flang/test/Lower/OpenMP/task2.f90 @@ -2,7 +2,7 @@ !CHECK-LABEL: omp.private -!CHECK-SAME: {type = firstprivate} @[[PRIVATIZER:.*]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> alloc { +!CHECK-SAME: {type = firstprivate} @[[PRIVATIZER:.*]] : !fir.box<!fir.heap<!fir.array<?xi32>>> init { !CHECK: fir.if !CHECK: } copy { !CHECK: fir.if diff --git flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90 flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90 index ce45d09d77a2..f0daef1a4a35 100644 --- flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90 +++ flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90 @@ -44,7 +44,9 @@ end program ! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i64 ! CHECK: %[[VAL_7:.*]] = arith.cmpi eq, %[[VAL_5]], %[[VAL_6]] : i64 ! CHECK: fir.if %[[VAL_7]] { -! CHECK: %[[VAL_8:.*]] = fir.embox %[[VAL_4]] : (!fir.heap<!fir.array<?xi32>>) -> !fir.box<!fir.heap<!fir.array<?xi32>>> +! CHECK: %[[C0:.*]] = arith.constant 0 : index +! CHECK: %[[SHAPE:.*]] = fir.shape %[[C0]] +! CHECK: %[[VAL_8:.*]] = fir.embox %[[VAL_4]](%[[SHAPE]]) : (!fir.heap<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.heap<!fir.array<?xi32>>> ! 
CHECK: fir.store %[[VAL_8]] to %[[ALLOC]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> ! CHECK: } else { ! CHECK: %[[VAL_9:.*]] = arith.constant 0 : index @@ -103,7 +105,9 @@ end program ! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i64 ! CHECK: %[[VAL_7:.*]] = arith.cmpi eq, %[[VAL_5]], %[[VAL_6]] : i64 ! CHECK: fir.if %[[VAL_7]] { -! CHECK: %[[VAL_8:.*]] = fir.embox %[[VAL_4]] : (!fir.heap<!fir.array<?xi32>>) -> !fir.box<!fir.heap<!fir.array<?xi32>>> +! CHECK: %[[C0:.*]] = arith.constant 0 : index +! CHECK: %[[SHAPE:.*]] = fir.shape %[[C0]] +! CHECK: %[[VAL_8:.*]] = fir.embox %[[VAL_4]](%[[SHAPE]]) : (!fir.heap<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.heap<!fir.array<?xi32>>> ! CHECK: fir.store %[[VAL_8]] to %[[ALLOC]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> ! CHECK: } else { ! CHECK: %[[VAL_9:.*]] = arith.constant 0 : index diff --git flang/test/Lower/assignment.f90 flang/test/Lower/assignment.f90 index dde5110bdfa9..defeec5b7edf 100644 --- flang/test/Lower/assignment.f90 +++ flang/test/Lower/assignment.f90 @@ -1,4 +1,4 @@ -! RUN: %flang_fc1 %s -o "-" -emit-fir -cpp -flang-deprecated-no-hlfir | FileCheck %s --check-prefixes=CHECK%if target=x86_64{{.*}} %{,CHECK-X86-64%} +! RUN: %flang_fc1 %s -o "-" -emit-fir -cpp -flang-deprecated-no-hlfir | FileCheck %s --check-prefixes=CHECK,%if flang-supports-f128-math %{F128%} %else %{F64%}%if target=x86_64-unknown-linux{{.*}} %{,CHECK-X86-64%} subroutine sub1(a) integer :: a @@ -261,27 +261,29 @@ end ! CHECK: return %[[RET]] : complex<f32> subroutine real_constant() + integer, parameter :: rk = merge(16, 8, selected_real_kind(33, 4931)==16) real(2) :: a real(4) :: b real(8) :: c #if __x86_64__ real(10) :: d #endif - real(16) :: e + real(rk) :: e a = 2.0_2 b = 4.0_4 c = 8.0_8 #if __x86_64__ d = 10.0_10 #endif - e = 16.0_16 + e = 16.0_rk end ! CHECK: %[[A:.*]] = fir.alloca f16 ! CHECK: %[[B:.*]] = fir.alloca f32 ! CHECK: %[[C:.*]] = fir.alloca f64 ! CHECK-X86-64: %[[D:.*]] = fir.alloca f80 -! CHECK: %[[E:.*]] = fir.alloca f128 +! F128: %[[E:.*]] = fir.alloca f128 +! F64: %[[E:.*]] = fir.alloca f64 ! CHECK: %[[C2:.*]] = arith.constant 2.000000e+00 : f16 ! CHECK: fir.store %[[C2]] to %[[A]] : !fir.ref<f16> ! CHECK: %[[C4:.*]] = arith.constant 4.000000e+00 : f32 @@ -290,8 +292,10 @@ end ! CHECK: fir.store %[[C8]] to %[[C]] : !fir.ref<f64> ! CHECK-X86-64: %[[C10:.*]] = arith.constant 1.000000e+01 : f80 ! CHECK-X86-64: fir.store %[[C10]] to %[[D]] : !fir.ref<f80> -! CHECK: %[[C16:.*]] = arith.constant 1.600000e+01 : f128 -! CHECK: fir.store %[[C16]] to %[[E]] : !fir.ref<f128> +! F128: %[[C16:.*]] = arith.constant 1.600000e+01 : f128 +! F64: %[[C16:.*]] = arith.constant 1.600000e+01 : f64 +! F128: fir.store %[[C16]] to %[[E]] : !fir.ref<f128> +! F64: fir.store %[[C16]] to %[[E]] : !fir.ref<f64> subroutine complex_constant() complex(4) :: a diff --git flang/test/Lower/basic-function.f90 flang/test/Lower/basic-function.f90 index 5f2fabe1b325..c250a988bf7b 100644 --- flang/test/Lower/basic-function.f90 +++ flang/test/Lower/basic-function.f90 @@ -1,4 +1,4 @@ -! RUN: bbc %s -o "-" -emit-fir -hlfir=false | FileCheck %s +! RUN: bbc -emit-hlfir %s -o - | FileCheck %s --check-prefixes=CHECK%if target=x86_64{{.*}} %{,CHECK-KIND10%}%if flang-supports-f128-math %{,CHECK-KIND16%} integer(1) function fct1() end @@ -102,15 +102,19 @@ end ! CHECK-LABEL: func @_QPrfct4() -> f64 ! 
CHECK: return %{{.*}} : f64
-real(10) function rfct5()
+function rfct5()
+  integer, parameter :: kind10 = merge(10, 4, selected_real_kind(p=18).eq.10)
+  real(kind10) :: rfct5
end
-! CHECK-LABEL: func @_QPrfct5() -> f80
-! CHECK: return %{{.*}} : f80
+! CHECK-KIND10-LABEL: func @_QPrfct5() -> f80
+! CHECK-KIND10: return %{{.*}} : f80
-real(16) function rfct6()
+function rfct6()
+  integer, parameter :: kind16 = merge(16, 4, selected_real_kind(p=33).eq.16)
+  real(kind16) :: rfct6
end
-! CHECK-LABEL: func @_QPrfct6() -> f128
-! CHECK: return %{{.*}} : f128
+! CHECK-KIND16-LABEL: func @_QPrfct6() -> f128
+! CHECK-KIND16: return %{{.*}} : f128
complex(2) function cplxfct1()
end
@@ -132,15 +136,19 @@ end
! CHECK-LABEL: func @_QPcplxfct4() -> complex<f64>
! CHECK: return %{{.*}} : complex<f64>
-complex(10) function cplxfct5()
+function cplxfct5()
+  integer, parameter :: kind10 = merge(10, 4, selected_real_kind(p=18).eq.10)
+  complex(kind10) :: cplxfct5
end
-! CHECK-LABEL: func @_QPcplxfct5() -> complex<f80>
-! CHECK: return %{{.*}} : complex<f80>
+! CHECK-KIND10-LABEL: func @_QPcplxfct5() -> complex<f80>
+! CHECK-KIND10: return %{{.*}} : complex<f80>
-complex(16) function cplxfct6()
+function cplxfct6()
+  integer, parameter :: kind16 = merge(16, 4, selected_real_kind(p=33).eq.16)
+  complex(kind16) :: cplxfct6
end
-! CHECK-LABEL: func @_QPcplxfct6() -> complex<f128>
-! CHECK: return %{{.*}} : complex<f128>
+! CHECK-KIND16-LABEL: func @_QPcplxfct6() -> complex<f128>
+! CHECK-KIND16: return %{{.*}} : complex<f128>
function fct_with_character_return(i)
  character(10) :: fct_with_character_return
diff --git flang/test/Lower/entry-statement-init.f90 flang/test/Lower/entry-statement-init.f90
new file mode 100644
index 000000000000..731ccebef687
--- /dev/null
+++ flang/test/Lower/entry-statement-init.f90
@@ -0,0 +1,26 @@
+! RUN: bbc -emit-hlfir -o - %s | FileCheck %s
+
+! Test initialization and finalization of dummy arguments in entry statements.
+
+module m
+  type t
+  end type
+contains
+  subroutine test1(x)
+    class(t), intent(out) :: x
+    entry test1_entry()
+  end subroutine
+  subroutine test2(x)
+    class(t), intent(out) :: x
+    entry test2_entry(x)
+  end subroutine
+end module
+! CHECK-LABEL: func.func @_QMmPtest1_entry(
+! CHECK-NOT: Destroy
+! CHECK-NOT: Initialize
+! CHECK: return
+
+! CHECK-LABEL: func.func @_QMmPtest2_entry(
+! CHECK: Destroy
+! CHECK: Initialize
+! CHECK: return
diff --git flang/test/Lower/math-lowering/abs.f90 flang/test/Lower/math-lowering/abs.f90
index ead603c1d13e..9d3b8b92cfdd 100644
--- flang/test/Lower/math-lowering/abs.f90
+++ flang/test/Lower/math-lowering/abs.f90
@@ -1,9 +1,9 @@
-! RUN: bbc -emit-fir %s -o - --math-runtime=fast | FileCheck --check-prefixes=ALL,FAST %s
-! RUN: %flang_fc1 -emit-fir -mllvm -math-runtime=fast %s -o - | FileCheck --check-prefixes=ALL,FAST %s
-! RUN: bbc -emit-fir %s -o - --math-runtime=relaxed | FileCheck --check-prefixes=ALL,RELAXED %s
-! RUN: %flang_fc1 -emit-fir -mllvm -math-runtime=relaxed %s -o - | FileCheck --check-prefixes=ALL,RELAXED %s
-! RUN: bbc -emit-fir %s -o - --math-runtime=precise | FileCheck --check-prefixes=ALL,PRECISE %s
-! RUN: %flang_fc1 -emit-fir -mllvm -math-runtime=precise %s -o - | FileCheck --check-prefixes=ALL,PRECISE %s
+! RUN: bbc -emit-fir %s -o - --math-runtime=fast | FileCheck --check-prefixes=ALL,FAST,FAST-%if flang-supports-f128-math %{F128%} %else %{F64%} %s
+! 
RUN: %flang_fc1 -emit-fir -mllvm -math-runtime=fast %s -o - | FileCheck --check-prefixes=ALL,FAST,FAST-%if flang-supports-f128-math %{F128%} %else %{F64%} %s +! RUN: bbc -emit-fir %s -o - --math-runtime=relaxed | FileCheck --check-prefixes=ALL,RELAXED,RELAXED-%if flang-supports-f128-math %{F128%} %else %{F64%} %s +! RUN: %flang_fc1 -emit-fir -mllvm -math-runtime=relaxed %s -o - | FileCheck --check-prefixes=ALL,RELAXED,RELAXED-%if flang-supports-f128-math %{F128%} %else %{F64%} %s +! RUN: bbc -emit-fir %s -o - --math-runtime=precise | FileCheck --check-prefixes=ALL,PRECISE,PRECISE-%if flang-supports-f128-math %{F128%} %else %{F64%} %s +! RUN: %flang_fc1 -emit-fir -mllvm -math-runtime=precise %s -o - | FileCheck --check-prefixes=ALL,PRECISE,PRECISE-%if flang-supports-f128-math %{F128%} %else %{F64%} %s function test_real4(x) real :: x, test_real4 @@ -26,13 +26,17 @@ end function ! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @fabs({{%[A-Za-z0-9._]+}}) {{.*}}: (f64) -> f64 function test_real16(x) - real(16) :: x, test_real16 + integer, parameter :: rk = merge(16, 8, selected_real_kind(33, 4931)==16) + real(rk) :: x, test_real16 test_real16 = abs(x) end function ! ALL-LABEL: @_QPtest_real16 -! FAST: {{%[A-Za-z0-9._]+}} = math.absf {{%[A-Za-z0-9._]+}} {{.*}}: f128 -! RELAXED: {{%[A-Za-z0-9._]+}} = math.absf {{%[A-Za-z0-9._]+}} {{.*}}: f128 -! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @llvm.fabs.f128({{%[A-Za-z0-9._]+}}) {{.*}}: (f128) -> f128 +! FAST-F128: {{%[A-Za-z0-9._]+}} = math.absf {{%[A-Za-z0-9._]+}} {{.*}}: f128 +! FAST-F64: {{%[A-Za-z0-9._]+}} = math.absf {{%[A-Za-z0-9._]+}} {{.*}}: f64 +! RELAXED-F128: {{%[A-Za-z0-9._]+}} = math.absf {{%[A-Za-z0-9._]+}} {{.*}}: f128 +! RELAXED-F64: {{%[A-Za-z0-9._]+}} = math.absf {{%[A-Za-z0-9._]+}} {{.*}}: f64 +! PRECISE-F128: {{%[A-Za-z0-9._]+}} = fir.call @llvm.fabs.f128({{%[A-Za-z0-9._]+}}) {{.*}}: (f128) -> f128 +! PRECISE-F64: {{%[A-Za-z0-9._]+}} = fir.call @fabs({{%[A-Za-z0-9._]+}}) {{.*}}: (f64) -> f64 function test_complex4(c) complex(4) :: c, test_complex4 @@ -50,6 +54,6 @@ end function ! PRECISE-DAG: func.func private @fabsf(f32) -> f32 attributes {fir.bindc_name = "fabsf", fir.runtime} ! PRECISE-DAG: func.func private @fabs(f64) -> f64 attributes {fir.bindc_name = "fabs", fir.runtime} -! PRECISE-DAG: func.func private @llvm.fabs.f128(f128) -> f128 attributes {fir.bindc_name = "llvm.fabs.f128", fir.runtime} +! PRECISE-F128-DAG: func.func private @llvm.fabs.f128(f128) -> f128 attributes {fir.bindc_name = "llvm.fabs.f128", fir.runtime} ! PRECISE-DAG: func.func private @cabsf(complex<f32>) -> f32 attributes {fir.bindc_name = "cabsf", fir.runtime} ! PRECISE-DAG: func.func private @cabs(complex<f64>) -> f64 attributes {fir.bindc_name = "cabs", fir.runtime} diff --git flang/test/Lower/math-lowering/aint.f90 flang/test/Lower/math-lowering/aint.f90 index e8b17aad675c..6c7809f8ea1a 100644 --- flang/test/Lower/math-lowering/aint.f90 +++ flang/test/Lower/math-lowering/aint.f90 @@ -1,9 +1,10 @@ -! RUN: bbc -emit-fir %s -o - --math-runtime=fast | FileCheck --check-prefixes=ALL %s -! RUN: %flang_fc1 -emit-fir -mllvm -math-runtime=fast %s -o - | FileCheck --check-prefixes=ALL %s -! RUN: bbc -emit-fir %s -o - --math-runtime=relaxed | FileCheck --check-prefixes=ALL %s -! RUN: %flang_fc1 -emit-fir -mllvm -math-runtime=relaxed %s -o - | FileCheck --check-prefixes=ALL %s -! RUN: bbc -emit-fir %s -o - --math-runtime=precise | FileCheck --check-prefixes=ALL %s -! RUN: %flang_fc1 -emit-fir -mllvm -math-runtime=precise %s -o - | FileCheck --check-prefixes=ALL %s +! 
REQUIRES: x86-registered-target
+! RUN: bbc -target x86_64-unknown-linux-gnu -emit-fir %s -o - --math-runtime=fast | FileCheck --check-prefixes=ALL %s
+! RUN: %flang_fc1 -target x86_64-unknown-linux-gnu -emit-fir -mllvm -math-runtime=fast %s -o - | FileCheck --check-prefixes=ALL %s
+! RUN: bbc -target x86_64-unknown-linux-gnu -emit-fir %s -o - --math-runtime=relaxed | FileCheck --check-prefixes=ALL %s
+! RUN: %flang_fc1 -target x86_64-unknown-linux-gnu -emit-fir -mllvm -math-runtime=relaxed %s -o - | FileCheck --check-prefixes=ALL %s
+! RUN: bbc -target x86_64-unknown-linux-gnu -emit-fir %s -o - --math-runtime=precise | FileCheck --check-prefixes=ALL %s
+! RUN: %flang_fc1 -target x86_64-unknown-linux-gnu -emit-fir -mllvm -math-runtime=precise %s -o - | FileCheck --check-prefixes=ALL %s

function test_real4(x)
  real :: x, test_real4
@@ -21,14 +22,6 @@ end function
! ALL-LABEL: @_QPtest_real8
! ALL: {{%[A-Za-z0-9._]+}} = fir.call @llvm.trunc.f64({{%[A-Za-z0-9._]+}}) {{.*}}: (f64) -> f64

-function test_real10(x)
-  real(10) :: x, test_real10
-  test_real10 = aint(x)
-end function
-
-! ALL-LABEL: @_QPtest_real10
-! ALL: {{%[A-Za-z0-9._]+}} = fir.call @llvm.trunc.f80({{%[A-Za-z0-9._]+}}) {{.*}}: (f80) -> f80
-
! TODO: wait until fp128 is supported well in llvm.trunc
!function test_real16(x)
!  real(16) :: x, test_real16
@@ -37,4 +30,3 @@ end function
! ALL-DAG: func.func private @llvm.trunc.f32(f32) -> f32 attributes {fir.bindc_name = "llvm.trunc.f32", fir.runtime}
! ALL-DAG: func.func private @llvm.trunc.f64(f64) -> f64 attributes {fir.bindc_name = "llvm.trunc.f64", fir.runtime}
-! ALL-DAG: func.func private @llvm.trunc.f80(f80) -> f80 attributes {fir.bindc_name = "llvm.trunc.f80", fir.runtime}
diff --git flang/test/Lower/math-lowering/anint.f90 flang/test/Lower/math-lowering/anint.f90
index 45dc3ef1d858..f39009de7e4f 100644
--- flang/test/Lower/math-lowering/anint.f90
+++ flang/test/Lower/math-lowering/anint.f90
@@ -1,9 +1,11 @@
-! RUN: bbc -emit-fir %s -o - --math-runtime=fast | FileCheck --check-prefixes=ALL,FAST %s
-! RUN: %flang_fc1 -emit-fir -mllvm -math-runtime=fast %s -o - | FileCheck --check-prefixes=ALL,FAST %s
-! RUN: bbc -emit-fir %s -o - --math-runtime=relaxed | FileCheck --check-prefixes=ALL,RELAXED %s
-! RUN: %flang_fc1 -emit-fir -mllvm -math-runtime=relaxed %s -o - | FileCheck --check-prefixes=ALL,RELAXED %s
-! RUN: bbc -emit-fir %s -o - --math-runtime=precise | FileCheck --check-prefixes=ALL,PRECISE %s
-! RUN: %flang_fc1 -emit-fir -mllvm -math-runtime=precise %s -o - | FileCheck --check-prefixes=ALL,PRECISE %s
+! RUN: %flang_fc1 -emit-hlfir -o - -mllvm -math-runtime=fast %s \
+! RUN:   | FileCheck %s --check-prefixes=ALL,FAST%if target=x86_64{{.*}} %{,ALL-KIND10,FAST-KIND10%}
+
+! RUN: %flang_fc1 -emit-hlfir -o - -mllvm -math-runtime=relaxed %s \
+! RUN:   | FileCheck %s --check-prefixes=ALL,RELAXED%if target=x86_64{{.*}} %{,ALL-KIND10,RELAXED-KIND10%}
+
+! RUN: %flang_fc1 -emit-hlfir -o - -mllvm -math-runtime=precise %s \
+! RUN:   | FileCheck %s --check-prefixes=ALL,PRECISE%if target=x86_64{{.*}} %{,ALL-KIND10,PRECISE-KIND10%}

function test_real4(x)
  real :: x, test_real4
@@ -26,14 +28,15 @@ end function
! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @llvm.round.f64({{%[A-Za-z0-9._]+}}) {{.*}}: (f64) -> f64

function test_real10(x)
-  real(10) :: x, test_real10
+  integer, parameter :: kind10 = merge(10, 4, selected_real_kind(p=18).eq.10)
+  real(kind10) :: x, test_real10
  test_real10 = anint(x)
end function
-! ALL-LABEL: @_QPtest_real10
-! 
FAST: {{%[A-Za-z0-9._]+}} = llvm.intr.round({{%[A-Za-z0-9._]+}}) : (f80) -> f80 -! RELAXED: {{%[A-Za-z0-9._]+}} = llvm.intr.round({{%[A-Za-z0-9._]+}}) : (f80) -> f80 -! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @llvm.round.f80({{%[A-Za-z0-9._]+}}) {{.*}}: (f80) -> f80 +! ALL-KIND10-LABEL: @_QPtest_real10 +! FAST-KIND10: {{%[A-Za-z0-9._]+}} = llvm.intr.round({{%[A-Za-z0-9._]+}}) : (f80) -> f80 +! RELAXED-KIND10: {{%[A-Za-z0-9._]+}} = llvm.intr.round({{%[A-Za-z0-9._]+}}) : (f80) -> f80 +! PRECISE-KIND10: {{%[A-Za-z0-9._]+}} = fir.call @llvm.round.f80({{%[A-Za-z0-9._]+}}) {{.*}}: (f80) -> f80 ! TODO: wait until fp128 is supported well in llvm.round !function test_real16(x) @@ -43,4 +46,4 @@ end function ! PRECISE-DAG: func.func private @llvm.round.f32(f32) -> f32 attributes {fir.bindc_name = "llvm.round.f32", fir.runtime} ! PRECISE-DAG: func.func private @llvm.round.f64(f64) -> f64 attributes {fir.bindc_name = "llvm.round.f64", fir.runtime} -! PRECISE-DAG: func.func private @llvm.round.f80(f80) -> f80 attributes {fir.bindc_name = "llvm.round.f80", fir.runtime} +! PRECISE-KIND10-DAG: func.func private @llvm.round.f80(f80) -> f80 attributes {fir.bindc_name = "llvm.round.f80", fir.runtime} diff --git flang/test/Lower/math-lowering/sign.f90 flang/test/Lower/math-lowering/sign.f90 index fbb47cdebef3..534a6679fbef 100644 --- flang/test/Lower/math-lowering/sign.f90 +++ flang/test/Lower/math-lowering/sign.f90 @@ -1,9 +1,11 @@ -! RUN: bbc -emit-fir %s -o - --math-runtime=fast | FileCheck --check-prefixes=ALL,FAST %s -! RUN: %flang_fc1 -emit-fir -mllvm -math-runtime=fast %s -o - | FileCheck --check-prefixes=ALL,FAST %s -! RUN: bbc -emit-fir %s -o - --math-runtime=relaxed | FileCheck --check-prefixes=ALL,RELAXED %s -! RUN: %flang_fc1 -emit-fir -mllvm -math-runtime=relaxed %s -o - | FileCheck --check-prefixes=ALL,RELAXED %s -! RUN: bbc -emit-fir %s -o - --math-runtime=precise | FileCheck --check-prefixes=ALL,PRECISE %s -! RUN: %flang_fc1 -emit-fir -mllvm -math-runtime=precise %s -o - | FileCheck --check-prefixes=ALL,PRECISE %s +! RUN: %flang_fc1 -emit-hlfir -o - -mllvm -math-runtime=fast %s \ +! RUN: | FileCheck %s --check-prefixes=ALL,FAST%if target=x86_64{{.*}} %{,ALL-KIND10,FAST-KIND10%}%if flang-supports-f128-math %{,ALL-KIND16,FAST-KIND16%} + +! RUN: %flang_fc1 -emit-hlfir -o - -mllvm -math-runtime=relaxed %s \ +! RUN: | FileCheck %s --check-prefixes=ALL,RELAXED%if target=x86_64{{.*}} %{,ALL-KIND10,RELAXED-KIND10%}%if flang-supports-f128-math %{,ALL-KIND16,RELAXED-KIND16%} + +! RUN: %flang_fc1 -emit-hlfir -o - -mllvm -math-runtime=precise %s \ +! RUN: | FileCheck %s --check-prefixes=ALL,PRECISE%if target=x86_64{{.*}} %{,ALL-KIND10,PRECISE-KIND10%}%if flang-supports-f128-math %{,ALL-KIND16,PRECISE-KIND16%} function test_real4(x, y) real :: x, y, test_real4 @@ -26,26 +28,28 @@ end function ! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @copysign({{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}}) {{.*}}: (f64, f64) -> f64 function test_real10(x, y) - real(10) :: x, y, test_real10 + integer, parameter :: kind10 = merge(10, 4, selected_real_kind(p=18).eq.10) + real(kind10) :: x, y, test_real10 test_real10 = sign(x, y) end function -! ALL-LABEL: @_QPtest_real10 -! FAST: {{%[A-Za-z0-9._]+}} = math.copysign {{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}} {{.*}}: f80 -! RELAXED: {{%[A-Za-z0-9._]+}} = math.copysign {{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}} {{.*}}: f80 -! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @copysignl({{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}}) {{.*}}: (f80, f80) -> f80 +! ALL-KIND10-LABEL: @_QPtest_real10 +! 
FAST-KIND10: {{%[A-Za-z0-9._]+}} = math.copysign {{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}} {{.*}}: f80 +! RELAXED-KIND10: {{%[A-Za-z0-9._]+}} = math.copysign {{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}} {{.*}}: f80 +! PRECISE-KIND10: {{%[A-Za-z0-9._]+}} = fir.call @copysignl({{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}}) {{.*}}: (f80, f80) -> f80 function test_real16(x, y) - real(16) :: x, y, test_real16 + integer, parameter :: kind16 = merge(16, 4, selected_real_kind(p=33).eq.16) + real(kind16) :: x, y, test_real16 test_real16 = sign(x, y) end function -! ALL-LABEL: @_QPtest_real16 -! FAST: {{%[A-Za-z0-9._]+}} = math.copysign {{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}} {{.*}}: f128 -! RELAXED: {{%[A-Za-z0-9._]+}} = math.copysign {{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}} {{.*}}: f128 -! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @llvm.copysign.f128({{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}}) {{.*}}: (f128, f128) -> f128 +! ALL-KIND16-LABEL: @_QPtest_real16 +! FAST-KIND16: {{%[A-Za-z0-9._]+}} = math.copysign {{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}} {{.*}}: f128 +! RELAXED-KIND16: {{%[A-Za-z0-9._]+}} = math.copysign {{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}} {{.*}}: f128 +! PRECISE-KIND16: {{%[A-Za-z0-9._]+}} = fir.call @llvm.copysign.f128({{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}}) {{.*}}: (f128, f128) -> f128 ! PRECISE-DAG: func.func private @copysignf(f32, f32) -> f32 attributes {fir.bindc_name = "copysignf", fir.runtime} ! PRECISE-DAG: func.func private @copysign(f64, f64) -> f64 attributes {fir.bindc_name = "copysign", fir.runtime} -! PRECISE-DAG: func.func private @copysignl(f80, f80) -> f80 attributes {fir.bindc_name = "copysignl", fir.runtime} -! PRECISE-DAG: func.func private @llvm.copysign.f128(f128, f128) -> f128 attributes {fir.bindc_name = "llvm.copysign.f128", fir.runtime} +! PRECISE-KIND10-DAG: func.func private @copysignl(f80, f80) -> f80 attributes {fir.bindc_name = "copysignl", fir.runtime} +! PRECISE-KIND16-DAG: func.func private @llvm.copysign.f128(f128, f128) -> f128 attributes {fir.bindc_name = "llvm.copysign.f128", fir.runtime} diff --git flang/test/Lower/real-descriptors.f90 flang/test/Lower/real-descriptors.f90 index ff7fdc68e7b3..eb1c4dfae5fd 100644 --- flang/test/Lower/real-descriptors.f90 +++ flang/test/Lower/real-descriptors.f90 @@ -1,43 +1,18 @@ -! RUN: bbc %s -o - | tco | FileCheck %s - -! CHECK-LABEL: define void @_QQmain() -program p - ! CHECK-DAG: alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8 - ! CHECK-DAG: alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8 - ! CHECK-DAG: alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8 - ! CHECK-DAG: alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8 - ! CHECK-DAG: alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8 - ! CHECK-DAG: alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8 - ! CHECK-DAG: alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8 - ! CHECK-DAG: alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8 - ! CHECK-DAG: alloca { x86_fp80, x86_fp80 }, i64 1, align 16 - ! CHECK-DAG: alloca { fp128, fp128 }, i64 1, align 16 - ! CHECK-DAG: alloca { half, half }, i64 1, align 8 - ! CHECK-DAG: alloca { bfloat, bfloat }, i64 1, align 8 - ! CHECK-DAG: alloca { float, float }, i64 1, align 8 - ! CHECK-DAG: alloca { double, double }, i64 1, align 8 - ! CHECK-DAG: alloca x86_fp80, i64 1, align 16 - ! CHECK-DAG: alloca fp128, i64 1, align 16 - ! CHECK-DAG: alloca half, i64 1, align 2 - ! CHECK-DAG: alloca bfloat, i64 1, align 2 - ! CHECK-DAG: alloca float, i64 1, align 4 - ! CHECK-DAG: alloca double, i64 1, align 8 +! 
RUN: bbc %s -o - | tco | FileCheck %s --check-prefixes=CHECK%if target=x86_64{{.*}} %{,CHECK-KIND10%}%if flang-supports-f128-math %{,CHECK-KIND16%}
+
+! CHECK-LABEL: define void {{.*}}test_reals
+subroutine test_reals(x2, x3, x4, x8, c2, c3, c4, c8)
  character(10) :: in = 'NaN NaN'
  real(kind=2) :: x2
  real(kind=3) :: x3
  real(kind=4) :: x4
  real(kind=8) :: x8
-  real(kind=10) :: x10
-  real(kind=16) :: x16
  complex(kind=2) :: c2
  complex(kind=3) :: c3
  complex(kind=4) :: c4
  complex(kind=8) :: c8
-  complex(kind=10) :: c10
-  complex(kind=16) :: c16
  read(in,*) x2
  ! CHECK: insertvalue { ptr, i64, i32, i8, i8, i8, i8 } { ptr undef, i64 ptrtoint (ptr getelementptr (half, ptr null, i32 1) to i64), i32 {{[0-9]*}}, i8 0, i8 25, i8 0, i8 0 }, ptr %{{[0-9]*}}, 0
@@ -57,17 +32,6 @@ program p
  ! CHECK: call i1 @_FortranAioOutputReal64(ptr %{{[0-9]*}}, double %{{[0-9]*}})
  print "(z16)", x8
-  read(in,*) x10
-  ! CHECK: insertvalue { ptr, i64, i32, i8, i8, i8, i8 } { ptr undef, i64 ptrtoint (ptr getelementptr (x86_fp80, ptr null, i32 1) to i64), i32 {{[0-9]*}}, i8 0, i8 29, i8 0, i8 0 }, ptr %{{[0-9]*}}, 0
-  ! CHECK: call i1 @_FortranAioOutputDescriptor(ptr %{{[0-9]*}}, ptr %{{[0-9]*}})
-  print "(z20)", x10
-
-  read(in,*) x16
-  ! CHECK: insertvalue { ptr, i64, i32, i8, i8, i8, i8 } { ptr undef, i64 ptrtoint (ptr getelementptr (fp128, ptr null, i32 1) to i64), i32 {{[0-9]*}}, i8 0, i8 31, i8 0, i8 0 }, ptr %{{[0-9]*}}, 0
-  ! CHECK: call i1 @_FortranAioOutputDescriptor(ptr %{{[0-9]*}}, ptr %{{[0-9]*}})
-  print "(z32)", x16
-
-  print*
  read(in,*) c2
  ! CHECK: insertvalue { ptr, i64, i32, i8, i8, i8, i8 } { ptr undef, i64 ptrtoint (ptr getelementptr ({ half, half }, ptr null, i32 1) to i64), i32 {{[0-9]*}}, i8 0, i8 32, i8 0, i8 0 }, ptr %{{[0-9]*}}, 0
  ! CHECK: call i1 @_FortranAioOutputDescriptor(ptr %{{[0-9]*}}, ptr %{{[0-9]*}})
@@ -86,13 +50,40 @@ program p
  ! CHECK: call i1 @_FortranAioOutputComplex64(ptr %{{[0-9]*}}, double %{{[0-9]*}}, double %{{[0-9]*}})
  print "(z16,' ',z16)", c8
+end
+
+! CHECK-KIND10-LABEL: test_kind10
+subroutine test_kind10(x10, c10)
+  integer, parameter :: kind10 = merge(10, 4, selected_real_kind(p=18).eq.10)
+  character(10) :: in = 'NaN NaN'
+  real(kind=kind10) :: x10
+  complex(kind=kind10) :: c10
+
+  read(in,*) x10
+  ! CHECK-KIND10: insertvalue { ptr, i64, i32, i8, i8, i8, i8 } { ptr undef, i64 ptrtoint (ptr getelementptr (x86_fp80, ptr null, i32 1) to i64), i32 {{[0-9]*}}, i8 0, i8 29, i8 0, i8 0 }, ptr %{{[0-9]*}}, 0
+  ! CHECK-KIND10: call i1 @_FortranAioOutputDescriptor(ptr %{{[0-9]*}}, ptr %{{[0-9]*}})
+  print "(z20)", x10
+
  read(in,*) c10
-  ! CHECK: insertvalue { ptr, i64, i32, i8, i8, i8, i8 } { ptr undef, i64 ptrtoint (ptr getelementptr ({ x86_fp80, x86_fp80 }, ptr null, i32 1) to i64), i32 {{[0-9]*}}, i8 0, i8 36, i8 0, i8 0 }, ptr %{{[0-9]*}}, 0
-  ! CHECK: call i1 @_FortranAioOutputDescriptor(ptr %{{[0-9]*}}, ptr %{{[0-9]*}})
+  ! CHECK-KIND10: insertvalue { ptr, i64, i32, i8, i8, i8, i8 } { ptr undef, i64 ptrtoint (ptr getelementptr ({ x86_fp80, x86_fp80 }, ptr null, i32 1) to i64), i32 {{[0-9]*}}, i8 0, i8 36, i8 0, i8 0 }, ptr %{{[0-9]*}}, 0
+  ! CHECK-KIND10: call i1 @_FortranAioOutputDescriptor(ptr %{{[0-9]*}}, ptr %{{[0-9]*}})
  print "(z20,' ',z20)", c10
+end subroutine
+
+! CHECK-KIND16-LABEL: test_kind16
+subroutine test_kind16(x16, c16)
+  integer, parameter :: kind16 = merge(16, 4, selected_real_kind(p=33).eq.16)
+  character(10) :: in = 'NaN NaN'
+  real(kind=kind16) :: x16
+  complex(kind=kind16) :: c16
+
+  read(in,*) x16
+  ! 
CHECK-KIND16: insertvalue { ptr, i64, i32, i8, i8, i8, i8 } { ptr undef, i64 ptrtoint (ptr getelementptr (fp128, ptr null, i32 1) to i64), i32 {{[0-9]*}}, i8 0, i8 31, i8 0, i8 0 }, ptr %{{[0-9]*}}, 0 + ! CHECK-KIND16: call i1 @_FortranAioOutputDescriptor(ptr %{{[0-9]*}}, ptr %{{[0-9]*}}) + print "(z32)", x16 read(in,*) c16 - ! CHECK: insertvalue { ptr, i64, i32, i8, i8, i8, i8 } { ptr undef, i64 ptrtoint (ptr getelementptr ({ fp128, fp128 }, ptr null, i32 1) to i64), i32 {{[0-9]*}}, i8 0, i8 38, i8 0, i8 0 }, ptr %{{[0-9]*}}, 0 - ! CHECK: call i1 @_FortranAioOutputDescriptor(ptr %{{[0-9]*}}, ptr %{{[0-9]*}}) + ! CHECK-KIND16: insertvalue { ptr, i64, i32, i8, i8, i8, i8 } { ptr undef, i64 ptrtoint (ptr getelementptr ({ fp128, fp128 }, ptr null, i32 1) to i64), i32 {{[0-9]*}}, i8 0, i8 38, i8 0, i8 0 }, ptr %{{[0-9]*}}, 0 + ! CHECK-KIND16: call i1 @_FortranAioOutputDescriptor(ptr %{{[0-9]*}}, ptr %{{[0-9]*}}) print "(z32,' ',z32)", c16 -end +end subroutine diff --git flang/test/Lower/real-operations-1.f90 flang/test/Lower/real-operations-1.f90 index 137d0b5264c2..d155b7dc9570 100644 --- flang/test/Lower/real-operations-1.f90 +++ flang/test/Lower/real-operations-1.f90 @@ -1,13 +1,14 @@ -! RUN: bbc -hlfir=false %s -o - | FileCheck %s - +! RUN: bbc -emit-hlfir %s -o - | FileCheck %s --check-prefixes=CHECK%if target=x86_64{{.*}} %{,CHECK-KIND10%}%if flang-supports-f128-math %{,CHECK-KIND16%} ! Test real add on real kinds. ! CHECK-LABEL: real2 REAL(2) FUNCTION real2(x0, x1) REAL(2) :: x0 REAL(2) :: x1 - ! CHECK-DAG: %[[v1:.+]] = fir.load %arg0 : !fir.ref<f16> - ! CHECK-DAG: %[[v2:.+]] = fir.load %arg1 : !fir.ref<f16> + ! CHECK: %[[x0:.*]]:2 = hlfir.declare{{.*}}x0" + ! CHECK: %[[x1:.*]]:2 = hlfir.declare{{.*}}x1" + ! CHECK-DAG: %[[v1:.+]] = fir.load %[[x0]]#0 : !fir.ref<f16> + ! CHECK-DAG: %[[v2:.+]] = fir.load %[[x1]]#0 : !fir.ref<f16> ! CHECK: %[[v3:.+]] = arith.addf %[[v1]], %[[v2]] {{.*}}: f16 real2 = x0 + x1 ! CHECK: return %{{.*}} : f16 @@ -17,8 +18,10 @@ END FUNCTION real2 REAL(3) FUNCTION real3(x0, x1) REAL(3) :: x0 REAL(3) :: x1 - ! CHECK-DAG: %[[v1:.+]] = fir.load %arg0 : !fir.ref<bf16> - ! CHECK-DAG: %[[v2:.+]] = fir.load %arg1 : !fir.ref<bf16> + ! CHECK: %[[x0:.*]]:2 = hlfir.declare{{.*}}x0" + ! CHECK: %[[x1:.*]]:2 = hlfir.declare{{.*}}x1" + ! CHECK-DAG: %[[v1:.+]] = fir.load %[[x0]]#0 : !fir.ref<bf16> + ! CHECK-DAG: %[[v2:.+]] = fir.load %[[x1]]#0 : !fir.ref<bf16> ! CHECK: %[[v3:.+]] = arith.addf %[[v1]], %[[v2]] {{.*}}: bf16 real3 = x0 + x1 ! CHECK: return %{{.*}} : bf16 @@ -28,8 +31,10 @@ END FUNCTION real3 REAL(4) FUNCTION real4(x0, x1) REAL(4) :: x0 REAL(4) :: x1 - ! CHECK-DAG: %[[v1:.+]] = fir.load %arg0 : !fir.ref<f32> - ! CHECK-DAG: %[[v2:.+]] = fir.load %arg1 : !fir.ref<f32> + ! CHECK: %[[x0:.*]]:2 = hlfir.declare{{.*}}x0" + ! CHECK: %[[x1:.*]]:2 = hlfir.declare{{.*}}x1" + ! CHECK-DAG: %[[v1:.+]] = fir.load %[[x0]]#0 : !fir.ref<f32> + ! CHECK-DAG: %[[v2:.+]] = fir.load %[[x1]]#0 : !fir.ref<f32> ! CHECK: %[[v3:.+]] = arith.addf %[[v1]], %[[v2]] {{.*}}: f32 real4 = x0 + x1 ! CHECK: return %{{.*}} : f32 @@ -39,8 +44,10 @@ END FUNCTION real4 REAL FUNCTION defreal(x0, x1) REAL :: x0 REAL :: x1 - ! CHECK-DAG: %[[v1:.+]] = fir.load %arg0 : !fir.ref<f32> - ! CHECK-DAG: %[[v2:.+]] = fir.load %arg1 : !fir.ref<f32> + ! CHECK: %[[x0:.*]]:2 = hlfir.declare{{.*}}x0" + ! CHECK: %[[x1:.*]]:2 = hlfir.declare{{.*}}x1" + ! CHECK-DAG: %[[v1:.+]] = fir.load %[[x0]]#0 : !fir.ref<f32> + ! CHECK-DAG: %[[v2:.+]] = fir.load %[[x1]]#0 : !fir.ref<f32> ! 
CHECK: %[[v3:.+]] = arith.addf %[[v1]], %[[v2]] {{.*}}: f32 defreal = x0 + x1 ! CHECK: return %{{.*}} : f32 @@ -50,8 +57,10 @@ END FUNCTION defreal REAL(8) FUNCTION real8(x0, x1) REAL(8) :: x0 REAL(8) :: x1 - ! CHECK-DAG: %[[v1:.+]] = fir.load %arg0 : !fir.ref<f64> - ! CHECK-DAG: %[[v2:.+]] = fir.load %arg1 : !fir.ref<f64> + ! CHECK: %[[x0:.*]]:2 = hlfir.declare{{.*}}x0" + ! CHECK: %[[x1:.*]]:2 = hlfir.declare{{.*}}x1" + ! CHECK-DAG: %[[v1:.+]] = fir.load %[[x0]]#0 : !fir.ref<f64> + ! CHECK-DAG: %[[v2:.+]] = fir.load %[[x1]]#0 : !fir.ref<f64> ! CHECK: %[[v3:.+]] = arith.addf %[[v1]], %[[v2]] {{.*}}: f64 real8 = x0 + x1 ! CHECK: return %{{.*}} : f64 @@ -61,44 +70,58 @@ END FUNCTION real8 DOUBLE PRECISION FUNCTION doubleprec(x0, x1) DOUBLE PRECISION :: x0 DOUBLE PRECISION :: x1 - ! CHECK-DAG: %[[v1:.+]] = fir.load %arg0 : !fir.ref<f64> - ! CHECK-DAG: %[[v2:.+]] = fir.load %arg1 : !fir.ref<f64> + ! CHECK: %[[x0:.*]]:2 = hlfir.declare{{.*}}x0" + ! CHECK: %[[x1:.*]]:2 = hlfir.declare{{.*}}x1" + ! CHECK-DAG: %[[v1:.+]] = fir.load %[[x0]]#0 : !fir.ref<f64> + ! CHECK-DAG: %[[v2:.+]] = fir.load %[[x1]]#0 : !fir.ref<f64> ! CHECK: %[[v3:.+]] = arith.addf %[[v1]], %[[v2]] {{.*}}: f64 doubleprec = x0 + x1 ! CHECK: return %{{.*}} : f64 END FUNCTION doubleprec -! CHECK-LABEL: real10 -REAL(10) FUNCTION real10(x0, x1) - REAL(10) :: x0 - REAL(10) :: x1 - ! CHECK-DAG: %[[v1:.+]] = fir.load %arg0 : !fir.ref<f80> - ! CHECK-DAG: %[[v2:.+]] = fir.load %arg1 : !fir.ref<f80> - ! CHECK: %[[v3:.+]] = arith.addf %[[v1]], %[[v2]] {{.*}}: f80 +! CHECK-KIND10-LABEL: real10 +FUNCTION real10(x0, x1) + INTEGER, PARAMETER :: kind10 = merge(10, 4, selected_real_kind(p=18).eq.10) + REAL(kind10) :: real10 + REAL(kind10) :: x0 + REAL(kind10) :: x1 + ! CHECK-KIND10: %[[x0:.*]]:2 = hlfir.declare{{.*}}x0" + ! CHECK-KIND10: %[[x1:.*]]:2 = hlfir.declare{{.*}}x1" + ! CHECK-KIND10-DAG: %[[v1:.+]] = fir.load %[[x0]]#0 : !fir.ref<f80> + ! CHECK-KIND10-DAG: %[[v2:.+]] = fir.load %[[x1]]#0 : !fir.ref<f80> + ! CHECK-KIND10: %[[v3:.+]] = arith.addf %[[v1]], %[[v2]] {{.*}}: f80 real10 = x0 + x1 - ! CHECK: return %{{.*}} : f80 + ! CHECK-KIND10: return %{{.*}} : f80 END FUNCTION real10 -! CHECK-LABEL: real16 -REAL(16) FUNCTION real16(x0, x1) - REAL(16) :: x0 - REAL(16) :: x1 - ! CHECK-DAG: %[[v1:.+]] = fir.load %arg0 : !fir.ref<f128> - ! CHECK-DAG: %[[v2:.+]] = fir.load %arg1 : !fir.ref<f128> - ! CHECK: %[[v3:.+]] = arith.addf %[[v1]], %[[v2]] {{.*}}: f128 +! CHECK-KIND16-LABEL: real16( +FUNCTION real16(x0, x1) + INTEGER, PARAMETER :: kind16 = merge(16, 4, selected_real_kind(p=33).eq.16) + REAL(kind16) :: real16 + REAL(kind16) :: x0 + REAL(kind16) :: x1 + ! CHECK-KIND16: %[[x0:.*]]:2 = hlfir.declare{{.*}}x0" + ! CHECK-KIND16: %[[x1:.*]]:2 = hlfir.declare{{.*}}x1" + ! CHECK-KIND16-DAG: %[[v1:.+]] = fir.load %[[x0]]#0 : !fir.ref<f128> + ! CHECK-KIND16-DAG: %[[v2:.+]] = fir.load %[[x1]]#0 : !fir.ref<f128> + ! CHECK-KIND16: %[[v3:.+]] = arith.addf %[[v1]], %[[v2]] {{.*}}: f128 real16 = x0 + x1 - ! CHECK: return %{{.*}} : f128 + ! CHECK-KIND16: return %{{.*}} : f128 END FUNCTION real16 -! CHECK-LABEL: real16b -REAL(16) FUNCTION real16b(x0, x1) - REAL(16) :: x0 - REAL(16) :: x1 - ! CHECK-DAG: %[[v0:.+]] = arith.constant 4.0{{.*}} : f128 - ! CHECK-DAG: %[[v1:.+]] = fir.load %arg0 : !fir.ref<f128> - ! CHECK-DAG: %[[v2:.+]] = fir.load %arg1 : !fir.ref<f128> - ! CHECK: %[[v3:.+]] = arith.addf %[[v1]], %[[v2]] {{.*}}: f128 - ! CHECK: %[[v4:.+]] = arith.subf %[[v3]], %[[v0]] {{.*}}: f128 +! 
CHECK-KIND16-LABEL: real16b +FUNCTION real16b(x0, x1) + INTEGER, PARAMETER :: kind16 = merge(16, 4, selected_real_kind(p=33).eq.16) + REAL(kind16) :: real16b + REAL(kind16) :: x0 + REAL(kind16) :: x1 + ! CHECK-KIND16: %[[x0:.*]]:2 = hlfir.declare{{.*}}x0" + ! CHECK-KIND16: %[[x1:.*]]:2 = hlfir.declare{{.*}}x1" + ! CHECK-KIND16-DAG: %[[v1:.+]] = fir.load %[[x0]]#0 : !fir.ref<f128> + ! CHECK-KIND16-DAG: %[[v2:.+]] = fir.load %[[x1]]#0 : !fir.ref<f128> + ! CHECK-KIND16-DAG: %[[v3:.+]] = arith.addf %[[v1]], %[[v2]] {{.*}}: f128 + ! CHECK-KIND16-DAG: %[[v0:.+]] = arith.constant 4.0{{.*}} : f128 + ! CHECK-KIND16: %[[v4:.+]] = arith.subf %[[v3]], %[[v0]] {{.*}}: f128 real16b = x0 + x1 - 4.0_16 - ! CHECK: return %{{.*}} : f128 + ! CHECK-KIND16: return %{{.*}} : f128 END FUNCTION real16b diff --git flang/test/Parser/OpenMP/declare-mapper-unparse.f90 flang/test/Parser/OpenMP/declare-mapper-unparse.f90 index 5ba147d20955..407bfd29153f 100644 --- flang/test/Parser/OpenMP/declare-mapper-unparse.f90 +++ flang/test/Parser/OpenMP/declare-mapper-unparse.f90 @@ -13,7 +13,7 @@ program main !$omp declare mapper(mymapper : ty :: mapped) map(mapped, mapped%x) !PARSE-TREE: OpenMPDeclareMapperConstruct -!PARSE-TREE: OmpDeclareMapperSpecifier +!PARSE-TREE: OmpMapperSpecifier !PARSE-TREE: Name = 'mymapper' !PARSE-TREE: TypeSpec -> DerivedTypeSpec !PARSE-TREE: Name = 'ty' @@ -28,7 +28,7 @@ program main !$omp declare mapper(ty :: mapped) map(mapped, mapped%x) !PARSE-TREE: OpenMPDeclareMapperConstruct -!PARSE-TREE: OmpDeclareMapperSpecifier +!PARSE-TREE: OmpMapperSpecifier !PARSE-TREE: TypeSpec -> DerivedTypeSpec !PARSE-TREE: Name = 'ty' !PARSE-TREE: Name = 'mapped' diff --git flang/test/Parser/OpenMP/metadirective-dirspec.f90 flang/test/Parser/OpenMP/metadirective-dirspec.f90 new file mode 100644 index 000000000000..73520c41fe77 --- /dev/null +++ flang/test/Parser/OpenMP/metadirective-dirspec.f90 @@ -0,0 +1,242 @@ +!RUN: %flang_fc1 -fdebug-unparse -fopenmp -fopenmp-version=52 %s | FileCheck --ignore-case --check-prefix="UNPARSE" %s +!RUN: %flang_fc1 -fdebug-dump-parse-tree -fopenmp -fopenmp-version=52 %s | FileCheck --check-prefix="PARSE-TREE" %s + +!Directive specification where directives have arguments + +subroutine f00(x) + integer :: x(10) + !$omp metadirective when(user={condition(.true.)}: & + !$omp & allocate(x)) +end + +!UNPARSE: SUBROUTINE f00 (x) +!UNPARSE: INTEGER x(10_4) +!UNPARSE: !$OMP METADIRECTIVE WHEN(USER={CONDITION(.true._4)}: ALLOCATE(x)) +!UNPARSE: END SUBROUTINE + +!PARSE-TREE: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OmpMetadirectiveDirective +!PARSE-TREE: | OmpClauseList -> OmpClause -> When -> OmpWhenClause +!PARSE-TREE: | | Modifier -> OmpContextSelectorSpecification -> OmpTraitSetSelector +!PARSE-TREE: | | | OmpTraitSetSelectorName -> Value = User +!PARSE-TREE: | | | OmpTraitSelector +!PARSE-TREE: | | | | OmpTraitSelectorName -> Value = Condition +!PARSE-TREE: | | | | Properties +!PARSE-TREE: | | | | | OmpTraitProperty -> Scalar -> Expr = '.true._4' +!PARSE-TREE: | | | | | | LiteralConstant -> LogicalLiteralConstant +!PARSE-TREE: | | | | | | | bool = 'true' +!PARSE-TREE: | | OmpDirectiveSpecification +!PARSE-TREE: | | | llvm::omp::Directive = allocate +!PARSE-TREE: | | | OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'x' +!PARSE-TREE: | | | OmpClauseList -> + +subroutine f01(x) + integer :: x + !$omp metadirective when(user={condition(.true.)}: & + !$omp & critical(x)) +end + +!UNPARSE: SUBROUTINE f01 (x) +!UNPARSE: INTEGER 
x +!UNPARSE: !$OMP METADIRECTIVE WHEN(USER={CONDITION(.true._4)}: CRITICAL(x)) +!UNPARSE: END SUBROUTINE + +!PARSE-TREE: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OmpMetadirectiveDirective +!PARSE-TREE: | OmpClauseList -> OmpClause -> When -> OmpWhenClause +!PARSE-TREE: | | Modifier -> OmpContextSelectorSpecification -> OmpTraitSetSelector +!PARSE-TREE: | | | OmpTraitSetSelectorName -> Value = User +!PARSE-TREE: | | | OmpTraitSelector +!PARSE-TREE: | | | | OmpTraitSelectorName -> Value = Condition +!PARSE-TREE: | | | | Properties +!PARSE-TREE: | | | | | OmpTraitProperty -> Scalar -> Expr = '.true._4' +!PARSE-TREE: | | | | | | LiteralConstant -> LogicalLiteralConstant +!PARSE-TREE: | | | | | | | bool = 'true' +!PARSE-TREE: | | OmpDirectiveSpecification +!PARSE-TREE: | | | llvm::omp::Directive = critical +!PARSE-TREE: | | | OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'x' +!PARSE-TREE: | | | OmpClauseList -> + +subroutine f02 + !$omp metadirective when(user={condition(.true.)}: & + !$omp & declare mapper(mymapper : integer :: v) map(tofrom: v)) +end + +!UNPARSE: SUBROUTINE f02 +!UNPARSE: !$OMP METADIRECTIVE WHEN(USER={CONDITION(.true._4)}: DECLARE MAPPER(mymapper:INTEGER:& +!UNPARSE: !$OMP&:v) MAP(TOFROM: v)) +!UNPARSE: END SUBROUTINE + +!PARSE-TREE: OpenMPDeclarativeConstruct -> OmpMetadirectiveDirective +!PARSE-TREE: | OmpClauseList -> OmpClause -> When -> OmpWhenClause +!PARSE-TREE: | | Modifier -> OmpContextSelectorSpecification -> OmpTraitSetSelector +!PARSE-TREE: | | | OmpTraitSetSelectorName -> Value = User +!PARSE-TREE: | | | OmpTraitSelector +!PARSE-TREE: | | | | OmpTraitSelectorName -> Value = Condition +!PARSE-TREE: | | | | Properties +!PARSE-TREE: | | | | | OmpTraitProperty -> Scalar -> Expr = '.true._4' +!PARSE-TREE: | | | | | | LiteralConstant -> LogicalLiteralConstant +!PARSE-TREE: | | | | | | | bool = 'true' +!PARSE-TREE: | | OmpDirectiveSpecification +!PARSE-TREE: | | | llvm::omp::Directive = declare mapper +!PARSE-TREE: | | | OmpArgument -> OmpMapperSpecifier +!PARSE-TREE: | | | | Name = 'mymapper' +!PARSE-TREE: | | | | TypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec -> +!PARSE-TREE: | | | | Name = 'v' +!PARSE-TREE: | | | OmpClauseList -> OmpClause -> Map -> OmpMapClause +!PARSE-TREE: | | | | Modifier -> OmpMapType -> Value = Tofrom +!PARSE-TREE: | | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'v' +!PARSE-TREE: | | | | bool = 'true' +!PARSE-TREE: ImplicitPart -> + +subroutine f03 + type :: tt1 + integer :: x + endtype + type :: tt2 + real :: a + endtype + !$omp metadirective when(user={condition(.true.)}: & + !$omp & declare reduction(+ : tt1, tt2 : omp_out = omp_in + omp_out)) +end + +!UNPARSE: SUBROUTINE f03 +!UNPARSE: TYPE :: tt1 +!UNPARSE: INTEGER :: x +!UNPARSE: END TYPE +!UNPARSE: TYPE :: tt2 +!UNPARSE: REAL :: a +!UNPARSE: END TYPE +!UNPARSE: !$OMP METADIRECTIVE WHEN(USER={CONDITION(.true._4)}: DECLARE REDUCTION(+:tt1,tt2: omp_out=omp_in+omp_out +!UNPARSE: )) +!UNPARSE: END SUBROUTINE + +!PARSE-TREE: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OmpMetadirectiveDirective +!PARSE-TREE: | OmpClauseList -> OmpClause -> When -> OmpWhenClause +!PARSE-TREE: | | Modifier -> OmpContextSelectorSpecification -> OmpTraitSetSelector +!PARSE-TREE: | | | OmpTraitSetSelectorName -> Value = User +!PARSE-TREE: | | | OmpTraitSelector +!PARSE-TREE: | | | | OmpTraitSelectorName -> Value = Condition +!PARSE-TREE: | | | | Properties +!PARSE-TREE: | | | | | 
OmpTraitProperty -> Scalar -> Expr = '.true._4' +!PARSE-TREE: | | | | | | LiteralConstant -> LogicalLiteralConstant +!PARSE-TREE: | | | | | | | bool = 'true' +!PARSE-TREE: | | OmpDirectiveSpecification +!PARSE-TREE: | | | llvm::omp::Directive = declare reduction +!PARSE-TREE: | | | OmpArgument -> OmpReductionSpecifier +!PARSE-TREE: | | | | OmpReductionIdentifier -> DefinedOperator -> IntrinsicOperator = Add +!PARSE-TREE: | | | | OmpTypeNameList -> OmpTypeSpecifier -> TypeSpec -> DerivedTypeSpec +!PARSE-TREE: | | | | | Name = 'tt1' +!PARSE-TREE: | | | | OmpTypeSpecifier -> TypeSpec -> DerivedTypeSpec +!PARSE-TREE: | | | | | Name = 'tt2' +!PARSE-TREE: | | | | OmpReductionCombiner -> AssignmentStmt = 'omp_out=omp_in+omp_out' +!PARSE-TREE: | | | | | Variable = 'omp_out' +!PARSE-TREE: | | | | | | Designator -> DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | | | Expr = 'omp_in+omp_out' +!PARSE-TREE: | | | | | | Add +!PARSE-TREE: | | | | | | | Expr = 'omp_in' +!PARSE-TREE: | | | | | | | | Designator -> DataRef -> Name = 'omp_in' +!PARSE-TREE: | | | | | | | Expr = 'omp_out' +!PARSE-TREE: | | | | | | | | Designator -> DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | OmpClauseList -> + +subroutine f04 + !$omp metadirective when(user={condition(.true.)}: & + !$omp & declare simd(f04)) +end + +!UNPARSE: SUBROUTINE f04 +!UNPARSE: !$OMP METADIRECTIVE WHEN(USER={CONDITION(.true._4)}: DECLARE SIMD(f04)) +!UNPARSE: END SUBROUTINE + +!PARSE-TREE: OpenMPDeclarativeConstruct -> OmpMetadirectiveDirective +!PARSE-TREE: | OmpClauseList -> OmpClause -> When -> OmpWhenClause +!PARSE-TREE: | | Modifier -> OmpContextSelectorSpecification -> OmpTraitSetSelector +!PARSE-TREE: | | | OmpTraitSetSelectorName -> Value = User +!PARSE-TREE: | | | OmpTraitSelector +!PARSE-TREE: | | | | OmpTraitSelectorName -> Value = Condition +!PARSE-TREE: | | | | Properties +!PARSE-TREE: | | | | | OmpTraitProperty -> Scalar -> Expr = '.true._4' +!PARSE-TREE: | | | | | | LiteralConstant -> LogicalLiteralConstant +!PARSE-TREE: | | | | | | | bool = 'true' +!PARSE-TREE: | | OmpDirectiveSpecification +!PARSE-TREE: | | | llvm::omp::Directive = declare simd +!PARSE-TREE: | | | OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'f04' +!PARSE-TREE: | | | OmpClauseList -> +!PARSE-TREE: ImplicitPart -> + +subroutine f05 + !$omp metadirective when(user={condition(.true.)}: & + !$omp & declare target(f05)) +end + +!UNPARSE: SUBROUTINE f05 +!UNPARSE: !$OMP METADIRECTIVE WHEN(USER={CONDITION(.true._4)}: DECLARE TARGET(f05)) +!UNPARSE: END SUBROUTINE + +!PARSE-TREE: OpenMPDeclarativeConstruct -> OmpMetadirectiveDirective +!PARSE-TREE: | OmpClauseList -> OmpClause -> When -> OmpWhenClause +!PARSE-TREE: | | Modifier -> OmpContextSelectorSpecification -> OmpTraitSetSelector +!PARSE-TREE: | | | OmpTraitSetSelectorName -> Value = User +!PARSE-TREE: | | | OmpTraitSelector +!PARSE-TREE: | | | | OmpTraitSelectorName -> Value = Condition +!PARSE-TREE: | | | | Properties +!PARSE-TREE: | | | | | OmpTraitProperty -> Scalar -> Expr = '.true._4' +!PARSE-TREE: | | | | | | LiteralConstant -> LogicalLiteralConstant +!PARSE-TREE: | | | | | | | bool = 'true' +!PARSE-TREE: | | OmpDirectiveSpecification +!PARSE-TREE: | | | llvm::omp::Directive = declare target +!PARSE-TREE: | | | OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'f05' +!PARSE-TREE: | | | OmpClauseList -> +!PARSE-TREE: ImplicitPart -> + +subroutine f06(x, y) + integer :: x, y + !$omp metadirective when(user={condition(.true.)}: & + !$omp & flush(x, y)) +end + 
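+! Editorial note: unlike the DECLARE MAPPER and DECLARE REDUCTION cases above,
+! which parse their arguments as OmpMapperSpecifier and OmpReductionSpecifier,
+! FLUSH takes a plain object list, so each name is expected to parse as
+! OmpArgument -> OmpLocator (see the PARSE-TREE checks below).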
+!UNPARSE: SUBROUTINE f06 (x, y) +!UNPARSE: INTEGER x, y +!UNPARSE: !$OMP METADIRECTIVE WHEN(USER={CONDITION(.true._4)}: FLUSH(x, y)) +!UNPARSE: END SUBROUTINE + +!PARSE-TREE: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OmpMetadirectiveDirective +!PARSE-TREE: | OmpClauseList -> OmpClause -> When -> OmpWhenClause +!PARSE-TREE: | | Modifier -> OmpContextSelectorSpecification -> OmpTraitSetSelector +!PARSE-TREE: | | | OmpTraitSetSelectorName -> Value = User +!PARSE-TREE: | | | OmpTraitSelector +!PARSE-TREE: | | | | OmpTraitSelectorName -> Value = Condition +!PARSE-TREE: | | | | Properties +!PARSE-TREE: | | | | | OmpTraitProperty -> Scalar -> Expr = '.true._4' +!PARSE-TREE: | | | | | | LiteralConstant -> LogicalLiteralConstant +!PARSE-TREE: | | | | | | | bool = 'true' +!PARSE-TREE: | | OmpDirectiveSpecification +!PARSE-TREE: | | | llvm::omp::Directive = flush +!PARSE-TREE: | | | OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'x' +!PARSE-TREE: | | | OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'y' +!PARSE-TREE: | | | OmpClauseList -> + +subroutine f07 + integer :: t + !$omp metadirective when(user={condition(.true.)}: & + !$omp & threadprivate(t)) +end + +!UNPARSE: SUBROUTINE f07 +!UNPARSE: INTEGER t +!UNPARSE: !$OMP METADIRECTIVE WHEN(USER={CONDITION(.true._4)}: THREADPRIVATE(t)) +!UNPARSE: END SUBROUTINE + +!PARSE-TREE: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OmpMetadirectiveDirective +!PARSE-TREE: | OmpClauseList -> OmpClause -> When -> OmpWhenClause +!PARSE-TREE: | | Modifier -> OmpContextSelectorSpecification -> OmpTraitSetSelector +!PARSE-TREE: | | | OmpTraitSetSelectorName -> Value = User +!PARSE-TREE: | | | OmpTraitSelector +!PARSE-TREE: | | | | OmpTraitSelectorName -> Value = Condition +!PARSE-TREE: | | | | Properties +!PARSE-TREE: | | | | | OmpTraitProperty -> Scalar -> Expr = '.true._4' +!PARSE-TREE: | | | | | | LiteralConstant -> LogicalLiteralConstant +!PARSE-TREE: | | | | | | | bool = 'true' +!PARSE-TREE: | | OmpDirectiveSpecification +!PARSE-TREE: | | | llvm::omp::Directive = threadprivate +!PARSE-TREE: | | | OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 't' +!PARSE-TREE: | | | OmpClauseList -> diff --git flang/test/Parser/OpenMP/metadirective-v50.f90 flang/test/Parser/OpenMP/metadirective-v50.f90 index 73d5077da3d9..d7c3121b8f1b 100644 --- flang/test/Parser/OpenMP/metadirective-v50.f90 +++ flang/test/Parser/OpenMP/metadirective-v50.f90 @@ -2,12 +2,14 @@ !RUN: %flang_fc1 -fdebug-dump-parse-tree -fopenmp -fopenmp-version=50 %s | FileCheck --check-prefix="PARSE-TREE" %s subroutine f01 + continue !$omp metadirective & !$omp & when(user={condition(.true.)}: nothing) & !$omp & default(nothing) end !UNPARSE: SUBROUTINE f01 +!UNPARSE: CONTINUE !UNPARSE: !$OMP METADIRECTIVE WHEN(USER={CONDITION(.true._4)}: NOTHING) DEFAULT(NOTHING) !UNPARSE: END SUBROUTINE diff --git flang/test/Parser/OpenMP/metadirective.f90 flang/test/Parser/OpenMP/metadirective.f90 index af6c3bbefacf..dce31c2e7db2 100644 --- flang/test/Parser/OpenMP/metadirective.f90 +++ flang/test/Parser/OpenMP/metadirective.f90 @@ -2,10 +2,12 @@ !RUN: %flang_fc1 -fdebug-dump-parse-tree -fopenmp -fopenmp-version=52 %s | FileCheck --check-prefix="PARSE-TREE" %s subroutine f00 + continue !$omp metadirective when(construct={target, parallel}: nothing) end !UNPARSE: SUBROUTINE f00 +!UNPARSE: CONTINUE !UNPARSE: !$OMP METADIRECTIVE WHEN(CONSTRUCT={TARGET, PARALLEL}: NOTHING) 
!UNPARSE: END SUBROUTINE @@ -22,17 +24,19 @@ end !PARSE-TREE: | | | OmpClauseList -> subroutine f01 - !$omp metadirective when(device={kind(host), device_num(1)}: nothing) + continue + !$omp metadirective when(target_device={kind(host), device_num(1)}: nothing) end !UNPARSE: SUBROUTINE f01 -!UNPARSE: !$OMP METADIRECTIVE WHEN(DEVICE={KIND(host), DEVICE_NUM(1_4)}: NOTHING) +!UNPARSE: CONTINUE +!UNPARSE: !$OMP METADIRECTIVE WHEN(TARGET_DEVICE={KIND(host), DEVICE_NUM(1_4)}: NOTHING) !UNPARSE: END SUBROUTINE !PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPStandaloneConstruct -> OmpMetadirectiveDirective !PARSE-TREE: | OmpClauseList -> OmpClause -> When -> OmpWhenClause !PARSE-TREE: | | Modifier -> OmpContextSelectorSpecification -> OmpTraitSetSelector -!PARSE-TREE: | | | OmpTraitSetSelectorName -> Value = Device +!PARSE-TREE: | | | OmpTraitSetSelectorName -> Value = Target_Device !PARSE-TREE: | | | OmpTraitSelector !PARSE-TREE: | | | | OmpTraitSelectorName -> Value = Kind !PARSE-TREE: | | | | Properties @@ -47,10 +51,12 @@ end !PARSE-TREE: | | | OmpClauseList -> subroutine f02 + continue !$omp metadirective when(target_device={kind(any), device_num(7)}: nothing) end !UNPARSE: SUBROUTINE f02 +!UNPARSE: CONTINUE !UNPARSE: !$OMP METADIRECTIVE WHEN(TARGET_DEVICE={KIND(any), DEVICE_NUM(7_4)}: NOTHING) !UNPARSE: END SUBROUTINE @@ -72,11 +78,13 @@ end !PARSE-TREE: | | | OmpClauseList -> subroutine f03 + continue !$omp metadirective & !$omp & when(implementation={atomic_default_mem_order(acq_rel)}: nothing) end !UNPARSE: SUBROUTINE f03 +!UNPARSE: CONTINUE !UNPARSE: !$OMP METADIRECTIVE WHEN(IMPLEMENTATION={ATOMIC_DEFAULT_MEM_ORDER(ACQ_REL)}: & !UNPARSE: !$OMP&NOTHING) !UNPARSE: END SUBROUTINE @@ -94,13 +102,15 @@ end !PARSE-TREE: | | | OmpClauseList -> subroutine f04 + continue !$omp metadirective & - !$omp & when(implementation={extension(haha(1), foo(baz, "bar"(1)))}: nothing) + !$omp when(implementation={extension_trait(haha(1), foo(baz, "bar"(1)))}: nothing) end !UNPARSE: SUBROUTINE f04 -!UNPARSE: !$OMP METADIRECTIVE WHEN(IMPLEMENTATION={EXTENSION(haha(1_4), foo(baz,bar(1_4)))}: & -!UNPARSE: !$OMP&NOTHING) +!UNPARSE: CONTINUE +!UNPARSE: !$OMP METADIRECTIVE WHEN(IMPLEMENTATION={extension_trait(haha(1_4), foo(baz,bar(1_4& +!UNPARSE: !$OMP&)))}: NOTHING) !UNPARSE: END SUBROUTINE !PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPStandaloneConstruct -> OmpMetadirectiveDirective @@ -108,7 +118,7 @@ end !PARSE-TREE: | | Modifier -> OmpContextSelectorSpecification -> OmpTraitSetSelector !PARSE-TREE: | | | OmpTraitSetSelectorName -> Value = Implementation !PARSE-TREE: | | | OmpTraitSelector -!PARSE-TREE: | | | | OmpTraitSelectorName -> Value = Extension +!PARSE-TREE: | | | | OmpTraitSelectorName -> string = 'extension_trait' !PARSE-TREE: | | | | Properties !PARSE-TREE: | | | | | OmpTraitProperty -> OmpTraitPropertyExtension -> Complex !PARSE-TREE: | | | | | | OmpTraitPropertyName -> string = 'haha' @@ -127,6 +137,7 @@ end subroutine f05(x) integer :: x + continue !$omp metadirective & !$omp & when(user={condition(score(100): .true.)}: & !$omp & parallel do reduction(+: x)) & @@ -137,6 +148,7 @@ end !UNPARSE: SUBROUTINE f05 (x) !UNPARSE: INTEGER x +!UNPARSE: CONTINUE !UNPARSE: !$OMP METADIRECTIVE WHEN(USER={CONDITION(SCORE(100_4): .true._4)}: PARALLEL DO REDUCTION(+& !UNPARSE: !$OMP&: x)) OTHERWISE(NOTHING) !UNPARSE: DO i=1_4,10_4 @@ -165,6 +177,7 @@ end !PARSE-TREE: | | OmpClauseList -> subroutine f06 + continue ! 
Two trait set selectors !$omp metadirective & !$omp & when(implementation={vendor("amd")}, & @@ -172,6 +185,7 @@ subroutine f06 end !UNPARSE: SUBROUTINE f06 +!UNPARSE: CONTINUE !UNPARSE: !$OMP METADIRECTIVE WHEN(IMPLEMENTATION={VENDOR(amd)}, USER={CONDITION(.true._4)}: NO& !UNPARSE: !$OMP&THING) !UNPARSE: END SUBROUTINE @@ -196,3 +210,42 @@ end !PARSE-TREE: | | | llvm::omp::Directive = nothing !PARSE-TREE: | | | OmpClauseList -> +subroutine f07 + ! Declarative metadirective + !$omp metadirective & + !$omp & when(implementation={vendor("amd")}: declare simd) & + !$omp & when(user={condition(.true.)}: declare target) & + !$omp & otherwise(nothing) +end + +!UNPARSE: SUBROUTINE f07 +!UNPARSE: !$OMP METADIRECTIVE WHEN(IMPLEMENTATION={VENDOR(amd)}: DECLARE SIMD) WHEN(USE& +!UNPARSE: !$OMP&R={CONDITION(.true._4)}: DECLARE TARGET) OTHERWISE(NOTHING) +!UNPARSE: END SUBROUTINE + +!PARSE-TREE: OpenMPDeclarativeConstruct -> OmpMetadirectiveDirective +!PARSE-TREE: | OmpClauseList -> OmpClause -> When -> OmpWhenClause +!PARSE-TREE: | | Modifier -> OmpContextSelectorSpecification -> OmpTraitSetSelector +!PARSE-TREE: | | | OmpTraitSetSelectorName -> Value = Implementation +!PARSE-TREE: | | | OmpTraitSelector +!PARSE-TREE: | | | | OmpTraitSelectorName -> Value = Vendor +!PARSE-TREE: | | | | Properties +!PARSE-TREE: | | | | | OmpTraitProperty -> OmpTraitPropertyName -> string = 'amd' +!PARSE-TREE: | | OmpDirectiveSpecification +!PARSE-TREE: | | | llvm::omp::Directive = declare simd +!PARSE-TREE: | | | OmpClauseList -> +!PARSE-TREE: | OmpClause -> When -> OmpWhenClause +!PARSE-TREE: | | Modifier -> OmpContextSelectorSpecification -> OmpTraitSetSelector +!PARSE-TREE: | | | OmpTraitSetSelectorName -> Value = User +!PARSE-TREE: | | | OmpTraitSelector +!PARSE-TREE: | | | | OmpTraitSelectorName -> Value = Condition +!PARSE-TREE: | | | | Properties +!PARSE-TREE: | | | | | OmpTraitProperty -> Scalar -> Expr = '.true._4' +!PARSE-TREE: | | | | | | LiteralConstant -> LogicalLiteralConstant +!PARSE-TREE: | | | | | | | bool = 'true' +!PARSE-TREE: | | OmpDirectiveSpecification +!PARSE-TREE: | | | llvm::omp::Directive = declare target +!PARSE-TREE: | | | OmpClauseList -> +!PARSE-TREE: | OmpClause -> Otherwise -> OmpOtherwiseClause -> OmpDirectiveSpecification +!PARSE-TREE: | | llvm::omp::Directive = nothing +!PARSE-TREE: | | OmpClauseList -> \ No newline at end of file diff --git flang/test/Semantics/OpenMP/metadirective-common.f90 flang/test/Semantics/OpenMP/metadirective-common.f90 new file mode 100644 index 000000000000..4988fae9e8ed --- /dev/null +++ flang/test/Semantics/OpenMP/metadirective-common.f90 @@ -0,0 +1,37 @@ +!RUN: %python %S/../test_errors.py %s %flang -fopenmp -fopenmp-version=52 + +! 
Common context selector tests + +subroutine f00 + !$omp metadirective & + !$omp & when(implementation={vendor("this")}, & +!ERROR: Repeated trait set name IMPLEMENTATION in a context specifier + !$omp & implementation={requires(unified_shared_memory)}: nothing) +end + +subroutine f01 + !$omp metadirective & +!ERROR: Repeated trait name ISA in a trait set + !$omp & when(device={isa("this"), isa("that")}: nothing) +end + +subroutine f02 + !$omp metadirective & +!ERROR: SCORE expression must be a non-negative constant integer expression + !$omp & when(user={condition(score(-2): .true.)}: nothing) +end + +subroutine f03(x) + integer :: x + !$omp metadirective & +!ERROR: SCORE expression must be a non-negative constant integer expression + !$omp & when(user={condition(score(x): .true.)}: nothing) +end + +subroutine f04 + !$omp metadirective & +!ERROR: Trait property should be a scalar expression +!ERROR: More invalid properties are present + !$omp & when(target_device={device_num("device", "foo"(1))}: nothing) +end + diff --git flang/test/Semantics/OpenMP/metadirective-construct.f90 flang/test/Semantics/OpenMP/metadirective-construct.f90 new file mode 100644 index 000000000000..1dd23b1dca67 --- /dev/null +++ flang/test/Semantics/OpenMP/metadirective-construct.f90 @@ -0,0 +1,33 @@ +!RUN: %python %S/../test_errors.py %s %flang -fopenmp -fopenmp-version=52 + +! The CONSTRUCT trait set + +subroutine f00 + !$omp metadirective & +!ERROR: CONDITION is not a valid trait for CONSTRUCT trait set + !$omp & when(construct={condition(.true.)}: nothing) +end + +subroutine f01 + !$omp metadirective & +!ERROR: Directive-name traits cannot have properties + !$omp & when(construct={parallel(nowait), simd}: nothing) +end + +subroutine f02 + !$omp metadirective & +!ERROR: SIMD trait requires a clause that is allowed on the DECLARE SIMD directive + !$omp & when(construct={simd(nowait)}: nothing) +end + +subroutine f03 + !$omp metadirective & +!ERROR: Extension traits are not valid for CONSTRUCT trait set + !$omp & when(construct={fred(1)}: nothing) +end + +subroutine f04 + !$omp metadirective & +!This is ok + !$omp & when(construct={parallel, simd(simdlen(32), notinbranch)}: nothing) +end diff --git flang/test/Semantics/OpenMP/metadirective-device.f90 flang/test/Semantics/OpenMP/metadirective-device.f90 new file mode 100644 index 000000000000..fb1149906624 --- /dev/null +++ flang/test/Semantics/OpenMP/metadirective-device.f90 @@ -0,0 +1,36 @@ +!RUN: %python %S/../test_errors.py %s %flang -fopenmp -fopenmp-version=52 + +! 
The DEVICE and TARGET_DEVICE trait sets + +subroutine f00 + !$omp metadirective & +!ERROR: DEVICE_NUM is not a valid trait for DEVICE trait set + !$omp & when(device={device_num(10)}: nothing) +end + +subroutine f01 + !$omp metadirective & +!This is ok: all traits are valid + !$omp & when(device={arch("some-arch"), isa("some-isa"), kind("some-kind")}:& + !$omp & nothing) +end + +subroutine f02 + !$omp metadirective & +!This is ok: all traits are valid + !$omp & when(target_device={arch("some-arch"), device_num(10), & + !$omp & isa("some-isa"), kind("some-kind"), uid("some-uid")}: nothing) +end + +subroutine f03 + !$omp metadirective & +!This is ok: extension traits are allowed + !$omp & when(device={some_new_trait}: nothing) +end + +subroutine f04 + !$omp metadirective & +!This is ok: extension traits are allowed + !$omp & when(target_device={another_new_trait(12, 21)}: nothing) +end + diff --git flang/test/Semantics/OpenMP/metadirective-implementation.f90 flang/test/Semantics/OpenMP/metadirective-implementation.f90 new file mode 100644 index 000000000000..7a7642158fc2 --- /dev/null +++ flang/test/Semantics/OpenMP/metadirective-implementation.f90 @@ -0,0 +1,33 @@ +!RUN: %python %S/../test_errors.py %s %flang -fopenmp -fopenmp-version=52 + +! The IMPLEMENTATION trait set + +subroutine f00 + !$omp metadirective & +!ERROR: Trait property should be a clause + !$omp & when(implementation={atomic_default_mem_order(0)}: nothing) +end + +subroutine f01 + !$omp metadirective & +!ERROR: ATOMIC_DEFAULT_MEM_ORDER trait requires a clause from the memory-order clause set + !$omp & when(implementation={atomic_default_mem_order(nowait)}: nothing) +end + +subroutine f02 + !$omp metadirective & +!ERROR: REQUIRES trait requires a clause from the requirement clause set +!ERROR: Invalid clause specification for SHARED + !$omp & when(implementation={requires(shared)}: nothing) +end + +subroutine f03 + !$omp metadirective & +!This is ok + !$omp & when(implementation={ & + !$omp & atomic_default_mem_order(relaxed), & + !$omp & extension("foo"), & + !$omp & requires(unified_address), + !$omp & vendor(some_vendor) & + !$omp & }: nothing) +end diff --git flang/test/Semantics/OpenMP/metadirective-user.f90 flang/test/Semantics/OpenMP/metadirective-user.f90 new file mode 100644 index 000000000000..c4f037d57a9d --- /dev/null +++ flang/test/Semantics/OpenMP/metadirective-user.f90 @@ -0,0 +1,29 @@ +!RUN: %python %S/../test_errors.py %s %flang -fopenmp -fopenmp-version=52 + +! 
The USER trait set + +subroutine f00(x) + integer :: x + !$omp metadirective & +!ERROR: CONDITION trait requires a single LOGICAL expression + !$omp & when(user={condition(score(2): x)}: nothing) +end + +subroutine f01 + !$omp metadirective & +!ERROR: CONDITION trait requires a single expression property + !$omp & when(user={condition(.true., .false.)}: nothing) +end + +subroutine f02 + !$omp metadirective & +!ERROR: Extension traits are not valid for USER trait set + !$omp & when(user={fred}: nothing) +end + +subroutine f03(x) + integer :: x + !$omp metadirective & +!This is ok + !$omp & when(user={condition(x > 0)}: nothing) +end diff --git flang/test/Semantics/bug124487.f90 flang/test/Semantics/bug124487.f90 new file mode 100644 index 000000000000..b91757c36236 --- /dev/null +++ flang/test/Semantics/bug124487.f90 @@ -0,0 +1,14 @@ +!RUN: %flang_fc1 -fsyntax-only %s 2>&1 | FileCheck --allow-empty %s +!CHECK-NOT: error: +module m + interface + module subroutine smp(x) + character, external :: x + end + end interface +end +submodule (m) sm + contains + module procedure smp ! crashes here + end +end diff --git flang/test/Semantics/bug124621.f90 flang/test/Semantics/bug124621.f90 new file mode 100644 index 000000000000..1106ed4a25c4 --- /dev/null +++ flang/test/Semantics/bug124621.f90 @@ -0,0 +1,46 @@ +! RUN: %python %S/test_errors.py %s %flang_fc1 +module m + type t1 + contains + procedure, pass(from) :: defAsst1 + generic :: assignment(=) => defAsst1 + end type + type t2 + end type + type t3 + end type + interface assignment(=) + module procedure defAsst2 + end interface + contains + subroutine defAsst1(to,from) + class(*), intent(out) :: to + class(t1), intent(in) :: from + end + subroutine defAsst2(to,from) + class(*), intent(out) :: to + class(t2), intent(in) :: from + end +end + +program test + use m + type(t1) x1 + type(t2) x2 + type(t3) x3 + j = x1 + j = x2 + !ERROR: No intrinsic or user-defined ASSIGNMENT(=) matches operand types INTEGER(4) and TYPE(t3) + j = x3 + x1 = x1 + x1 = x2 + !ERROR: No intrinsic or user-defined ASSIGNMENT(=) matches operand types TYPE(t1) and TYPE(t3) + x1 = x3 + x2 = x1 + x2 = x2 + !ERROR: No intrinsic or user-defined ASSIGNMENT(=) matches operand types TYPE(t2) and TYPE(t3) + x2 = x3 + x3 = x1 + x3 = x2 + x3 = x3 +end diff --git flang/test/Semantics/bug124716.f90 flang/test/Semantics/bug124716.f90 new file mode 100644 index 000000000000..c1487a235721 --- /dev/null +++ flang/test/Semantics/bug124716.f90 @@ -0,0 +1,36 @@ +! 
RUN: %python %S/test_modfile.py %s %flang_fc1 +MODULE m1 + INTERFACE + MODULE SUBROUTINE sub1(N, ARR) + INTEGER, INTENT(IN) :: N + INTEGER, DIMENSION(N) :: ARR + END SUBROUTINE + END INTERFACE +END MODULE +SUBMODULE (m1) m1sub + CONTAINS + MODULE SUBROUTINE sub1(N, ARR) + INTEGER, INTENT(IN) :: N + INTEGER, DIMENSION(N) :: ARR + PRINT *, "sub1", N, ARR + END SUBROUTINE +END SUBMODULE + +!Expect: m1.mod +!module m1 +!interface +!module subroutine sub1(n,arr) +!integer(4),intent(in)::n +!integer(4)::arr(1_8:int(n,kind=8)) +!end +!end interface +!end + +!Expect: m1-m1sub.mod +!submodule(m1) m1sub +!contains +!module subroutine sub1(n,arr) +!integer(4),intent(in)::n +!integer(4)::arr(1_8:int(n,kind=8)) +!end +!end diff --git flang/test/Semantics/bug124731.f90 flang/test/Semantics/bug124731.f90 new file mode 100644 index 000000000000..924b41dd1db4 --- /dev/null +++ flang/test/Semantics/bug124731.f90 @@ -0,0 +1,24 @@ +!RUN: %flang_fc1 -fsyntax-only %s 2>&1 | FileCheck %s --allow-empty +!CHECK-NOT: error: +module m1 + interface + module subroutine foo + end + end interface + real x +end +module m2 + use m1 +end +submodule(m1) sm1 + use m2 ! ok + contains + module procedure foo + end +end +submodule(m1) sm2 + contains + subroutine bar + use m2 ! ok + end +end diff --git flang/test/Semantics/bug12477.f90 flang/test/Semantics/bug12477.f90 new file mode 100644 index 000000000000..52d079e3b26b --- /dev/null +++ flang/test/Semantics/bug12477.f90 @@ -0,0 +1,26 @@ +!RUN: %flang_fc1 -fsyntax-only %s 2>&1 | FileCheck %s --allow-empty +!CHECK-NOT: error: +module m + type t + contains + procedure nonelemental + generic :: operator(+) => nonelemental + end type + interface operator(+) + procedure elemental + end interface + contains + type(t) elemental function elemental (a, b) + class(t), intent(in) :: a, b + elemental = t() + end + type(t) function nonelemental (a, b) + class(t), intent(in) :: a, b(:) + nonelemental = t() + end +end +program main + use m + type(t) x, y(1) + x = x + y ! ok +end diff --git flang/test/Semantics/bug124976.f90 flang/test/Semantics/bug124976.f90 new file mode 100644 index 000000000000..29c21d4ead84 --- /dev/null +++ flang/test/Semantics/bug124976.f90 @@ -0,0 +1,33 @@ +!RUN: %python %S/test_errors.py %s %flang_fc1 +program main + type base + integer :: x = 1 + end type + type, extends(base) :: child + integer :: y = 2 + end type + class(child), allocatable :: c1(:), c2(:,:) + class(base), allocatable :: b1(:), b2(:,:) + logical var(1) + common /blk/ var + allocate(c1(2), c2(2,2), b1(2), b2(2,2)) + !ERROR: Actual argument for 'pad=' has bad type or kind 'CLASS(base)' + c2 = reshape(c1, shape(c2), pad=b1) + b2 = reshape(b1, shape(b2), pad=c1) ! ok + !ERROR: Actual argument for 'to=' has bad type or kind 'CLASS(child)' + call move_alloc(b1, c1) + call move_alloc(c1, b1) ! ok + !ERROR: Actual argument for 'boundary=' has bad type or kind 'CLASS(base)' + c1 = eoshift(c1, 1, b1(1)) + c1 = eoshift(c1, 1, c2(1,1)) ! ok + b1 = eoshift(b1, 1, c1(1)) ! ok + !ERROR: Actual argument for 'fsource=' has bad type or kind 'CLASS(child)' + b1 = merge(b1, c1, var(1)) + !ERROR: Actual argument for 'fsource=' has bad type or kind 'CLASS(base)' + b1 = merge(c1, b1, var(1)) + b1 = merge(b1, b1, var(1)) ! ok + !ERROR: Actual argument for 'vector=' has bad type or kind 'CLASS(base)' + c1 = pack(c1, var, b1) + c1 = pack(c1, var, c1) ! ok + b1 = pack(b1, var, c1) ! 
ok +end diff --git flang/test/Semantics/cuf18.cuf flang/test/Semantics/cuf18.cuf new file mode 100644 index 000000000000..ce9a2a31ca0d --- /dev/null +++ flang/test/Semantics/cuf18.cuf @@ -0,0 +1,11 @@ +! RUN: %python %S/test_errors.py %s %flang_fc1 + +subroutine sub1() + real, allocatable, device :: a(:) + +!ERROR: Unsupported CUDA data transfer + a = a + 10 ! Illegal expression according to 3.4.2 +end subroutine + + + diff --git flang/test/Semantics/kinds01.f90 flang/test/Semantics/kinds01.f90 index 5238a90719fe..82c4d76da371 100644 --- flang/test/Semantics/kinds01.f90 +++ flang/test/Semantics/kinds01.f90 @@ -1,3 +1,4 @@ +! REQUIRES: x86_64-registered-target ! RUN: %python %S/test_symbols.py %s %flang_fc1 !DEF: /MainProgram1/jk1 ObjectEntity INTEGER(1) integer(kind=1) jk1 diff --git flang/test/Semantics/kinds02.f90 flang/test/Semantics/kinds02.f90 index 13dbb803de8e..02b1e6c8c310 100644 --- flang/test/Semantics/kinds02.f90 +++ flang/test/Semantics/kinds02.f90 @@ -1,3 +1,4 @@ +! REQUIRES: x86_64-registered-target ! RUN: %python %S/test_errors.py %s %flang_fc1 ! C712 The value of scalar-int-constant-expr shall be nonnegative and ! shall specify a representation method that exists on the processor. diff --git flang/test/Semantics/kinds04_q10.f90 flang/test/Semantics/kinds04_q10.f90 index aa5c4abe2f1d..3bec7a386585 100644 --- flang/test/Semantics/kinds04_q10.f90 +++ flang/test/Semantics/kinds04_q10.f90 @@ -1,4 +1,5 @@ -! RUN: %python %S/test_errors.py %s %flang_fc1 +! RUN: not %flang_fc1 %s 2>%t.stderr +! RUN: FileCheck %s --input-file=%t.stderr --check-prefixes=PORTABILITY,ERROR,WARNING%if system-aix %{,AIX_WARNING%} ! C716 If both kind-param and exponent-letter appear, exponent-letter ! shall be E. (As an extension we also allow an exponent-letter which matches ! the kind-param) @@ -12,10 +13,12 @@ subroutine s(var) !PORTABILITY: Explicit kind parameter together with non-'E' exponent letter is not standard real :: realvar4 = 4.0D6_8 !WARNING: Explicit kind parameter on real constant disagrees with exponent letter 'q' + !AIX_WARNING: underflow on REAL(10) to REAL(4) conversion real :: realvar5 = 4.0Q6_10 !PORTABILITY: Explicit kind parameter together with non-'E' exponent letter is not standard real :: realvar6 = 4.0Q6_16 real :: realvar7 = 4.0E6_8 + !AIX_WARNING: underflow on REAL(10) to REAL(4) conversion real :: realvar8 = 4.0E6_10 real :: realvar9 = 4.0E6_16 !ERROR: Unsupported REAL(KIND=32) @@ -29,6 +32,7 @@ subroutine s(var) !PORTABILITY: Explicit kind parameter together with non-'E' exponent letter is not standard double precision :: doublevar5 = 4.0Q6_16 double precision :: doublevar6 = 4.0E6_8 + !AIX_WARNING: underflow on REAL(10) to REAL(8) conversion double precision :: doublevar7 = 4.0E6_10 double precision :: doublevar8 = 4.0E6_16 !ERROR: Unsupported REAL(KIND=32) diff --git flang/test/Semantics/resolve110.f90 flang/test/Semantics/resolve110.f90 index 0b9e560e5ed7..398304b4d767 100644 --- flang/test/Semantics/resolve110.f90 +++ flang/test/Semantics/resolve110.f90 @@ -1,7 +1,5 @@ ! RUN: %python %S/test_errors.py %s %flang_fc1 ! Exercise ways to define and extend non-type-bound generics -! TODO: crashes compiler (infinite recursion) when build with MSVC -! 
XFAIL: system-windows module m1 type :: t1; end type diff --git flang/test/Semantics/resolve41.f90 flang/test/Semantics/resolve41.f90 index d2a991c0a52d..4d0b3a3d31e5 100644 --- flang/test/Semantics/resolve41.f90 +++ flang/test/Semantics/resolve41.f90 @@ -21,7 +21,7 @@ module m integer :: ff = 2_f !ERROR: REAL(KIND=23) is not a supported type real(d/2) :: g - !ERROR: REAL*47 is not a supported type + !ERROR: REAL(KIND=47) is not a supported type real*47 :: h !ERROR: COMPLEX*47 is not a supported type complex*47 :: i diff --git flang/test/Semantics/self-use.f90 flang/test/Semantics/self-use.f90 index 4bc66a24343c..12433732fc33 100644 --- flang/test/Semantics/self-use.f90 +++ flang/test/Semantics/self-use.f90 @@ -15,7 +15,6 @@ submodule(m) submod1 contains module subroutine separate !ERROR: Module 'm' cannot USE itself from its own submodule 'submod1' - !ERROR: Cannot use-associate 'separate'; it is already declared in this scope use m end end diff --git flang/test/Transforms/generic-loop-rewriting.mlir flang/test/Transforms/generic-loop-rewriting.mlir index 842136444fc1..49caf242fe32 100644 --- flang/test/Transforms/generic-loop-rewriting.mlir +++ flang/test/Transforms/generic-loop-rewriting.mlir @@ -1,9 +1,6 @@ // RUN: fir-opt --omp-generic-loop-conversion %s | FileCheck %s -omp.private {type = private} @_QFteams_loopEi_private_ref_i32 : !fir.ref<i32> alloc { -^bb0(%arg0: !fir.ref<i32>): - omp.yield(%arg0 : !fir.ref<i32>) -} +omp.private {type = private} @_QFteams_loopEi_private_i32 : i32 func.func @_QPteams_loop() { %i = fir.alloca i32 @@ -11,7 +8,7 @@ func.func @_QPteams_loop() { %c0 = arith.constant 0 : i32 %c10 = arith.constant 10 : i32 %c1 = arith.constant 1 : i32 - omp.loop private(@_QFteams_loopEi_private_ref_i32 %i -> %arg2 : !fir.ref<i32>) { + omp.loop private(@_QFteams_loopEi_private_i32 %i -> %arg2 : !fir.ref<i32>) { omp.loop_nest (%arg3) : i32 = (%c0) to (%c10) inclusive step (%c1) { fir.store %arg3 to %arg2 : !fir.ref<i32> omp.yield diff --git flang/test/Transforms/omp-maps-for-privatized-symbols.fir flang/test/Transforms/omp-maps-for-privatized-symbols.fir index d32444aaabf2..10a76126ed05 100644 --- flang/test/Transforms/omp-maps-for-privatized-symbols.fir +++ flang/test/Transforms/omp-maps-for-privatized-symbols.fir @@ -1,12 +1,12 @@ // RUN: fir-opt --split-input-file --omp-maps-for-privatized-symbols %s | FileCheck %s module attributes {omp.is_target_device = false} { - omp.private {type = private} @_QFtarget_simpleEsimple_var_private_ref_box_heap_i32 : !fir.ref<!fir.box<!fir.heap<i32>>> alloc { - ^bb0(%arg0: !fir.ref<!fir.box<!fir.heap<i32>>>): - %0 = fir.alloca !fir.box<!fir.heap<i32>> {bindc_name = "simple_var", pinned, uniq_name = "_QFtarget_simpleEsimple_var"} - %1 = fir.load %arg0 : !fir.ref<!fir.box<!fir.heap<i32>>> - %5:2 = hlfir.declare %0 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtarget_simpleEsimple_var"} : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> (!fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<!fir.box<!fir.heap<i32>>>) - omp.yield(%5#0 : !fir.ref<!fir.box<!fir.heap<i32>>>) + omp.private {type = private} @_QFtarget_simpleEsimple_var_private_ref_box_heap_i32 : !fir.box<!fir.heap<i32>> init { + ^bb0(%arg0: !fir.ref<!fir.box<!fir.heap<i32>>>, %arg1: !fir.ref<!fir.box<!fir.heap<i32>>>): + %mold = fir.load %arg0 : !fir.ref<!fir.box<!fir.heap<i32>>> + // extract box address, see if it is null, etc + omp.yield(%arg1: !fir.ref<!fir.box<!fir.heap<i32>>>) } + func.func @_QPtarget_simple() { %0 = fir.alloca i32 {bindc_name = "a", uniq_name = 
"_QFtarget_simpleEa"} %1:2 = hlfir.declare %0 {uniq_name = "_QFtarget_simpleEa"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) diff --git flang/test/lib/OpenACC/TestOpenACCInterfaces.cpp flang/test/lib/OpenACC/TestOpenACCInterfaces.cpp index a01396748f4c..5c14809a265e 100644 --- flang/test/lib/OpenACC/TestOpenACCInterfaces.cpp +++ flang/test/lib/OpenACC/TestOpenACCInterfaces.cpp @@ -28,7 +28,7 @@ struct TestFIROpenACCInterfaces void runOnOperation() override { mlir::ModuleOp mod = getOperation(); auto datalayout = - fir::support::getOrSetDataLayout(mod, /*allowDefaultLayout=*/true); + fir::support::getOrSetMLIRDataLayout(mod, /*allowDefaultLayout=*/true); mlir::OpBuilder builder(mod); getOperation().walk([&](Operation *op) { if (isa<ACC_DATA_ENTRY_OPS>(op)) { diff --git libc/cmake/modules/LLVMLibCCompileOptionRules.cmake libc/cmake/modules/LLVMLibCCompileOptionRules.cmake index 12420db33196..0facb0b9be0c 100644 --- libc/cmake/modules/LLVMLibCCompileOptionRules.cmake +++ libc/cmake/modules/LLVMLibCCompileOptionRules.cmake @@ -179,8 +179,9 @@ function(_get_common_compile_options output_var flags) endif() list(APPEND compile_options "-Wconversion") list(APPEND compile_options "-Wno-sign-conversion") - # Silence this warning because _Complex is a part of C99. + list(APPEND compile_options "-Wdeprecated") if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + # Silence this warning because _Complex is a part of C99. list(APPEND compile_options "-fext-numeric-literals") else() list(APPEND compile_options "-Wno-c99-extensions") diff --git libc/include/llvm-libc-macros/limits-macros.h libc/include/llvm-libc-macros/limits-macros.h index d4aa7ae539e8..a4957225c9d3 100644 --- libc/include/llvm-libc-macros/limits-macros.h +++ libc/include/llvm-libc-macros/limits-macros.h @@ -235,4 +235,8 @@ #define _POSIX_PATH_MAX 256 #endif +#ifndef _POSIX_ARG_MAX +#define _POSIX_ARG_MAX 4096 +#endif + #endif // LLVM_LIBC_MACROS_LIMITS_MACROS_H diff --git libc/src/__support/CPP/span.h libc/src/__support/CPP/span.h index e9e3dbf169ce..a41c9b744e37 100644 --- libc/src/__support/CPP/span.h +++ libc/src/__support/CPP/span.h @@ -10,7 +10,7 @@ #include <stddef.h> // For size_t -#include "array.h" // For array +#include "array.h" // For array #include "src/__support/macros/config.h" #include "type_traits.h" // For remove_cv_t, enable_if_t, is_same_v, is_const_v @@ -52,6 +52,8 @@ public: LIBC_INLINE constexpr span() : span_data(nullptr), span_size(0) {} + LIBC_INLINE constexpr span(const span &) = default; + LIBC_INLINE constexpr span(pointer first, size_type count) : span_data(first), span_size(count) {} diff --git libc/test/src/setjmp/CMakeLists.txt libc/test/src/setjmp/CMakeLists.txt index 049df89ba39a..392230784bd9 100644 --- libc/test/src/setjmp/CMakeLists.txt +++ libc/test/src/setjmp/CMakeLists.txt @@ -11,6 +11,8 @@ add_libc_unittest( libc_setjmp_unittests SRCS setjmp_test.cpp + CXX_STANDARD + 20 DEPENDS libc.src.setjmp.longjmp libc.src.setjmp.setjmp diff --git libc/test/src/setjmp/setjmp_test.cpp libc/test/src/setjmp/setjmp_test.cpp index 9e5f74a1734b..27113cd6e063 100644 --- libc/test/src/setjmp/setjmp_test.cpp +++ libc/test/src/setjmp/setjmp_test.cpp @@ -27,7 +27,7 @@ TEST(LlvmLibcSetJmpTest, SetAndJumpBack) { // The first time setjmp is called, it should return 0. // Subsequent calls will return the value passed to jump_back below. 
if (LIBC_NAMESPACE::setjmp(buf) <= MAX_LOOP) { - ++n; + n = n + 1; jump_back(buf, n); } ASSERT_EQ(longjmp_called, n); diff --git libc/utils/gpu/server/CMakeLists.txt libc/utils/gpu/server/CMakeLists.txt index a109d603318b..ae8a0d902f45 100644 --- libc/utils/gpu/server/CMakeLists.txt +++ libc/utils/gpu/server/CMakeLists.txt @@ -11,6 +11,7 @@ target_include_directories(llvmlibc_rpc_server PUBLIC ${CMAKE_CURRENT_SOURCE_DIR # Ignore unsupported clang attributes if we're using GCC. target_compile_options(llvmlibc_rpc_server PUBLIC + $<$<CXX_COMPILER_ID:Clang>:-Wno-c99-extensions> $<$<CXX_COMPILER_ID:GNU>:-Wno-attributes>) target_compile_definitions(llvmlibc_rpc_server PUBLIC LIBC_COPT_USE_C_ASSERT diff --git libclc/CMakeLists.txt libclc/CMakeLists.txt index 2c2c7f16e294..ff52153354e0 100644 --- libclc/CMakeLists.txt +++ libclc/CMakeLists.txt @@ -277,9 +277,15 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) list( APPEND dirs amdgpu ) endif() - # nvptx is special + # Some targets' directories alias others if( ${ARCH} STREQUAL nvptx OR ${ARCH} STREQUAL nvptx64 ) set( DARCH ptx ) + elseif( ${ARCH} STREQUAL clspv OR ${ARCH} STREQUAL clspv64 ) + set( DARCH clspv ) + elseif( ${ARCH} STREQUAL spirv OR ${ARCH} STREQUAL spirv64 ) + set( DARCH spirv ) + elseif( ${ARCH} STREQUAL amdgcn-mesa3d ) + set( DARCH amdgcn-amdhsa ) else() set( DARCH ${ARCH} ) endif() diff --git libclc/amdgcn-mesa3d libclc/amdgcn-mesa3d deleted file mode 120000 index 400782833efe..000000000000 --- libclc/amdgcn-mesa3d +++ /dev/null @@ -1 +0,0 @@ -amdgcn-amdhsa \ No newline at end of file diff --git libclc/clc/include/clc/integer/clc_mad_sat.h libclc/clc/include/clc/integer/clc_mad_sat.h new file mode 100644 index 000000000000..c474d067b0f6 --- /dev/null +++ libclc/clc/include/clc/integer/clc_mad_sat.h @@ -0,0 +1,12 @@ +#ifndef __CLC_INTEGER_CLC_MAD_SAT_H__ +#define __CLC_INTEGER_CLC_MAD_SAT_H__ + +#define __CLC_FUNCTION __clc_mad_sat +#define __CLC_BODY <clc/shared/ternary_decl.inc> + +#include <clc/integer/gentype.inc> + +#undef __CLC_BODY +#undef __CLC_FUNCTION + +#endif // __CLC_INTEGER_CLC_MAD_SAT_H__ diff --git libclc/clc/include/clc/integer/definitions.h libclc/clc/include/clc/integer/definitions.h index 18a9e54dec75..1b51c11a93f8 100644 --- libclc/clc/include/clc/integer/definitions.h +++ libclc/clc/include/clc/integer/definitions.h @@ -13,8 +13,12 @@ #define SHRT_MAX 32767 #define SHRT_MIN (-32767 - 1) #define UCHAR_MAX 255 +#define UCHAR_MIN 0 #define USHRT_MAX 65535 +#define USHRT_MIN 0 #define UINT_MAX 0xffffffff +#define UINT_MIN 0 #define ULONG_MAX 0xffffffffffffffffUL +#define ULONG_MIN 0UL #endif // __CLC_INTEGER_DEFINITIONS_H__ diff --git libclc/clc/lib/clspv/SOURCES libclc/clc/lib/clspv/SOURCES index 2581abe64f14..c96a6fc15bf6 100644 --- libclc/clc/lib/clspv/SOURCES +++ libclc/clc/lib/clspv/SOURCES @@ -2,6 +2,7 @@ ../generic/integer/clc_clz.cl ../generic/integer/clc_hadd.cl ../generic/integer/clc_mad24.cl +../generic/integer/clc_mad_sat.cl ../generic/integer/clc_mul24.cl ../generic/integer/clc_mul_hi.cl ../generic/integer/clc_popcount.cl diff --git libclc/clc/lib/clspv64 libclc/clc/lib/clspv64 deleted file mode 120000 index ea01ba94bc63..000000000000 --- libclc/clc/lib/clspv64 +++ /dev/null @@ -1 +0,0 @@ -clspv \ No newline at end of file diff --git libclc/clc/lib/generic/SOURCES libclc/clc/lib/generic/SOURCES index 2f4df168f707..5fd882eb1bb2 100644 --- libclc/clc/lib/generic/SOURCES +++ libclc/clc/lib/generic/SOURCES @@ -8,6 +8,7 @@ integer/clc_add_sat.cl integer/clc_clz.cl integer/clc_hadd.cl integer/clc_mad24.cl 
+integer/clc_mad_sat.cl integer/clc_mul24.cl integer/clc_mul_hi.cl integer/clc_popcount.cl diff --git libclc/clc/lib/generic/integer/clc_mad_sat.cl libclc/clc/lib/generic/integer/clc_mad_sat.cl new file mode 100644 index 000000000000..4e559dba2b2f --- /dev/null +++ libclc/clc/lib/generic/integer/clc_mad_sat.cl @@ -0,0 +1,119 @@ +#include <clc/clcmacro.h> +#include <clc/integer/clc_add_sat.h> +#include <clc/integer/clc_mad24.h> +#include <clc/integer/clc_mul_hi.h> +#include <clc/integer/clc_upsample.h> +#include <clc/integer/definitions.h> +#include <clc/internal/clc.h> +#include <clc/relational/clc_select.h> +#include <clc/shared/clc_clamp.h> + +#define __CLC_CONVERT_TY(X, TY) __builtin_convertvector(X, TY) + +// Macro for defining mad_sat variants for char/uchar/short/ushort +// FIXME: Once using __clc_convert_ty, can easily unify scalar and vector defs +#define __CLC_DEFINE_SIMPLE_MAD_SAT(TYPE, UP_TYPE, LIT_PREFIX) \ + _CLC_OVERLOAD _CLC_DEF TYPE __clc_mad_sat(TYPE x, TYPE y, TYPE z) { \ + return __clc_clamp( \ + (UP_TYPE)__clc_mad24((UP_TYPE)x, (UP_TYPE)y, (UP_TYPE)z), \ + (UP_TYPE)LIT_PREFIX##_MIN, (UP_TYPE)LIT_PREFIX##_MAX); \ + } + +#define __CLC_DEFINE_SIMPLE_MAD_SAT_VEC(TYPE, UP_TYPE, LIT_PREFIX) \ + _CLC_OVERLOAD _CLC_DEF TYPE __clc_mad_sat(TYPE x, TYPE y, TYPE z) { \ + UP_TYPE upscaled_mad = __clc_mad24(__CLC_CONVERT_TY(x, UP_TYPE), \ + __CLC_CONVERT_TY(y, UP_TYPE), \ + __CLC_CONVERT_TY(z, UP_TYPE)); \ + UP_TYPE clamped_mad = __clc_clamp(upscaled_mad, (UP_TYPE)LIT_PREFIX##_MIN, \ + (UP_TYPE)LIT_PREFIX##_MAX); \ + return __CLC_CONVERT_TY(clamped_mad, TYPE); \ + } + +#define __CLC_DEFINE_SIMPLE_MAD_SAT_ALL_TYS(TYPE, UP_TYPE, LIT_PREFIX) \ + __CLC_DEFINE_SIMPLE_MAD_SAT(TYPE, UP_TYPE, LIT_PREFIX) \ + __CLC_DEFINE_SIMPLE_MAD_SAT_VEC(TYPE##2, UP_TYPE##2, LIT_PREFIX) \ + __CLC_DEFINE_SIMPLE_MAD_SAT_VEC(TYPE##3, UP_TYPE##3, LIT_PREFIX) \ + __CLC_DEFINE_SIMPLE_MAD_SAT_VEC(TYPE##4, UP_TYPE##4, LIT_PREFIX) \ + __CLC_DEFINE_SIMPLE_MAD_SAT_VEC(TYPE##8, UP_TYPE##8, LIT_PREFIX) \ + __CLC_DEFINE_SIMPLE_MAD_SAT_VEC(TYPE##16, UP_TYPE##16, LIT_PREFIX) + +__CLC_DEFINE_SIMPLE_MAD_SAT_ALL_TYS(char, int, CHAR) +__CLC_DEFINE_SIMPLE_MAD_SAT_ALL_TYS(uchar, uint, UCHAR) +__CLC_DEFINE_SIMPLE_MAD_SAT_ALL_TYS(short, int, SHRT) +__CLC_DEFINE_SIMPLE_MAD_SAT_ALL_TYS(ushort, uint, USHRT) + +// Macro for defining mad_sat variants for uint/ulong +#define __CLC_DEFINE_UINTLONG_MAD_SAT(UTYPE, STYPE, ULIT_PREFIX) \ + _CLC_OVERLOAD _CLC_DEF UTYPE __clc_mad_sat(UTYPE x, UTYPE y, UTYPE z) { \ + STYPE has_mul_hi = __clc_mul_hi(x, y) != (UTYPE)0; \ + return __clc_select(__clc_add_sat(x * y, z), (UTYPE)ULIT_PREFIX##_MAX, \ + has_mul_hi); \ + } + +#define __CLC_DEFINE_UINTLONG_MAD_SAT_ALL_TYS(UTY, STY, ULIT_PREFIX) \ + __CLC_DEFINE_UINTLONG_MAD_SAT(UTY, STY, ULIT_PREFIX) \ + __CLC_DEFINE_UINTLONG_MAD_SAT(UTY##2, STY##2, ULIT_PREFIX) \ + __CLC_DEFINE_UINTLONG_MAD_SAT(UTY##3, STY##3, ULIT_PREFIX) \ + __CLC_DEFINE_UINTLONG_MAD_SAT(UTY##4, STY##4, ULIT_PREFIX) \ + __CLC_DEFINE_UINTLONG_MAD_SAT(UTY##8, STY##8, ULIT_PREFIX) \ + __CLC_DEFINE_UINTLONG_MAD_SAT(UTY##16, STY##16, ULIT_PREFIX) + +__CLC_DEFINE_UINTLONG_MAD_SAT_ALL_TYS(uint, int, UINT) +__CLC_DEFINE_UINTLONG_MAD_SAT_ALL_TYS(ulong, long, ULONG) + +// Macro for defining mad_sat variants for int +#define __CLC_DEFINE_SINT_MAD_SAT(INTTY, UINTTY, SLONGTY) \ + _CLC_OVERLOAD _CLC_DEF INTTY __clc_mad_sat(INTTY x, INTTY y, INTTY z) { \ + INTTY mhi = __clc_mul_hi(x, y); \ + UINTTY mlo = __clc_as_##UINTTY(x * y); \ + SLONGTY m = __clc_upsample(mhi, mlo); \ + m += __CLC_CONVERT_TY(z, 
SLONGTY); \ + m = __clc_clamp(m, (SLONGTY)INT_MIN, (SLONGTY)INT_MAX); \ + return __CLC_CONVERT_TY(m, INTTY); \ + } + +// FIXME: Once using __clc_convert_ty, can easily unify scalar and vector defs +#define __CLC_DEFINE_SINT_MAD_SAT_ALL_TYS(INTTY, UINTTY, SLONGTY) \ + _CLC_OVERLOAD _CLC_DEF INTTY __clc_mad_sat(INTTY x, INTTY y, INTTY z) { \ + INTTY mhi = __clc_mul_hi(x, y); \ + UINTTY mlo = __clc_as_##UINTTY(x * y); \ + SLONGTY m = __clc_upsample(mhi, mlo); \ + m += z; \ + return __clc_clamp(m, (SLONGTY)INT_MIN, (SLONGTY)INT_MAX); \ + } \ + __CLC_DEFINE_SINT_MAD_SAT(INTTY##2, UINTTY##2, SLONGTY##2) \ + __CLC_DEFINE_SINT_MAD_SAT(INTTY##3, UINTTY##3, SLONGTY##3) \ + __CLC_DEFINE_SINT_MAD_SAT(INTTY##4, UINTTY##4, SLONGTY##4) \ + __CLC_DEFINE_SINT_MAD_SAT(INTTY##8, UINTTY##8, SLONGTY##8) \ + __CLC_DEFINE_SINT_MAD_SAT(INTTY##16, UINTTY##16, SLONGTY##16) + +__CLC_DEFINE_SINT_MAD_SAT_ALL_TYS(int, uint, long) + +// Macro for defining mad_sat variants for long +#define __CLC_DEFINE_SLONG_MAD_SAT(SLONGTY, ULONGTY) \ + _CLC_OVERLOAD _CLC_DEF SLONGTY __clc_mad_sat(SLONGTY x, SLONGTY y, \ + SLONGTY z) { \ + SLONGTY hi = __clc_mul_hi(x, y); \ + ULONGTY ulo = __clc_as_##ULONGTY(x * y); \ + SLONGTY max1 = (x < 0) == (y < 0) && hi != 0; \ + SLONGTY max2 = hi == 0 && ulo >= LONG_MAX && \ + (z > 0 || (ulo + __clc_as_##ULONGTY(z)) > LONG_MAX); \ + SLONGTY min1 = (((x < 0) != (y < 0)) && hi != -1); \ + SLONGTY min2 = \ + hi == -1 && ulo <= ((ULONGTY)LONG_MAX + 1UL) && \ + (z < 0 || __clc_as_##ULONGTY(z) < ((ULONGTY)LONG_MAX - ulo)); \ + SLONGTY ret = __clc_as_##SLONGTY(ulo + __clc_as_##ULONGTY(z)); \ + ret = __clc_select(ret, (SLONGTY)LONG_MAX, (SLONGTY)(max1 || max2)); \ + ret = __clc_select(ret, (SLONGTY)LONG_MIN, (SLONGTY)(min1 || min2)); \ + return ret; \ + } + +#define __CLC_DEFINE_SLONG_MAD_SAT_ALL_TYS(SLONGTY, ULONGTY) \ + __CLC_DEFINE_SLONG_MAD_SAT(SLONGTY, ULONGTY) \ + __CLC_DEFINE_SLONG_MAD_SAT(SLONGTY##2, ULONGTY##2) \ + __CLC_DEFINE_SLONG_MAD_SAT(SLONGTY##3, ULONGTY##3) \ + __CLC_DEFINE_SLONG_MAD_SAT(SLONGTY##4, ULONGTY##4) \ + __CLC_DEFINE_SLONG_MAD_SAT(SLONGTY##8, ULONGTY##8) \ + __CLC_DEFINE_SLONG_MAD_SAT(SLONGTY##16, ULONGTY##16) + +__CLC_DEFINE_SLONG_MAD_SAT_ALL_TYS(long, ulong) diff --git libclc/clc/lib/spirv/SOURCES libclc/clc/lib/spirv/SOURCES index ddc9e4c49d86..c3cc4068225d 100644 --- libclc/clc/lib/spirv/SOURCES +++ libclc/clc/lib/spirv/SOURCES @@ -6,6 +6,7 @@ ../generic/integer/clc_clz.cl ../generic/integer/clc_hadd.cl ../generic/integer/clc_mad24.cl +../generic/integer/clc_mad_sat.cl ../generic/integer/clc_mul24.cl ../generic/integer/clc_mul_hi.cl ../generic/integer/clc_popcount.cl diff --git libclc/clc/lib/spirv64/SOURCES libclc/clc/lib/spirv64/SOURCES deleted file mode 100644 index ddc9e4c49d86..000000000000 --- libclc/clc/lib/spirv64/SOURCES +++ /dev/null @@ -1,24 +0,0 @@ -../generic/common/clc_degrees.cl -../generic/common/clc_radians.cl -../generic/common/clc_smoothstep.cl -../generic/geometric/clc_dot.cl -../generic/integer/clc_add_sat.cl -../generic/integer/clc_clz.cl -../generic/integer/clc_hadd.cl -../generic/integer/clc_mad24.cl -../generic/integer/clc_mul24.cl -../generic/integer/clc_mul_hi.cl -../generic/integer/clc_popcount.cl -../generic/integer/clc_rhadd.cl -../generic/integer/clc_sub_sat.cl -../generic/integer/clc_upsample.cl -../generic/math/clc_ceil.cl -../generic/math/clc_copysign.cl -../generic/math/clc_fabs.cl -../generic/math/clc_floor.cl -../generic/math/clc_mad.cl -../generic/math/clc_nextafter.cl -../generic/math/clc_rint.cl -../generic/math/clc_trunc.cl 
-../generic/relational/clc_select.cl -../generic/shared/clc_clamp.cl diff --git libclc/clspv64 libclc/clspv64 deleted file mode 120000 index ea01ba94bc63..000000000000 --- libclc/clspv64 +++ /dev/null @@ -1 +0,0 @@ -clspv \ No newline at end of file diff --git libclc/generic/lib/integer/mad_sat.cl libclc/generic/lib/integer/mad_sat.cl index 2372eaacd6da..e9c44364deee 100644 --- libclc/generic/lib/integer/mad_sat.cl +++ libclc/generic/lib/integer/mad_sat.cl @@ -1,72 +1,7 @@ #include <clc/clc.h> -#include <clc/clcmacro.h> +#include <clc/integer/clc_mad_sat.h> -_CLC_OVERLOAD _CLC_DEF char mad_sat(char x, char y, char z) { - return clamp((short)mad24((short)x, (short)y, (short)z), (short)CHAR_MIN, (short) CHAR_MAX); -} +#define FUNCTION mad_sat +#define __CLC_BODY <clc/shared/ternary_def.inc> -_CLC_OVERLOAD _CLC_DEF uchar mad_sat(uchar x, uchar y, uchar z) { - return clamp((ushort)mad24((ushort)x, (ushort)y, (ushort)z), (ushort)0, (ushort) UCHAR_MAX); -} - -_CLC_OVERLOAD _CLC_DEF short mad_sat(short x, short y, short z) { - return clamp((int)mad24((int)x, (int)y, (int)z), (int)SHRT_MIN, (int) SHRT_MAX); -} - -_CLC_OVERLOAD _CLC_DEF ushort mad_sat(ushort x, ushort y, ushort z) { - return clamp((uint)mad24((uint)x, (uint)y, (uint)z), (uint)0, (uint) USHRT_MAX); -} - -_CLC_OVERLOAD _CLC_DEF int mad_sat(int x, int y, int z) { - int mhi = mul_hi(x, y); - uint mlo = x * y; - long m = upsample(mhi, mlo); - m += z; - if (m > INT_MAX) - return INT_MAX; - if (m < INT_MIN) - return INT_MIN; - return m; -} - -_CLC_OVERLOAD _CLC_DEF uint mad_sat(uint x, uint y, uint z) { - if (mul_hi(x, y) != 0) - return UINT_MAX; - return add_sat(x * y, z); -} - -_CLC_OVERLOAD _CLC_DEF long mad_sat(long x, long y, long z) { - long hi = mul_hi(x, y); - ulong ulo = x * y; - long slo = x * y; - /* Big overflow of more than 2 bits, add can't fix this */ - if (((x < 0) == (y < 0)) && hi != 0) - return LONG_MAX; - /* Low overflow in mul and z not neg enough to correct it */ - if (hi == 0 && ulo >= LONG_MAX && (z > 0 || (ulo + z) > LONG_MAX)) - return LONG_MAX; - /* Big overflow of more than 2 bits, add can't fix this */ - if (((x < 0) != (y < 0)) && hi != -1) - return LONG_MIN; - /* Low overflow in mul and z not pos enough to correct it */ - if (hi == -1 && ulo <= ((ulong)LONG_MAX + 1UL) && (z < 0 || z < (LONG_MAX - ulo))) - return LONG_MIN; - /* We have checked all conditions, any overflow in addition returns - * the correct value */ - return ulo + z; -} - -_CLC_OVERLOAD _CLC_DEF ulong mad_sat(ulong x, ulong y, ulong z) { - if (mul_hi(x, y) != 0) - return ULONG_MAX; - return add_sat(x * y, z); -} - -_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, char, mad_sat, char, char, char) -_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, uchar, mad_sat, uchar, uchar, uchar) -_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, short, mad_sat, short, short, short) -_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, ushort, mad_sat, ushort, ushort, ushort) -_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, int, mad_sat, int, int, int) -_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, uint, mad_sat, uint, uint, uint) -_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, long, mad_sat, long, long, long) -_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, ulong, mad_sat, ulong, ulong, ulong) +#include <clc/integer/gentype.inc> diff --git libclc/spirv64/lib/SOURCES libclc/spirv64/lib/SOURCES deleted file mode 100644 index 854cba614c8b..000000000000 --- libclc/spirv64/lib/SOURCES +++ /dev/null @@ -1,90 +0,0 @@ -subnormal_config.cl 
-../../generic/lib/async/async_work_group_strided_copy.cl -../../generic/lib/async/wait_group_events.cl -../../generic/lib/common/degrees.cl -../../generic/lib/common/mix.cl -../../generic/lib/common/radians.cl -../../generic/lib/common/sign.cl -../../generic/lib/common/smoothstep.cl -../../generic/lib/common/step.cl -../../generic/lib/geometric/cross.cl -../../generic/lib/geometric/distance.cl -../../generic/lib/geometric/dot.cl -../../generic/lib/geometric/fast_distance.cl -../../generic/lib/geometric/fast_length.cl -../../generic/lib/geometric/fast_normalize.cl -../../generic/lib/geometric/length.cl -../../generic/lib/geometric/normalize.cl -../../generic/lib/integer/rotate.cl -../../generic/lib/integer/mad_sat.cl -../../generic/lib/math/acos.cl -../../generic/lib/math/acosh.cl -../../generic/lib/math/acospi.cl -../../generic/lib/math/asin.cl -../../generic/lib/math/asinh.cl -../../generic/lib/math/asinpi.cl -../../generic/lib/math/atan.cl -../../generic/lib/math/atan2.cl -../../generic/lib/math/atan2pi.cl -../../generic/lib/math/atanh.cl -../../generic/lib/math/atanpi.cl -../../generic/lib/math/cbrt.cl -../../generic/lib/math/cos.cl -../../generic/lib/math/cosh.cl -../../generic/lib/math/cospi.cl -../../generic/lib/math/ep_log.cl -../../generic/lib/math/erf.cl -../../generic/lib/math/erfc.cl -../../generic/lib/math/exp.cl -../../generic/lib/math/exp_helper.cl -../../generic/lib/math/expm1.cl -../../generic/lib/math/exp2.cl -../../generic/lib/math/clc_exp10.cl -../../generic/lib/math/exp10.cl -../../generic/lib/math/clc_fma.cl -math/fma.cl -../../generic/lib/math/clc_fmod.cl -../../generic/lib/math/fmod.cl -../../generic/lib/math/fract.cl -../../generic/lib/math/frexp.cl -../../generic/lib/math/half_rsqrt.cl -../../generic/lib/math/half_sqrt.cl -../../generic/lib/math/clc_hypot.cl -../../generic/lib/math/hypot.cl -../../generic/lib/math/ilogb.cl -../../generic/lib/math/clc_ldexp.cl -../../generic/lib/math/ldexp.cl -../../generic/lib/math/lgamma.cl -../../generic/lib/math/lgamma_r.cl -../../generic/lib/math/log.cl -../../generic/lib/math/log10.cl -../../generic/lib/math/log1p.cl -../../generic/lib/math/log2.cl -../../generic/lib/math/logb.cl -../../generic/lib/math/modf.cl -../../generic/lib/math/tables.cl -../../generic/lib/math/clc_pow.cl -../../generic/lib/math/pow.cl -../../generic/lib/math/clc_pown.cl -../../generic/lib/math/pown.cl -../../generic/lib/math/clc_powr.cl -../../generic/lib/math/powr.cl -../../generic/lib/math/clc_remainder.cl -../../generic/lib/math/remainder.cl -../../generic/lib/math/clc_remquo.cl -../../generic/lib/math/remquo.cl -../../generic/lib/math/clc_rootn.cl -../../generic/lib/math/rootn.cl -../../generic/lib/math/sin.cl -../../generic/lib/math/sincos.cl -../../generic/lib/math/sincos_helpers.cl -../../generic/lib/math/sinh.cl -../../generic/lib/math/sinpi.cl -../../generic/lib/math/clc_tan.cl -../../generic/lib/math/tan.cl -../../generic/lib/math/tanh.cl -../../generic/lib/math/clc_tanpi.cl -../../generic/lib/math/tanpi.cl -../../generic/lib/math/tgamma.cl -../../generic/lib/shared/vload.cl -../../generic/lib/shared/vstore.cl diff --git libclc/spirv64/lib/math/fma.cl libclc/spirv64/lib/math/fma.cl deleted file mode 100644 index 79142425e52d..000000000000 --- libclc/spirv64/lib/math/fma.cl +++ /dev/null @@ -1,11 +0,0 @@ -#include <clc/clc.h> -#include <math/clc_fma.h> - -#define __CLC_BODY <fma.inc> -#define __FLOAT_ONLY -#include <clc/math/gentype.inc> - -bool __clc_runtime_has_hw_fma32() -{ - return false; -} diff --git libclc/spirv64/lib/math/fma.inc 
libclc/spirv64/lib/math/fma.inc deleted file mode 100644 index 0f12c565758f..000000000000 --- libclc/spirv64/lib/math/fma.inc +++ /dev/null @@ -1,3 +0,0 @@ -_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE fma(__CLC_GENTYPE a, __CLC_GENTYPE b, __CLC_GENTYPE c) { - return __clc_sw_fma(a, b, c); -} diff --git libclc/spirv64/lib/subnormal_config.cl libclc/spirv64/lib/subnormal_config.cl deleted file mode 100644 index 3ab5a6394b93..000000000000 --- libclc/spirv64/lib/subnormal_config.cl +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright (c) 2015 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ - -#include <clc/clc.h> -#include <clc/math/clc_subnormal_config.h> - -_CLC_DEF bool __clc_fp16_subnormals_supported() { return false; } - -_CLC_DEF bool __clc_fp32_subnormals_supported() { return false; } - -_CLC_DEF bool __clc_fp64_subnormals_supported() { return false; } diff --git libcxx/docs/ReleaseNotes/21.rst libcxx/docs/ReleaseNotes/21.rst index e746244b653d..82f1de6bad39 100644 --- libcxx/docs/ReleaseNotes/21.rst +++ libcxx/docs/ReleaseNotes/21.rst @@ -38,13 +38,13 @@ What's New in Libc++ 21.0.0? Implemented Papers ------------------ -- TODO - +- N4258: Cleaning-up noexcept in the Library (`Github <https://github.com/llvm/llvm-project/issues/99937>`__) Improvements and New Features ----------------------------- -- TODO +- The ``std::ranges::{copy, copy_n, copy_backward}`` algorithms have been optimized for ``std::vector<bool>::iterator``\s, + resulting in a performance improvement of up to 2000x. 
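+
+  An illustrative usage sketch (editorial, not part of this patch) of a copy
+  that can take the new word-wise fast path::
+
+    #include <algorithm>
+    #include <vector>
+
+    int main() {
+      std::vector<bool> src(1 << 20, true);
+      std::vector<bool> dst(src.size());
+      std::ranges::copy(src, dst.begin()); // bit-packed words copied wholesale
+    }
+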
Deprecations and Removals diff --git libcxx/docs/Status/Cxx17Papers.csv libcxx/docs/Status/Cxx17Papers.csv index fbcac452adb8..24fc7f718c36 100644 --- libcxx/docs/Status/Cxx17Papers.csv +++ libcxx/docs/Status/Cxx17Papers.csv @@ -3,7 +3,7 @@ "`N4089 <https://wg21.link/N4089>`__","Safe conversions in ``unique_ptr<T[]>``\ .","2014-11 (Urbana)","|Complete|","5","" "`N4169 <https://wg21.link/N4169>`__","A proposal to add invoke function template","2014-11 (Urbana)","|Complete|","3.7","" "`N4190 <https://wg21.link/N4190>`__","Removing auto_ptr, random_shuffle(), And Old <functional> Stuff.","2014-11 (Urbana)","|Complete|","15","" -"`N4258 <https://wg21.link/N4258>`__","Cleaning-up noexcept in the Library.","2014-11 (Urbana)","|In Progress|","3.7","" +"`N4258 <https://wg21.link/N4258>`__","Cleaning-up noexcept in the Library.","2014-11 (Urbana)","|Complete|","21","" "`N4259 <https://wg21.link/N4259>`__","Wording for std::uncaught_exceptions","2014-11 (Urbana)","|Complete|","3.7","``std::uncaught_exception`` is deprecated since LLVM 20" "`N4277 <https://wg21.link/N4277>`__","TriviallyCopyable ``reference_wrapper``\ .","2014-11 (Urbana)","|Complete|","3.2","" "`N4279 <https://wg21.link/N4279>`__","Improved insertion interface for unique-key maps.","2014-11 (Urbana)","|Complete|","3.7","" diff --git libcxx/include/__algorithm/copy.h libcxx/include/__algorithm/copy.h index 962aa90059d5..7454c874a4d9 100644 --- libcxx/include/__algorithm/copy.h +++ libcxx/include/__algorithm/copy.h @@ -13,8 +13,10 @@ #include <__algorithm/for_each_segment.h> #include <__algorithm/min.h> #include <__config> +#include <__fwd/bit_reference.h> #include <__iterator/iterator_traits.h> #include <__iterator/segmented_iterator.h> +#include <__memory/pointer_traits.h> #include <__type_traits/common_type.h> #include <__type_traits/enable_if.h> #include <__utility/move.h> @@ -29,9 +31,129 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD +template <class _InputIterator, class _OutputIterator> +inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator +copy(_InputIterator __first, _InputIterator __last, _OutputIterator __result); + template <class _InIter, class _Sent, class _OutIter> inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter> __copy(_InIter, _Sent, _OutIter); +template <class _Cp, bool _IsConst> +_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_aligned( + __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { + using _In = __bit_iterator<_Cp, _IsConst>; + using difference_type = typename _In::difference_type; + using __storage_type = typename _In::__storage_type; + + const int __bits_per_word = _In::__bits_per_word; + difference_type __n = __last - __first; + if (__n > 0) { + // do first word + if (__first.__ctz_ != 0) { + unsigned __clz = __bits_per_word - __first.__ctz_; + difference_type __dn = std::min(static_cast<difference_type>(__clz), __n); + __n -= __dn; + __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz - __dn)); + __storage_type __b = *__first.__seg_ & __m; + *__result.__seg_ &= ~__m; + *__result.__seg_ |= __b; + __result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word; + __result.__ctz_ = static_cast<unsigned>((__dn + __result.__ctz_) % __bits_per_word); + ++__first.__seg_; + // __first.__ctz_ = 0; + } + // __first.__ctz_ == 0; + // do middle words + __storage_type __nw = __n / __bits_per_word; + 
std::copy(std::__to_address(__first.__seg_), + std::__to_address(__first.__seg_ + __nw), + std::__to_address(__result.__seg_)); + __n -= __nw * __bits_per_word; + __result.__seg_ += __nw; + // do last word + if (__n > 0) { + __first.__seg_ += __nw; + __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); + __storage_type __b = *__first.__seg_ & __m; + *__result.__seg_ &= ~__m; + *__result.__seg_ |= __b; + __result.__ctz_ = static_cast<unsigned>(__n); + } + } + return __result; +} + +template <class _Cp, bool _IsConst> +_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_unaligned( + __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { + using _In = __bit_iterator<_Cp, _IsConst>; + using difference_type = typename _In::difference_type; + using __storage_type = typename _In::__storage_type; + + const int __bits_per_word = _In::__bits_per_word; + difference_type __n = __last - __first; + if (__n > 0) { + // do first word + if (__first.__ctz_ != 0) { + unsigned __clz_f = __bits_per_word - __first.__ctz_; + difference_type __dn = std::min(static_cast<difference_type>(__clz_f), __n); + __n -= __dn; + __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); + __storage_type __b = *__first.__seg_ & __m; + unsigned __clz_r = __bits_per_word - __result.__ctz_; + __storage_type __ddn = std::min<__storage_type>(__dn, __clz_r); + __m = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __ddn)); + *__result.__seg_ &= ~__m; + if (__result.__ctz_ > __first.__ctz_) + *__result.__seg_ |= __b << (__result.__ctz_ - __first.__ctz_); + else + *__result.__seg_ |= __b >> (__first.__ctz_ - __result.__ctz_); + __result.__seg_ += (__ddn + __result.__ctz_) / __bits_per_word; + __result.__ctz_ = static_cast<unsigned>((__ddn + __result.__ctz_) % __bits_per_word); + __dn -= __ddn; + if (__dn > 0) { + __m = ~__storage_type(0) >> (__bits_per_word - __dn); + *__result.__seg_ &= ~__m; + *__result.__seg_ |= __b >> (__first.__ctz_ + __ddn); + __result.__ctz_ = static_cast<unsigned>(__dn); + } + ++__first.__seg_; + // __first.__ctz_ = 0; + } + // __first.__ctz_ == 0; + // do middle words + unsigned __clz_r = __bits_per_word - __result.__ctz_; + __storage_type __m = ~__storage_type(0) << __result.__ctz_; + for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first.__seg_) { + __storage_type __b = *__first.__seg_; + *__result.__seg_ &= ~__m; + *__result.__seg_ |= __b << __result.__ctz_; + ++__result.__seg_; + *__result.__seg_ &= __m; + *__result.__seg_ |= __b >> __clz_r; + } + // do last word + if (__n > 0) { + __m = ~__storage_type(0) >> (__bits_per_word - __n); + __storage_type __b = *__first.__seg_ & __m; + __storage_type __dn = std::min(__n, static_cast<difference_type>(__clz_r)); + __m = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __dn)); + *__result.__seg_ &= ~__m; + *__result.__seg_ |= __b << __result.__ctz_; + __result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word; + __result.__ctz_ = static_cast<unsigned>((__dn + __result.__ctz_) % __bits_per_word); + __n -= __dn; + if (__n > 0) { + __m = ~__storage_type(0) >> (__bits_per_word - __n); + *__result.__seg_ &= ~__m; + *__result.__seg_ |= __b >> __dn; + __result.__ctz_ = static_cast<unsigned>(__n); + } + } + } + return __result; +} + struct __copy_impl { template <class _InIter, class _Sent, class _OutIter> _LIBCPP_HIDE_FROM_ABI 
_LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter> @@ -95,6 +217,16 @@ struct __copy_impl { } } + template <class _Cp, bool _IsConst> + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<__bit_iterator<_Cp, _IsConst>, __bit_iterator<_Cp, false> > + operator()(__bit_iterator<_Cp, _IsConst> __first, + __bit_iterator<_Cp, _IsConst> __last, + __bit_iterator<_Cp, false> __result) const { + if (__first.__ctz_ == __result.__ctz_) + return std::make_pair(__last, std::__copy_aligned(__first, __last, __result)); + return std::make_pair(__last, std::__copy_unaligned(__first, __last, __result)); + } + // At this point, the iterators have been unwrapped so any `contiguous_iterator` has been unwrapped to a pointer. template <class _In, class _Out, __enable_if_t<__can_lower_copy_assignment_to_memmove<_In, _Out>::value, int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_In*, _Out*> @@ -110,7 +242,7 @@ __copy(_InIter __first, _Sent __last, _OutIter __result) { } template <class _InputIterator, class _OutputIterator> -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator +_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator copy(_InputIterator __first, _InputIterator __last, _OutputIterator __result) { return std::__copy(__first, __last, __result).second; } diff --git libcxx/include/__algorithm/copy_backward.h libcxx/include/__algorithm/copy_backward.h index 48a768f577f5..02ffc14361e6 100644 --- libcxx/include/__algorithm/copy_backward.h +++ libcxx/include/__algorithm/copy_backward.h @@ -10,11 +10,14 @@ #define _LIBCPP___ALGORITHM_COPY_BACKWARD_H #include <__algorithm/copy_move_common.h> +#include <__algorithm/copy_n.h> #include <__algorithm/iterator_operations.h> #include <__algorithm/min.h> #include <__config> +#include <__fwd/bit_reference.h> #include <__iterator/iterator_traits.h> #include <__iterator/segmented_iterator.h> +#include <__memory/pointer_traits.h> #include <__type_traits/common_type.h> #include <__type_traits/enable_if.h> #include <__type_traits/is_constructible.h> @@ -34,6 +37,124 @@ template <class _AlgPolicy, class _InIter, class _Sent, class _OutIter> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_InIter, _OutIter> __copy_backward(_InIter __first, _Sent __last, _OutIter __result); +template <class _Cp, bool _IsConst> +_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_backward_aligned( + __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { + using _In = __bit_iterator<_Cp, _IsConst>; + using difference_type = typename _In::difference_type; + using __storage_type = typename _In::__storage_type; + + const int __bits_per_word = _In::__bits_per_word; + difference_type __n = __last - __first; + if (__n > 0) { + // do first word + if (__last.__ctz_ != 0) { + difference_type __dn = std::min(static_cast<difference_type>(__last.__ctz_), __n); + __n -= __dn; + unsigned __clz = __bits_per_word - __last.__ctz_; + __storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz); + __storage_type __b = *__last.__seg_ & __m; + *__result.__seg_ &= ~__m; + *__result.__seg_ |= __b; + __result.__ctz_ = static_cast<unsigned>(((-__dn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word); + // __last.__ctz_ = 0 + } + // __last.__ctz_ == 0 || __n == 0 + // __result.__ctz_ == 0 || __n == 0 + // do middle words + __storage_type __nw = __n / __bits_per_word; + __result.__seg_ -= __nw; + __last.__seg_ 
-= __nw; + std::copy_n(std::__to_address(__last.__seg_), __nw, std::__to_address(__result.__seg_)); + __n -= __nw * __bits_per_word; + // do last word + if (__n > 0) { + __storage_type __m = ~__storage_type(0) << (__bits_per_word - __n); + __storage_type __b = *--__last.__seg_ & __m; + *--__result.__seg_ &= ~__m; + *__result.__seg_ |= __b; + __result.__ctz_ = static_cast<unsigned>(-__n & (__bits_per_word - 1)); + } + } + return __result; +} + +template <class _Cp, bool _IsConst> +_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_backward_unaligned( + __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { + using _In = __bit_iterator<_Cp, _IsConst>; + using difference_type = typename _In::difference_type; + using __storage_type = typename _In::__storage_type; + + const int __bits_per_word = _In::__bits_per_word; + difference_type __n = __last - __first; + if (__n > 0) { + // do first word + if (__last.__ctz_ != 0) { + difference_type __dn = std::min(static_cast<difference_type>(__last.__ctz_), __n); + __n -= __dn; + unsigned __clz_l = __bits_per_word - __last.__ctz_; + __storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz_l); + __storage_type __b = *__last.__seg_ & __m; + unsigned __clz_r = __bits_per_word - __result.__ctz_; + __storage_type __ddn = std::min(__dn, static_cast<difference_type>(__result.__ctz_)); + if (__ddn > 0) { + __m = (~__storage_type(0) << (__result.__ctz_ - __ddn)) & (~__storage_type(0) >> __clz_r); + *__result.__seg_ &= ~__m; + if (__result.__ctz_ > __last.__ctz_) + *__result.__seg_ |= __b << (__result.__ctz_ - __last.__ctz_); + else + *__result.__seg_ |= __b >> (__last.__ctz_ - __result.__ctz_); + __result.__ctz_ = static_cast<unsigned>(((-__ddn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word); + __dn -= __ddn; + } + if (__dn > 0) { + // __result.__ctz_ == 0 + --__result.__seg_; + __result.__ctz_ = static_cast<unsigned>(-__dn & (__bits_per_word - 1)); + __m = ~__storage_type(0) << __result.__ctz_; + *__result.__seg_ &= ~__m; + __last.__ctz_ -= __dn + __ddn; + *__result.__seg_ |= __b << (__result.__ctz_ - __last.__ctz_); + } + // __last.__ctz_ = 0 + } + // __last.__ctz_ == 0 || __n == 0 + // __result.__ctz_ != 0 || __n == 0 + // do middle words + unsigned __clz_r = __bits_per_word - __result.__ctz_; + __storage_type __m = ~__storage_type(0) >> __clz_r; + for (; __n >= __bits_per_word; __n -= __bits_per_word) { + __storage_type __b = *--__last.__seg_; + *__result.__seg_ &= ~__m; + *__result.__seg_ |= __b >> __clz_r; + *--__result.__seg_ &= __m; + *__result.__seg_ |= __b << __result.__ctz_; + } + // do last word + if (__n > 0) { + __m = ~__storage_type(0) << (__bits_per_word - __n); + __storage_type __b = *--__last.__seg_ & __m; + __clz_r = __bits_per_word - __result.__ctz_; + __storage_type __dn = std::min(__n, static_cast<difference_type>(__result.__ctz_)); + __m = (~__storage_type(0) << (__result.__ctz_ - __dn)) & (~__storage_type(0) >> __clz_r); + *__result.__seg_ &= ~__m; + *__result.__seg_ |= __b >> (__bits_per_word - __result.__ctz_); + __result.__ctz_ = static_cast<unsigned>(((-__dn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word); + __n -= __dn; + if (__n > 0) { + // __result.__ctz_ == 0 + --__result.__seg_; + __result.__ctz_ = static_cast<unsigned>(-__n & (__bits_per_word - 1)); + __m = ~__storage_type(0) << __result.__ctz_; + *__result.__seg_ &= ~__m; + *__result.__seg_ |= __b << 
(__result.__ctz_ - (__bits_per_word - __n - __dn)); + } + } + } + return __result; +} + template <class _AlgPolicy> struct __copy_backward_impl { template <class _InIter, class _Sent, class _OutIter> @@ -107,6 +228,16 @@ struct __copy_backward_impl { } } + template <class _Cp, bool _IsConst> + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<__bit_iterator<_Cp, _IsConst>, __bit_iterator<_Cp, false> > + operator()(__bit_iterator<_Cp, _IsConst> __first, + __bit_iterator<_Cp, _IsConst> __last, + __bit_iterator<_Cp, false> __result) { + if (__last.__ctz_ == __result.__ctz_) + return std::make_pair(__last, std::__copy_backward_aligned(__first, __last, __result)); + return std::make_pair(__last, std::__copy_backward_unaligned(__first, __last, __result)); + } + // At this point, the iterators have been unwrapped so any `contiguous_iterator` has been unwrapped to a pointer. template <class _In, class _Out, __enable_if_t<__can_lower_copy_assignment_to_memmove<_In, _Out>::value, int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_In*, _Out*> diff --git libcxx/include/__bit_reference libcxx/include/__bit_reference index 67abb023122e..bb8d4725c398 100644 --- libcxx/include/__bit_reference +++ libcxx/include/__bit_reference @@ -10,6 +10,8 @@ #ifndef _LIBCPP___BIT_REFERENCE #define _LIBCPP___BIT_REFERENCE +#include <__algorithm/copy.h> +#include <__algorithm/copy_backward.h> #include <__algorithm/copy_n.h> #include <__algorithm/min.h> #include <__bit/countr.h> @@ -24,6 +26,7 @@ #include <__type_traits/conditional.h> #include <__type_traits/is_constant_evaluated.h> #include <__type_traits/void_t.h> +#include <__utility/pair.h> #include <__utility/swap.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -183,258 +186,6 @@ private: __mask_(__m) {} }; -// copy - -template <class _Cp, bool _IsConst> -_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_aligned( - __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { - using _In = __bit_iterator<_Cp, _IsConst>; - using difference_type = typename _In::difference_type; - using __storage_type = typename _In::__storage_type; - - const int __bits_per_word = _In::__bits_per_word; - difference_type __n = __last - __first; - if (__n > 0) { - // do first word - if (__first.__ctz_ != 0) { - unsigned __clz = __bits_per_word - __first.__ctz_; - difference_type __dn = std::min(static_cast<difference_type>(__clz), __n); - __n -= __dn; - __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz - __dn)); - __storage_type __b = *__first.__seg_ & __m; - *__result.__seg_ &= ~__m; - *__result.__seg_ |= __b; - __result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word; - __result.__ctz_ = static_cast<unsigned>((__dn + __result.__ctz_) % __bits_per_word); - ++__first.__seg_; - // __first.__ctz_ = 0; - } - // __first.__ctz_ == 0; - // do middle words - __storage_type __nw = __n / __bits_per_word; - std::copy_n(std::__to_address(__first.__seg_), __nw, std::__to_address(__result.__seg_)); - __n -= __nw * __bits_per_word; - __result.__seg_ += __nw; - // do last word - if (__n > 0) { - __first.__seg_ += __nw; - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); - __storage_type __b = *__first.__seg_ & __m; - *__result.__seg_ &= ~__m; - *__result.__seg_ |= __b; - __result.__ctz_ = static_cast<unsigned>(__n); - } - } - return __result; -} - -template <class _Cp, bool _IsConst> -_LIBCPP_CONSTEXPR_SINCE_CXX20 
_LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_unaligned( - __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { - using _In = __bit_iterator<_Cp, _IsConst>; - using difference_type = typename _In::difference_type; - using __storage_type = typename _In::__storage_type; - - const int __bits_per_word = _In::__bits_per_word; - difference_type __n = __last - __first; - if (__n > 0) { - // do first word - if (__first.__ctz_ != 0) { - unsigned __clz_f = __bits_per_word - __first.__ctz_; - difference_type __dn = std::min(static_cast<difference_type>(__clz_f), __n); - __n -= __dn; - __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); - __storage_type __b = *__first.__seg_ & __m; - unsigned __clz_r = __bits_per_word - __result.__ctz_; - __storage_type __ddn = std::min<__storage_type>(__dn, __clz_r); - __m = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __ddn)); - *__result.__seg_ &= ~__m; - if (__result.__ctz_ > __first.__ctz_) - *__result.__seg_ |= __b << (__result.__ctz_ - __first.__ctz_); - else - *__result.__seg_ |= __b >> (__first.__ctz_ - __result.__ctz_); - __result.__seg_ += (__ddn + __result.__ctz_) / __bits_per_word; - __result.__ctz_ = static_cast<unsigned>((__ddn + __result.__ctz_) % __bits_per_word); - __dn -= __ddn; - if (__dn > 0) { - __m = ~__storage_type(0) >> (__bits_per_word - __dn); - *__result.__seg_ &= ~__m; - *__result.__seg_ |= __b >> (__first.__ctz_ + __ddn); - __result.__ctz_ = static_cast<unsigned>(__dn); - } - ++__first.__seg_; - // __first.__ctz_ = 0; - } - // __first.__ctz_ == 0; - // do middle words - unsigned __clz_r = __bits_per_word - __result.__ctz_; - __storage_type __m = ~__storage_type(0) << __result.__ctz_; - for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first.__seg_) { - __storage_type __b = *__first.__seg_; - *__result.__seg_ &= ~__m; - *__result.__seg_ |= __b << __result.__ctz_; - ++__result.__seg_; - *__result.__seg_ &= __m; - *__result.__seg_ |= __b >> __clz_r; - } - // do last word - if (__n > 0) { - __m = ~__storage_type(0) >> (__bits_per_word - __n); - __storage_type __b = *__first.__seg_ & __m; - __storage_type __dn = std::min(__n, static_cast<difference_type>(__clz_r)); - __m = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __dn)); - *__result.__seg_ &= ~__m; - *__result.__seg_ |= __b << __result.__ctz_; - __result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word; - __result.__ctz_ = static_cast<unsigned>((__dn + __result.__ctz_) % __bits_per_word); - __n -= __dn; - if (__n > 0) { - __m = ~__storage_type(0) >> (__bits_per_word - __n); - *__result.__seg_ &= ~__m; - *__result.__seg_ |= __b >> __dn; - __result.__ctz_ = static_cast<unsigned>(__n); - } - } - } - return __result; -} - -template <class _Cp, bool _IsConst> -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_iterator<_Cp, false> -copy(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { - if (__first.__ctz_ == __result.__ctz_) - return std::__copy_aligned(__first, __last, __result); - return std::__copy_unaligned(__first, __last, __result); -} - -// copy_backward - -template <class _Cp, bool _IsConst> -_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_backward_aligned( - __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { 
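// Editorial note, not part of the patch: a worked example of the first-word
// masking in __copy_backward_aligned below (the same logic the hunks above
// relocate into __algorithm/copy_backward.h). Assume 64-bit storage words,
// __last.__ctz_ == 5, and __n >= 5. Then __dn == 5, __clz == 59, and
//   __m = (~0 << (__last.__ctz_ - __dn)) & (~0 >> __clz)
//       = (~0 << 0) & (~0 >> 59)
//       = 0x1f,
// so __m selects exactly the five low-order bits still to be copied; they
// are merged into the destination word without disturbing its other bits.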
- using _In = __bit_iterator<_Cp, _IsConst>; - using difference_type = typename _In::difference_type; - using __storage_type = typename _In::__storage_type; - - const int __bits_per_word = _In::__bits_per_word; - difference_type __n = __last - __first; - if (__n > 0) { - // do first word - if (__last.__ctz_ != 0) { - difference_type __dn = std::min(static_cast<difference_type>(__last.__ctz_), __n); - __n -= __dn; - unsigned __clz = __bits_per_word - __last.__ctz_; - __storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz); - __storage_type __b = *__last.__seg_ & __m; - *__result.__seg_ &= ~__m; - *__result.__seg_ |= __b; - __result.__ctz_ = static_cast<unsigned>(((-__dn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word); - // __last.__ctz_ = 0 - } - // __last.__ctz_ == 0 || __n == 0 - // __result.__ctz_ == 0 || __n == 0 - // do middle words - __storage_type __nw = __n / __bits_per_word; - __result.__seg_ -= __nw; - __last.__seg_ -= __nw; - std::copy_n(std::__to_address(__last.__seg_), __nw, std::__to_address(__result.__seg_)); - __n -= __nw * __bits_per_word; - // do last word - if (__n > 0) { - __storage_type __m = ~__storage_type(0) << (__bits_per_word - __n); - __storage_type __b = *--__last.__seg_ & __m; - *--__result.__seg_ &= ~__m; - *__result.__seg_ |= __b; - __result.__ctz_ = static_cast<unsigned>(-__n & (__bits_per_word - 1)); - } - } - return __result; -} - -template <class _Cp, bool _IsConst> -_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_backward_unaligned( - __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { - using _In = __bit_iterator<_Cp, _IsConst>; - using difference_type = typename _In::difference_type; - using __storage_type = typename _In::__storage_type; - - const int __bits_per_word = _In::__bits_per_word; - difference_type __n = __last - __first; - if (__n > 0) { - // do first word - if (__last.__ctz_ != 0) { - difference_type __dn = std::min(static_cast<difference_type>(__last.__ctz_), __n); - __n -= __dn; - unsigned __clz_l = __bits_per_word - __last.__ctz_; - __storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz_l); - __storage_type __b = *__last.__seg_ & __m; - unsigned __clz_r = __bits_per_word - __result.__ctz_; - __storage_type __ddn = std::min(__dn, static_cast<difference_type>(__result.__ctz_)); - if (__ddn > 0) { - __m = (~__storage_type(0) << (__result.__ctz_ - __ddn)) & (~__storage_type(0) >> __clz_r); - *__result.__seg_ &= ~__m; - if (__result.__ctz_ > __last.__ctz_) - *__result.__seg_ |= __b << (__result.__ctz_ - __last.__ctz_); - else - *__result.__seg_ |= __b >> (__last.__ctz_ - __result.__ctz_); - __result.__ctz_ = static_cast<unsigned>(((-__ddn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word); - __dn -= __ddn; - } - if (__dn > 0) { - // __result.__ctz_ == 0 - --__result.__seg_; - __result.__ctz_ = static_cast<unsigned>(-__dn & (__bits_per_word - 1)); - __m = ~__storage_type(0) << __result.__ctz_; - *__result.__seg_ &= ~__m; - __last.__ctz_ -= __dn + __ddn; - *__result.__seg_ |= __b << (__result.__ctz_ - __last.__ctz_); - } - // __last.__ctz_ = 0 - } - // __last.__ctz_ == 0 || __n == 0 - // __result.__ctz_ != 0 || __n == 0 - // do middle words - unsigned __clz_r = __bits_per_word - __result.__ctz_; - __storage_type __m = ~__storage_type(0) >> __clz_r; - for (; __n >= __bits_per_word; __n -= __bits_per_word) { - __storage_type __b = 
*--__last.__seg_; - *__result.__seg_ &= ~__m; - *__result.__seg_ |= __b >> __clz_r; - *--__result.__seg_ &= __m; - *__result.__seg_ |= __b << __result.__ctz_; - } - // do last word - if (__n > 0) { - __m = ~__storage_type(0) << (__bits_per_word - __n); - __storage_type __b = *--__last.__seg_ & __m; - __clz_r = __bits_per_word - __result.__ctz_; - __storage_type __dn = std::min(__n, static_cast<difference_type>(__result.__ctz_)); - __m = (~__storage_type(0) << (__result.__ctz_ - __dn)) & (~__storage_type(0) >> __clz_r); - *__result.__seg_ &= ~__m; - *__result.__seg_ |= __b >> (__bits_per_word - __result.__ctz_); - __result.__ctz_ = static_cast<unsigned>(((-__dn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word); - __n -= __dn; - if (__n > 0) { - // __result.__ctz_ == 0 - --__result.__seg_; - __result.__ctz_ = static_cast<unsigned>(-__n & (__bits_per_word - 1)); - __m = ~__storage_type(0) << __result.__ctz_; - *__result.__seg_ &= ~__m; - *__result.__seg_ |= __b << (__result.__ctz_ - (__bits_per_word - __n - __dn)); - } - } - } - return __result; -} - -template <class _Cp, bool _IsConst> -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_iterator<_Cp, false> copy_backward( - __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { - if (__last.__ctz_ == __result.__ctz_) - return std::__copy_backward_aligned(__first, __last, __result); - return std::__copy_backward_unaligned(__first, __last, __result); -} - // move template <class _Cp, bool _IsConst> @@ -989,17 +740,17 @@ private: _LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Dp, false> __copy_unaligned( __bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result); template <class _Dp, bool _IC> - _LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Dp, false> - copy(__bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result); + _LIBCPP_CONSTEXPR_SINCE_CXX20 friend pair<__bit_iterator<_Dp, _IC>, __bit_iterator<_Dp, false> > + __copy_impl::operator()( + __bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result) const; template <class _Dp, bool _IC> _LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Dp, false> __copy_backward_aligned( __bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result); template <class _Dp, bool _IC> _LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Dp, false> __copy_backward_unaligned( __bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result); - template <class _Dp, bool _IC> - _LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Dp, false> - copy_backward(__bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result); + template <class _AlgPolicy> + friend struct __copy_backward_impl; template <class _Cl, class _Cr> friend __bit_iterator<_Cr, false> __swap_ranges_aligned(__bit_iterator<_Cl, false>, __bit_iterator<_Cl, false>, __bit_iterator<_Cr, false>); diff --git libcxx/include/__hash_table libcxx/include/__hash_table index 9a82ec51daee..d7b312f8774f 100644 --- libcxx/include/__hash_table +++ libcxx/include/__hash_table @@ -770,9 +770,10 @@ public: _LIBCPP_HIDE_FROM_ABI __hash_table& operator=(const __hash_table& __u); _LIBCPP_HIDE_FROM_ABI __hash_table& operator=(__hash_table&& __u) - _NOEXCEPT_(__node_traits::propagate_on_container_move_assignment::value&& - 
is_nothrow_move_assignable<__node_allocator>::value&& is_nothrow_move_assignable<hasher>::value&& - is_nothrow_move_assignable<key_equal>::value); + _NOEXCEPT_(is_nothrow_move_assignable<hasher>::value&& is_nothrow_move_assignable<key_equal>::value && + ((__node_traits::propagate_on_container_move_assignment::value && + is_nothrow_move_assignable<__node_allocator>::value) || + allocator_traits<__node_allocator>::is_always_equal::value)); template <class _InputIterator> _LIBCPP_HIDE_FROM_ABI void __assign_unique(_InputIterator __first, _InputIterator __last); template <class _InputIterator> @@ -1238,10 +1239,11 @@ void __hash_table<_Tp, _Hash, _Equal, _Alloc>::__move_assign(__hash_table& __u, } template <class _Tp, class _Hash, class _Equal, class _Alloc> -inline __hash_table<_Tp, _Hash, _Equal, _Alloc>& -__hash_table<_Tp, _Hash, _Equal, _Alloc>::operator=(__hash_table&& __u) _NOEXCEPT_( - __node_traits::propagate_on_container_move_assignment::value&& is_nothrow_move_assignable<__node_allocator>::value&& - is_nothrow_move_assignable<hasher>::value&& is_nothrow_move_assignable<key_equal>::value) { +inline __hash_table<_Tp, _Hash, _Equal, _Alloc>& __hash_table<_Tp, _Hash, _Equal, _Alloc>::operator=(__hash_table&& __u) + _NOEXCEPT_(is_nothrow_move_assignable<hasher>::value&& is_nothrow_move_assignable<key_equal>::value && + ((__node_traits::propagate_on_container_move_assignment::value && + is_nothrow_move_assignable<__node_allocator>::value) || + allocator_traits<__node_allocator>::is_always_equal::value)) { __move_assign(__u, integral_constant<bool, __node_traits::propagate_on_container_move_assignment::value>()); return *this; } diff --git libcxx/include/__tree libcxx/include/__tree index acad6c33f878..c627641d5d86 100644 --- libcxx/include/__tree +++ libcxx/include/__tree @@ -987,9 +987,12 @@ public: _LIBCPP_HIDE_FROM_ABI __tree(__tree&& __t) _NOEXCEPT_( is_nothrow_move_constructible<__node_allocator>::value&& is_nothrow_move_constructible<value_compare>::value); _LIBCPP_HIDE_FROM_ABI __tree(__tree&& __t, const allocator_type& __a); - _LIBCPP_HIDE_FROM_ABI __tree& operator=(__tree&& __t) _NOEXCEPT_( - __node_traits::propagate_on_container_move_assignment::value&& is_nothrow_move_assignable<value_compare>::value&& - is_nothrow_move_assignable<__node_allocator>::value); + _LIBCPP_HIDE_FROM_ABI __tree& operator=(__tree&& __t) + _NOEXCEPT_(is_nothrow_move_assignable<value_compare>::value && + ((__node_traits::propagate_on_container_move_assignment::value && + is_nothrow_move_assignable<__node_allocator>::value) || + allocator_traits<__node_allocator>::is_always_equal::value)); + _LIBCPP_HIDE_FROM_ABI ~__tree(); _LIBCPP_HIDE_FROM_ABI iterator begin() _NOEXCEPT { return iterator(__begin_node()); } @@ -1520,11 +1523,11 @@ void __tree<_Tp, _Compare, _Allocator>::__move_assign(__tree& __t, false_type) { } template <class _Tp, class _Compare, class _Allocator> -__tree<_Tp, _Compare, _Allocator>& __tree<_Tp, _Compare, _Allocator>::operator=(__tree&& __t) _NOEXCEPT_( - __node_traits::propagate_on_container_move_assignment::value&& is_nothrow_move_assignable<value_compare>::value&& - is_nothrow_move_assignable<__node_allocator>::value) - -{ +__tree<_Tp, _Compare, _Allocator>& __tree<_Tp, _Compare, _Allocator>::operator=(__tree&& __t) + _NOEXCEPT_(is_nothrow_move_assignable<value_compare>::value && + ((__node_traits::propagate_on_container_move_assignment::value && + is_nothrow_move_assignable<__node_allocator>::value) || + allocator_traits<__node_allocator>::is_always_equal::value)) { 
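// Editorial note, not part of the patch: the relaxed _NOEXCEPT_ condition
// above (with matching changes to __hash_table above and to deque,
// forward_list, and list below) implements N4258. Move assignment cannot
// throw either when the allocator propagates and is itself
// nothrow-move-assignable, or when allocator_traits<...>::is_always_equal
// holds, because equal allocators let the container adopt the source's
// nodes directly instead of performing element-wise moves that could throw.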
__move_assign(__t, integral_constant<bool, __node_traits::propagate_on_container_move_assignment::value>()); return *this; } diff --git libcxx/include/__vector/vector_bool.h libcxx/include/__vector/vector_bool.h index 8d9257eddfcd..25e5c42f1dd6 100644 --- libcxx/include/__vector/vector_bool.h +++ libcxx/include/__vector/vector_bool.h @@ -10,6 +10,7 @@ #define _LIBCPP___VECTOR_VECTOR_BOOL_H #include <__algorithm/copy.h> +#include <__algorithm/copy_backward.h> #include <__algorithm/fill_n.h> #include <__algorithm/iterator_operations.h> #include <__algorithm/max.h> diff --git libcxx/include/bitset libcxx/include/bitset index 10576eb80bf2..a20842985b3d 100644 --- libcxx/include/bitset +++ libcxx/include/bitset @@ -129,6 +129,8 @@ template <size_t N> struct hash<std::bitset<N>>; #if __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS) # include <__cxx03/bitset> #else +# include <__algorithm/copy.h> +# include <__algorithm/copy_backward.h> # include <__algorithm/count.h> # include <__algorithm/fill.h> # include <__algorithm/fill_n.h> diff --git libcxx/include/deque libcxx/include/deque index df3094cff7f8..95200b4801d7 100644 --- libcxx/include/deque +++ libcxx/include/deque @@ -59,9 +59,9 @@ public: deque& operator=(const deque& c); deque& operator=(deque&& c) - noexcept( - allocator_type::propagate_on_container_move_assignment::value && - is_nothrow_move_assignable<allocator_type>::value); + noexcept((__alloc_traits::propagate_on_container_move_assignment::value && + is_nothrow_move_assignable<allocator_type>::value) || + allocator_traits<allocator_type>::is_always_equal::value); deque& operator=(initializer_list<value_type> il); template <class InputIterator> @@ -674,9 +674,10 @@ public: _LIBCPP_HIDE_FROM_ABI deque(deque&& __c) noexcept(is_nothrow_move_constructible<allocator_type>::value); _LIBCPP_HIDE_FROM_ABI deque(deque&& __c, const __type_identity_t<allocator_type>& __a); - _LIBCPP_HIDE_FROM_ABI deque& - operator=(deque&& __c) noexcept(__alloc_traits::propagate_on_container_move_assignment::value && - is_nothrow_move_assignable<allocator_type>::value); + _LIBCPP_HIDE_FROM_ABI deque& operator=(deque&& __c) noexcept( + (__alloc_traits::propagate_on_container_move_assignment::value && + is_nothrow_move_assignable<allocator_type>::value) || + allocator_traits<allocator_type>::is_always_equal::value); _LIBCPP_HIDE_FROM_ABI void assign(initializer_list<value_type> __il) { assign(__il.begin(), __il.end()); } # endif // _LIBCPP_CXX03_LANG @@ -1379,8 +1380,9 @@ inline deque<_Tp, _Allocator>::deque(deque&& __c, const __type_identity_t<alloca template <class _Tp, class _Allocator> inline deque<_Tp, _Allocator>& deque<_Tp, _Allocator>::operator=(deque&& __c) noexcept( - __alloc_traits::propagate_on_container_move_assignment::value && - is_nothrow_move_assignable<allocator_type>::value) { + (__alloc_traits::propagate_on_container_move_assignment::value && + is_nothrow_move_assignable<allocator_type>::value) || + allocator_traits<allocator_type>::is_always_equal::value) { __move_assign(__c, integral_constant<bool, __alloc_traits::propagate_on_container_move_assignment::value>()); return *this; } diff --git libcxx/include/forward_list libcxx/include/forward_list index f3b9617ab2e0..4b6ca8ea8587 100644 --- libcxx/include/forward_list +++ libcxx/include/forward_list @@ -58,9 +58,9 @@ public: forward_list& operator=(const forward_list& x); forward_list& operator=(forward_list&& x) - noexcept( - allocator_type::propagate_on_container_move_assignment::value && - 
is_nothrow_move_assignable<allocator_type>::value); + noexcept((__node_traits::propagate_on_container_move_assignment::value && + is_nothrow_move_assignable<allocator_type>::value) || + allocator_traits<allocator_type>::is_always_equal::value); forward_list& operator=(initializer_list<value_type> il); template <class InputIterator> @@ -717,8 +717,9 @@ public: _LIBCPP_HIDE_FROM_ABI forward_list(initializer_list<value_type> __il, const allocator_type& __a); _LIBCPP_HIDE_FROM_ABI forward_list& operator=(forward_list&& __x) noexcept( - __node_traits::propagate_on_container_move_assignment::value && - is_nothrow_move_assignable<allocator_type>::value); + (__node_traits::propagate_on_container_move_assignment::value && + is_nothrow_move_assignable<allocator_type>::value) || + allocator_traits<allocator_type>::is_always_equal::value); _LIBCPP_HIDE_FROM_ABI forward_list& operator=(initializer_list<value_type> __il); @@ -1009,8 +1010,10 @@ void forward_list<_Tp, _Alloc>::__move_assign(forward_list& __x, false_type) { } template <class _Tp, class _Alloc> -inline forward_list<_Tp, _Alloc>& forward_list<_Tp, _Alloc>::operator=(forward_list&& __x) _NOEXCEPT_( - __node_traits::propagate_on_container_move_assignment::value&& is_nothrow_move_assignable<allocator_type>::value) { +inline forward_list<_Tp, _Alloc>& forward_list<_Tp, _Alloc>::operator=(forward_list&& __x) noexcept( + (__node_traits::propagate_on_container_move_assignment::value && + is_nothrow_move_assignable<allocator_type>::value) || + allocator_traits<allocator_type>::is_always_equal::value) { __move_assign(__x, integral_constant<bool, __node_traits::propagate_on_container_move_assignment::value>()); return *this; } diff --git libcxx/include/list libcxx/include/list index 5e2fd40d6ee9..3fcf796ebc03 100644 --- libcxx/include/list +++ libcxx/include/list @@ -60,9 +60,9 @@ public: list& operator=(const list& x); list& operator=(list&& x) - noexcept( - allocator_type::propagate_on_container_move_assignment::value && - is_nothrow_move_assignable<allocator_type>::value); + noexcept((__node_alloc_traits::propagate_on_container_move_assignment::value && + is_nothrow_move_assignable<__node_allocator>::value) || + allocator_traits<allocator_type>::is_always_equal::value); list& operator=(initializer_list<value_type>); template <class Iter> void assign(Iter first, Iter last); @@ -728,9 +728,10 @@ public: _LIBCPP_HIDE_FROM_ABI list(list&& __c) _NOEXCEPT_(is_nothrow_move_constructible<__node_allocator>::value); _LIBCPP_HIDE_FROM_ABI list(list&& __c, const __type_identity_t<allocator_type>& __a); - _LIBCPP_HIDE_FROM_ABI list& operator=(list&& __c) - _NOEXCEPT_(__node_alloc_traits::propagate_on_container_move_assignment::value&& - is_nothrow_move_assignable<__node_allocator>::value); + _LIBCPP_HIDE_FROM_ABI list& operator=(list&& __c) noexcept( + (__node_alloc_traits::propagate_on_container_move_assignment::value && + is_nothrow_move_assignable<__node_allocator>::value) || + allocator_traits<allocator_type>::is_always_equal::value); _LIBCPP_HIDE_FROM_ABI list& operator=(initializer_list<value_type> __il) { assign(__il.begin(), __il.end()); @@ -1067,8 +1068,9 @@ inline list<_Tp, _Alloc>::list(list&& __c, const __type_identity_t<allocator_typ template <class _Tp, class _Alloc> inline list<_Tp, _Alloc>& list<_Tp, _Alloc>::operator=(list&& __c) noexcept( - __node_alloc_traits::propagate_on_container_move_assignment::value && - is_nothrow_move_assignable<__node_allocator>::value) { + (__node_alloc_traits::propagate_on_container_move_assignment::value && + 
is_nothrow_move_assignable<__node_allocator>::value) || + allocator_traits<allocator_type>::is_always_equal::value) { __move_assign(__c, integral_constant<bool, __node_alloc_traits::propagate_on_container_move_assignment::value>()); return *this; } diff --git libcxx/test/benchmarks/GenerateInput.h libcxx/test/benchmarks/GenerateInput.h index 6d5c5167e91e..081631a32b21 100644 --- libcxx/test/benchmarks/GenerateInput.h +++ libcxx/test/benchmarks/GenerateInput.h @@ -11,6 +11,7 @@ #include <algorithm> #include <climits> +#include <concepts> #include <cstddef> #include <random> #include <string> @@ -171,4 +172,31 @@ inline std::vector<const char*> getRandomCStringInputs(std::size_t N) { return cinputs; } +template <class T> +struct Generate { + // When the contents don't matter + static T arbitrary(); + + // Prefer a cheap-to-construct element if possible + static T cheap(); + + // Prefer an expensive-to-construct element if possible + static T expensive(); +}; + +template <class T> + requires std::integral<T> +struct Generate<T> { + static T arbitrary() { return 42; } + static T cheap() { return 42; } + static T expensive() { return 42; } +}; + +template <> +struct Generate<std::string> { + static std::string arbitrary() { return "hello world"; } + static std::string cheap() { return "small"; } + static std::string expensive() { return std::string(256, 'x'); } +}; + #endif // BENCHMARK_GENERATE_INPUT_H diff --git libcxx/test/benchmarks/Utilities.h libcxx/test/benchmarks/Utilities.h deleted file mode 100644 index fed16ba51f99..000000000000 --- libcxx/test/benchmarks/Utilities.h +++ /dev/null @@ -1,37 +0,0 @@ -// -*- C++ -*- -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef BENCHMARK_UTILITIES_H -#define BENCHMARK_UTILITIES_H - -#include <cassert> -#include <type_traits> - -#include "benchmark/benchmark.h" - -namespace UtilitiesInternal { -template <class Container> -auto HaveDataImpl(int) -> decltype((std::declval<Container&>().data(), std::true_type{})); -template <class Container> -auto HaveDataImpl(long) -> std::false_type; -template <class T> -using HasData = decltype(HaveDataImpl<T>(0)); -} // namespace UtilitiesInternal - -template <class Container, std::enable_if_t<UtilitiesInternal::HasData<Container>::value>* = nullptr> -void DoNotOptimizeData(Container& c) { - benchmark::DoNotOptimize(c.data()); -} - -template <class Container, std::enable_if_t<!UtilitiesInternal::HasData<Container>::value>* = nullptr> -void DoNotOptimizeData(Container& c) { - benchmark::DoNotOptimize(&c); -} - -#endif // BENCHMARK_UTILITIES_H diff --git libcxx/test/benchmarks/algorithms/algorithms.partition_point.bench.cpp libcxx/test/benchmarks/algorithms/algorithms.partition_point.bench.cpp index 0777acbafb5c..e0bd7e36f78a 100644 --- libcxx/test/benchmarks/algorithms/algorithms.partition_point.bench.cpp +++ libcxx/test/benchmarks/algorithms/algorithms.partition_point.bench.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// UNSUPPORTED: c++03, c++11, c++14 +// UNSUPPORTED: c++03, c++11, c++14, c++17 #include <algorithm> #include <array> diff --git libcxx/test/benchmarks/algorithms/copy.bench.cpp libcxx/test/benchmarks/algorithms/copy.bench.cpp new file mode 100644 index 000000000000..b6f0f15eb770 --- /dev/null +++ libcxx/test/benchmarks/algorithms/copy.bench.cpp @@ -0,0 +1,89 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +#include <algorithm> +#include <benchmark/benchmark.h> +#include <vector> + +static void bm_ranges_copy_vb(benchmark::State& state, bool aligned) { + auto n = state.range(); + std::vector<bool> in(n, true); + std::vector<bool> out(aligned ? n : n + 8); + benchmark::DoNotOptimize(&in); + auto dst = aligned ? out.begin() : out.begin() + 4; + for (auto _ : state) { + benchmark::DoNotOptimize(std::ranges::copy(in, dst)); + benchmark::DoNotOptimize(&out); + } +} + +static void bm_ranges_copy_n_vb(benchmark::State& state, bool aligned) { + auto n = state.range(); + std::vector<bool> in(n, true); + std::vector<bool> out(aligned ? n : n + 8); + benchmark::DoNotOptimize(&in); + auto src = in.begin(); + auto dst = aligned ? out.begin() : out.begin() + 4; + for (auto _ : state) { + benchmark::DoNotOptimize(std::ranges::copy_n(src, n, dst)); + benchmark::DoNotOptimize(&out); + } +} + +static void bm_copy_vb(benchmark::State& state, bool aligned) { + auto n = state.range(); + std::vector<bool> in(n, true); + std::vector<bool> out(aligned ? n : n + 8); + benchmark::DoNotOptimize(&in); + auto beg = in.begin(); + auto end = in.end(); + auto dst = aligned ? 
out.begin() : out.begin() + 4; + for (auto _ : state) { + benchmark::DoNotOptimize(std::copy(beg, end, dst)); + benchmark::DoNotOptimize(&out); + } +} + +static void bm_copy_n_vb(benchmark::State& state, bool aligned) { + auto n = state.range(); + std::vector<bool> in(n, true); + std::vector<bool> out(aligned ? n : n + 8); + benchmark::DoNotOptimize(&in); + auto src = in.begin(); + auto dst = aligned ? out.begin() : out.begin() + 4; + for (auto _ : state) { + benchmark::DoNotOptimize(std::copy_n(src, n, dst)); + benchmark::DoNotOptimize(&out); + } +} + +static void bm_ranges_copy_vb_aligned(benchmark::State& state) { bm_ranges_copy_vb(state, true); } +static void bm_ranges_copy_vb_unaligned(benchmark::State& state) { bm_ranges_copy_vb(state, false); } +static void bm_ranges_copy_n_vb_aligned(benchmark::State& state) { bm_ranges_copy_n_vb(state, true); } +static void bm_ranges_copy_n_vb_unaligned(benchmark::State& state) { bm_ranges_copy_n_vb(state, false); } + +static void bm_copy_vb_aligned(benchmark::State& state) { bm_copy_vb(state, true); } +static void bm_copy_vb_unaligned(benchmark::State& state) { bm_copy_vb(state, false); } +static void bm_copy_n_vb_aligned(benchmark::State& state) { bm_copy_n_vb(state, true); } +static void bm_copy_n_vb_unaligned(benchmark::State& state) { bm_copy_n_vb(state, false); } + +// Test std::ranges::copy for vector<bool>::iterator +BENCHMARK(bm_ranges_copy_vb_aligned)->Range(8, 1 << 16)->DenseRange(102400, 204800, 4096); +BENCHMARK(bm_ranges_copy_n_vb_aligned)->Range(8, 1 << 20); +BENCHMARK(bm_ranges_copy_vb_unaligned)->Range(8, 1 << 20); +BENCHMARK(bm_ranges_copy_n_vb_unaligned)->Range(8, 1 << 20); + +// Test std::copy for vector<bool>::iterator +BENCHMARK(bm_copy_vb_aligned)->Range(8, 1 << 20); +BENCHMARK(bm_copy_n_vb_aligned)->Range(8, 1 << 20); +BENCHMARK(bm_copy_vb_unaligned)->Range(8, 1 << 20); +BENCHMARK(bm_copy_n_vb_unaligned)->Range(8, 1 << 20); + +BENCHMARK_MAIN(); diff --git libcxx/test/benchmarks/algorithms/copy_backward.bench.cpp libcxx/test/benchmarks/algorithms/copy_backward.bench.cpp new file mode 100644 index 000000000000..c943d9a874b4 --- /dev/null +++ libcxx/test/benchmarks/algorithms/copy_backward.bench.cpp @@ -0,0 +1,55 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +#include <algorithm> +#include <benchmark/benchmark.h> +#include <vector> + +static void bm_ranges_copy_backward_vb(benchmark::State& state, bool aligned) { + auto n = state.range(); + std::vector<bool> in(n, true); + std::vector<bool> out(aligned ? n : n + 8); + benchmark::DoNotOptimize(&in); + auto dst = aligned ? out.end() : out.end() - 4; + for (auto _ : state) { + benchmark::DoNotOptimize(std::ranges::copy_backward(in, dst)); + benchmark::DoNotOptimize(&out); + } +} + +static void bm_copy_backward_vb(benchmark::State& state, bool aligned) { + auto n = state.range(); + std::vector<bool> in(n, true); + std::vector<bool> out(aligned ? n : n + 8); + benchmark::DoNotOptimize(&in); + auto beg = in.begin(); + auto end = in.end(); + auto dst = aligned ? 
out.end() : out.end() - 4; + for (auto _ : state) { + benchmark::DoNotOptimize(std::copy_backward(beg, end, dst)); + benchmark::DoNotOptimize(&out); + } +} + +static void bm_ranges_copy_backward_vb_aligned(benchmark::State& state) { bm_ranges_copy_backward_vb(state, true); } +static void bm_ranges_copy_backward_vb_unaligned(benchmark::State& state) { bm_ranges_copy_backward_vb(state, false); } + +static void bm_copy_backward_vb_aligned(benchmark::State& state) { bm_copy_backward_vb(state, true); } +static void bm_copy_backward_vb_unaligned(benchmark::State& state) { bm_copy_backward_vb(state, false); } + +// Test std::ranges::copy_backward for vector<bool>::iterator +BENCHMARK(bm_ranges_copy_backward_vb_aligned)->Range(8, 1 << 16)->DenseRange(102400, 204800, 4096); +BENCHMARK(bm_ranges_copy_backward_vb_unaligned)->Range(8, 1 << 20); + +// Test std::copy_backward for vector<bool>::iterator +BENCHMARK(bm_copy_backward_vb_aligned)->Range(8, 1 << 20); +BENCHMARK(bm_copy_backward_vb_unaligned)->Range(8, 1 << 20); + +BENCHMARK_MAIN(); diff --git libcxx/test/benchmarks/algorithms/lower_bound.bench.cpp libcxx/test/benchmarks/algorithms/lower_bound.bench.cpp index d9d57969df67..31fb3597241f 100644 --- libcxx/test/benchmarks/algorithms/lower_bound.bench.cpp +++ libcxx/test/benchmarks/algorithms/lower_bound.bench.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// UNSUPPORTED: c++03, c++11, c++14 +// UNSUPPORTED: c++03, c++11, c++14, c++17 #include <algorithm> #include <numeric> diff --git libcxx/test/benchmarks/algorithms/make_heap.bench.cpp libcxx/test/benchmarks/algorithms/make_heap.bench.cpp index b7320e17c3e5..64d559620c51 100644 --- libcxx/test/benchmarks/algorithms/make_heap.bench.cpp +++ libcxx/test/benchmarks/algorithms/make_heap.bench.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// UNSUPPORTED: c++03, c++11, c++14 +// UNSUPPORTED: c++03, c++11, c++14, c++17 #include <algorithm> diff --git libcxx/test/benchmarks/algorithms/make_heap_then_sort_heap.bench.cpp libcxx/test/benchmarks/algorithms/make_heap_then_sort_heap.bench.cpp index 5991d2846aee..c6dc136be3ac 100644 --- libcxx/test/benchmarks/algorithms/make_heap_then_sort_heap.bench.cpp +++ libcxx/test/benchmarks/algorithms/make_heap_then_sort_heap.bench.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// UNSUPPORTED: c++03, c++11, c++14 +// UNSUPPORTED: c++03, c++11, c++14, c++17 #include <algorithm> diff --git libcxx/test/benchmarks/algorithms/pop_heap.bench.cpp libcxx/test/benchmarks/algorithms/pop_heap.bench.cpp index 5fef52284239..e4b96a0ae48c 100644 --- libcxx/test/benchmarks/algorithms/pop_heap.bench.cpp +++ libcxx/test/benchmarks/algorithms/pop_heap.bench.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// UNSUPPORTED: c++03, c++11, c++14 +// UNSUPPORTED: c++03, c++11, c++14, c++17 #include <algorithm> diff --git libcxx/test/benchmarks/algorithms/pstl.stable_sort.bench.cpp libcxx/test/benchmarks/algorithms/pstl.stable_sort.bench.cpp index 10254ac12cf5..a385185ec7fe 100644 --- libcxx/test/benchmarks/algorithms/pstl.stable_sort.bench.cpp +++ libcxx/test/benchmarks/algorithms/pstl.stable_sort.bench.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// UNSUPPORTED: c++03, c++11, c++14 +// UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: 
libcpp-has-no-incomplete-pstl #include <algorithm> diff --git libcxx/test/benchmarks/algorithms/push_heap.bench.cpp libcxx/test/benchmarks/algorithms/push_heap.bench.cpp index 89d8122bd1db..7dfa0285348b 100644 --- libcxx/test/benchmarks/algorithms/push_heap.bench.cpp +++ libcxx/test/benchmarks/algorithms/push_heap.bench.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// UNSUPPORTED: c++03, c++11, c++14 +// UNSUPPORTED: c++03, c++11, c++14, c++17 #include <algorithm> diff --git libcxx/test/benchmarks/algorithms/reverse.bench.cpp libcxx/test/benchmarks/algorithms/reverse.bench.cpp new file mode 100644 index 000000000000..2d8dd819ac24 --- /dev/null +++ libcxx/test/benchmarks/algorithms/reverse.bench.cpp @@ -0,0 +1,48 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +#include <algorithm> +#include <iterator> +#include <string> +#include <vector> + +#include <benchmark/benchmark.h> +#include "../GenerateInput.h" + +template <class T> +static void bm_reverse(benchmark::State& state) { + std::size_t const n = state.range(); + std::vector<T> vec; + std::generate_n(std::back_inserter(vec), n, [] { return Generate<T>::cheap(); }); + for (auto _ : state) { + std::reverse(vec.begin(), vec.end()); + benchmark::DoNotOptimize(vec); + } +} +BENCHMARK(bm_reverse<int>)->Name("std::reverse(vector<int>)")->DenseRange(1, 8)->Range(16, 1 << 20); +BENCHMARK(bm_reverse<std::string>)->Name("std::reverse(vector<string>)")->DenseRange(1, 8)->Range(16, 1 << 20); + +template <class T> +static void bm_ranges_reverse(benchmark::State& state) { + std::size_t const n = state.range(); + std::vector<T> vec; + std::generate_n(std::back_inserter(vec), n, [] { return Generate<T>::cheap(); }); + for (auto _ : state) { + std::ranges::reverse(vec.begin(), vec.end()); + benchmark::DoNotOptimize(vec); + } +} +BENCHMARK(bm_ranges_reverse<int>)->Name("ranges::reverse(vector<int>)")->DenseRange(1, 8)->Range(16, 1 << 20); +BENCHMARK(bm_ranges_reverse<std::string>) + ->Name("ranges::reverse(vector<string>)") + ->DenseRange(1, 8) + ->Range(16, 1 << 20); + +BENCHMARK_MAIN(); diff --git libcxx/test/benchmarks/algorithms/set_intersection.bench.cpp libcxx/test/benchmarks/algorithms/set_intersection.bench.cpp index 9bde4bb29dc2..40292179781e 100644 --- libcxx/test/benchmarks/algorithms/set_intersection.bench.cpp +++ libcxx/test/benchmarks/algorithms/set_intersection.bench.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// UNSUPPORTED: c++03, c++11, c++14 +// UNSUPPORTED: c++03, c++11, c++14, c++17 #include <algorithm> #include <cstdlib> diff --git libcxx/test/benchmarks/algorithms/sort.bench.cpp libcxx/test/benchmarks/algorithms/sort.bench.cpp index 899272e34795..7f3ce6ff7a07 100644 --- libcxx/test/benchmarks/algorithms/sort.bench.cpp +++ libcxx/test/benchmarks/algorithms/sort.bench.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// UNSUPPORTED: c++03, c++11, c++14 +// UNSUPPORTED: c++03, c++11, c++14, c++17 #include <algorithm> diff --git libcxx/test/benchmarks/algorithms/sort_heap.bench.cpp 
libcxx/test/benchmarks/algorithms/sort_heap.bench.cpp index ee4b6bfc7387..1ce9f1a6df9a 100644 --- libcxx/test/benchmarks/algorithms/sort_heap.bench.cpp +++ libcxx/test/benchmarks/algorithms/sort_heap.bench.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// UNSUPPORTED: c++03, c++11, c++14 +// UNSUPPORTED: c++03, c++11, c++14, c++17 #include <algorithm> diff --git libcxx/test/benchmarks/algorithms/stable_sort.bench.cpp libcxx/test/benchmarks/algorithms/stable_sort.bench.cpp index c68f73838c31..26e8de935f5c 100644 --- libcxx/test/benchmarks/algorithms/stable_sort.bench.cpp +++ libcxx/test/benchmarks/algorithms/stable_sort.bench.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// UNSUPPORTED: c++03, c++11, c++14 +// UNSUPPORTED: c++03, c++11, c++14, c++17 #include <algorithm> diff --git libcxx/test/benchmarks/containers/ContainerBenchmarks.h libcxx/test/benchmarks/containers/ContainerBenchmarks.h deleted file mode 100644 index 5fc898161967..000000000000 --- libcxx/test/benchmarks/containers/ContainerBenchmarks.h +++ /dev/null @@ -1,332 +0,0 @@ -// -*- C++ -*- -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef BENCHMARK_CONTAINER_BENCHMARKS_H -#define BENCHMARK_CONTAINER_BENCHMARKS_H - -#include <cassert> -#include <iterator> -#include <utility> - -#include "benchmark/benchmark.h" -#include "../Utilities.h" -#include "test_iterators.h" - -namespace ContainerBenchmarks { - -template <class Container> -void BM_ConstructSize(benchmark::State& st, Container) { - auto size = st.range(0); - for (auto _ : st) { - Container c(size); - DoNotOptimizeData(c); - } -} - -template <class Container> -void BM_CopyConstruct(benchmark::State& st, Container) { - auto size = st.range(0); - Container c(size); - for (auto _ : st) { - auto v = c; - DoNotOptimizeData(v); - } -} - -template <class Container> -void BM_Assignment(benchmark::State& st, Container) { - auto size = st.range(0); - Container c1; - Container c2(size); - for (auto _ : st) { - c1 = c2; - DoNotOptimizeData(c1); - DoNotOptimizeData(c2); - } -} - -template <std::size_t... 
sz, typename Container, typename GenInputs> -void BM_AssignInputIterIter(benchmark::State& st, Container c, GenInputs gen) { - auto v = gen(1, sz...); - c.resize(st.range(0), v[0]); - auto in = gen(st.range(1), sz...); - benchmark::DoNotOptimize(&in); - benchmark::DoNotOptimize(&c); - for (auto _ : st) { - c.assign(cpp17_input_iterator(in.begin()), cpp17_input_iterator(in.end())); - benchmark::ClobberMemory(); - } -} - -template <class Container> -void BM_ConstructSizeValue(benchmark::State& st, Container, typename Container::value_type const& val) { - const auto size = st.range(0); - for (auto _ : st) { - Container c(size, val); - DoNotOptimizeData(c); - } -} - -template <class Container, class GenInputs> -void BM_ConstructIterIter(benchmark::State& st, Container, GenInputs gen) { - auto in = gen(st.range(0)); - const auto begin = in.begin(); - const auto end = in.end(); - benchmark::DoNotOptimize(&in); - while (st.KeepRunning()) { - Container c(begin, end); - DoNotOptimizeData(c); - } -} - -template <class Container, class GenInputs> -void BM_ConstructFromRange(benchmark::State& st, Container, GenInputs gen) { - auto in = gen(st.range(0)); - benchmark::DoNotOptimize(&in); - while (st.KeepRunning()) { - Container c(std::from_range, in); - DoNotOptimizeData(c); - } -} - -template <class Container> -void BM_Pushback_no_grow(benchmark::State& state, Container c) { - int count = state.range(0); - c.reserve(count); - while (state.KeepRunningBatch(count)) { - c.clear(); - for (int i = 0; i != count; ++i) { - c.push_back(i); - } - benchmark::DoNotOptimize(c.data()); - } -} - -template <class Container, class GenInputs> -void BM_InsertValue(benchmark::State& st, Container c, GenInputs gen) { - auto in = gen(st.range(0)); - const auto end = in.end(); - while (st.KeepRunning()) { - c.clear(); - for (auto it = in.begin(); it != end; ++it) { - benchmark::DoNotOptimize(&(*c.insert(*it).first)); - } - benchmark::ClobberMemory(); - } -} - -template <class Container, class GenInputs> -void BM_InsertValueRehash(benchmark::State& st, Container c, GenInputs gen) { - auto in = gen(st.range(0)); - const auto end = in.end(); - while (st.KeepRunning()) { - c.clear(); - c.rehash(16); - for (auto it = in.begin(); it != end; ++it) { - benchmark::DoNotOptimize(&(*c.insert(*it).first)); - } - benchmark::ClobberMemory(); - } -} - -template <class Container, class GenInputs> -void BM_Insert_InputIterIter_NoRealloc(benchmark::State& st, Container c, GenInputs gen) { - auto in = gen(st.range(0)); - DoNotOptimizeData(in); - const auto size = c.size(); - const auto beg = cpp17_input_iterator(in.begin()); - const auto end = cpp17_input_iterator(in.end()); - c.reserve(size + in.size()); // force no reallocation - for (auto _ : st) { - benchmark::DoNotOptimize(&(*c.insert(c.begin(), beg, end))); - st.PauseTiming(); - c.erase(c.begin() + size, c.end()); // avoid the container to grow indefinitely - st.ResumeTiming(); - DoNotOptimizeData(c); - benchmark::ClobberMemory(); - } -} - -template <class Container, class GenInputs> -void BM_Insert_InputIterIter_Realloc_HalfFilled(benchmark::State& st, Container, GenInputs gen) { - const auto size = st.range(0); - Container a = gen(size); - Container in = gen(size + 10); - DoNotOptimizeData(a); - DoNotOptimizeData(in); - const auto beg = cpp17_input_iterator(in.begin()); - const auto end = cpp17_input_iterator(in.end()); - for (auto _ : st) { - st.PauseTiming(); - Container c; - c.reserve(size * 2); // Reallocation with half-filled container - c = a; - st.ResumeTiming(); - 
benchmark::DoNotOptimize(&(*c.insert(c.begin(), beg, end))); - DoNotOptimizeData(c); - benchmark::ClobberMemory(); - } -} - -template <class Container, class GenInputs> -void BM_Insert_InputIterIter_Realloc_NearFull(benchmark::State& st, Container, GenInputs gen) { - const auto size = st.range(0); - Container a = gen(size); - Container in = gen(10); - DoNotOptimizeData(a); - DoNotOptimizeData(in); - const auto beg = cpp17_input_iterator(in.begin()); - const auto end = cpp17_input_iterator(in.end()); - for (auto _ : st) { - st.PauseTiming(); - Container c; - c.reserve(size + 5); // Reallocation almost-full container - c = a; - st.ResumeTiming(); - benchmark::DoNotOptimize(&(*c.insert(c.begin(), beg, end))); - DoNotOptimizeData(c); - benchmark::ClobberMemory(); - } -} - -template <class Container, class GenInputs> -void BM_InsertDuplicate(benchmark::State& st, Container c, GenInputs gen) { - auto in = gen(st.range(0)); - const auto end = in.end(); - c.insert(in.begin(), in.end()); - benchmark::DoNotOptimize(&c); - benchmark::DoNotOptimize(&in); - while (st.KeepRunning()) { - for (auto it = in.begin(); it != end; ++it) { - benchmark::DoNotOptimize(&(*c.insert(*it).first)); - } - benchmark::ClobberMemory(); - } -} - -template <class Container, class GenInputs> -void BM_EmplaceDuplicate(benchmark::State& st, Container c, GenInputs gen) { - auto in = gen(st.range(0)); - const auto end = in.end(); - c.insert(in.begin(), in.end()); - benchmark::DoNotOptimize(&c); - benchmark::DoNotOptimize(&in); - while (st.KeepRunning()) { - for (auto it = in.begin(); it != end; ++it) { - benchmark::DoNotOptimize(&(*c.emplace(*it).first)); - } - benchmark::ClobberMemory(); - } -} - -template <class Container, class GenInputs> -void BM_erase_iter_in_middle(benchmark::State& st, Container, GenInputs gen) { - auto in = gen(st.range(0)); - Container c(in.begin(), in.end()); - assert(c.size() > 2); - for (auto _ : st) { - auto mid = std::next(c.begin(), c.size() / 2); - auto tmp = *mid; - auto result = c.erase(mid); // erase an element in the middle - benchmark::DoNotOptimize(result); - c.push_back(std::move(tmp)); // and then push it back at the end to avoid needing a new container - } -} - -template <class Container, class GenInputs> -void BM_erase_iter_at_start(benchmark::State& st, Container, GenInputs gen) { - auto in = gen(st.range(0)); - Container c(in.begin(), in.end()); - assert(c.size() > 2); - for (auto _ : st) { - auto it = c.begin(); - auto tmp = *it; - auto result = c.erase(it); // erase the first element - benchmark::DoNotOptimize(result); - c.push_back(std::move(tmp)); // and then push it back at the end to avoid needing a new container - } -} - -template <class Container, class GenInputs> -void BM_Find(benchmark::State& st, Container c, GenInputs gen) { - auto in = gen(st.range(0)); - c.insert(in.begin(), in.end()); - benchmark::DoNotOptimize(&(*c.begin())); - const auto end = in.data() + in.size(); - while (st.KeepRunning()) { - for (auto it = in.data(); it != end; ++it) { - benchmark::DoNotOptimize(&(*c.find(*it))); - } - benchmark::ClobberMemory(); - } -} - -template <class Container, class GenInputs> -void BM_FindRehash(benchmark::State& st, Container c, GenInputs gen) { - c.rehash(8); - auto in = gen(st.range(0)); - c.insert(in.begin(), in.end()); - benchmark::DoNotOptimize(&(*c.begin())); - const auto end = in.data() + in.size(); - while (st.KeepRunning()) { - for (auto it = in.data(); it != end; ++it) { - benchmark::DoNotOptimize(&(*c.find(*it))); - } - benchmark::ClobberMemory(); - } -} - 
-template <class Container, class GenInputs> -void BM_Rehash(benchmark::State& st, Container c, GenInputs gen) { - auto in = gen(st.range(0)); - c.max_load_factor(3.0); - c.insert(in.begin(), in.end()); - benchmark::DoNotOptimize(c); - const auto bucket_count = c.bucket_count(); - while (st.KeepRunning()) { - c.rehash(bucket_count + 1); - c.rehash(bucket_count); - benchmark::ClobberMemory(); - } -} - -template <class Container, class GenInputs> -void BM_Compare_same_container(benchmark::State& st, Container, GenInputs gen) { - auto in = gen(st.range(0)); - Container c1(in.begin(), in.end()); - Container c2 = c1; - - benchmark::DoNotOptimize(&(*c1.begin())); - benchmark::DoNotOptimize(&(*c2.begin())); - while (st.KeepRunning()) { - bool res = c1 == c2; - benchmark::DoNotOptimize(&res); - benchmark::ClobberMemory(); - } -} - -template <class Container, class GenInputs> -void BM_Compare_different_containers(benchmark::State& st, Container, GenInputs gen) { - auto in1 = gen(st.range(0)); - auto in2 = gen(st.range(0)); - Container c1(in1.begin(), in1.end()); - Container c2(in2.begin(), in2.end()); - - benchmark::DoNotOptimize(&(*c1.begin())); - benchmark::DoNotOptimize(&(*c2.begin())); - while (st.KeepRunning()) { - bool res = c1 == c2; - benchmark::DoNotOptimize(&res); - benchmark::ClobberMemory(); - } -} - -} // namespace ContainerBenchmarks - -#endif // BENCHMARK_CONTAINER_BENCHMARKS_H diff --git libcxx/test/benchmarks/containers/container_benchmarks.h libcxx/test/benchmarks/containers/container_benchmarks.h new file mode 100644 index 000000000000..e24bd767177e --- /dev/null +++ libcxx/test/benchmarks/containers/container_benchmarks.h @@ -0,0 +1,609 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef TEST_BENCHMARKS_CONTAINERS_CONTAINER_BENCHMARKS_H +#define TEST_BENCHMARKS_CONTAINERS_CONTAINER_BENCHMARKS_H + +#include <algorithm> +#include <cstddef> +#include <iterator> +#include <ranges> // for std::from_range +#include <string> +#include <type_traits> +#include <vector> + +#include "benchmark/benchmark.h" +#include "test_iterators.h" +#include "test_macros.h" +#include "../GenerateInput.h" + +namespace ContainerBenchmarks { + +template <class Container> +void DoNotOptimizeData(Container& c) { + if constexpr (requires { c.data(); }) { + benchmark::DoNotOptimize(c.data()); + } else { + benchmark::DoNotOptimize(&c); + } +} + +// +// Sequence container operations +// +template <class Container> +void BM_ctor_size(benchmark::State& st) { + auto size = st.range(0); + + for (auto _ : st) { + Container c(size); // we assume the destructor doesn't dominate the benchmark + DoNotOptimizeData(c); + } +} + +template <class Container, class Generator> +void BM_ctor_size_value(benchmark::State& st, Generator gen) { + using ValueType = typename Container::value_type; + const auto size = st.range(0); + ValueType value = gen(); + benchmark::DoNotOptimize(value); + + for (auto _ : st) { + Container c(size, value); // we assume the destructor doesn't dominate the benchmark + DoNotOptimizeData(c); + } +} + +template <class Container, class Generator> +void BM_ctor_iter_iter(benchmark::State& st, Generator gen) { + using ValueType = typename Container::value_type; + const auto size = st.range(0); + std::vector<ValueType> in; + std::generate_n(std::back_inserter(in), size, gen); + const auto begin = in.begin(); + const auto end = in.end(); + benchmark::DoNotOptimize(in); + + for (auto _ : st) { + Container c(begin, end); // we assume the destructor doesn't dominate the benchmark + DoNotOptimizeData(c); + } +} + +#if TEST_STD_VER >= 23 +template <class Container, class Generator> +void BM_ctor_from_range(benchmark::State& st, Generator gen) { + using ValueType = typename Container::value_type; + const auto size = st.range(0); + std::vector<ValueType> in; + std::generate_n(std::back_inserter(in), size, gen); + benchmark::DoNotOptimize(in); + + for (auto _ : st) { + Container c(std::from_range, in); // we assume the destructor doesn't dominate the benchmark + DoNotOptimizeData(c); + } +} +#endif + +template <class Container, class Generator> +void BM_ctor_copy(benchmark::State& st, Generator gen) { + auto size = st.range(0); + Container in; + std::generate_n(std::back_inserter(in), size, gen); + DoNotOptimizeData(in); + + for (auto _ : st) { + Container c(in); // we assume the destructor doesn't dominate the benchmark + DoNotOptimizeData(c); + DoNotOptimizeData(in); + } +} + +template <class Container, class Generator> +void BM_assignment(benchmark::State& st, Generator gen) { + auto size = st.range(0); + Container in1, in2; + std::generate_n(std::back_inserter(in1), size, gen); + std::generate_n(std::back_inserter(in2), size, gen); + DoNotOptimizeData(in1); + DoNotOptimizeData(in2); + + // Assign from one of two containers in succession to avoid + // hitting a self-assignment corner-case + Container c(in1); + bool toggle = false; + for (auto _ : st) { + c = toggle ? 
in1 : in2; + toggle = !toggle; + DoNotOptimizeData(c); + DoNotOptimizeData(in1); + DoNotOptimizeData(in2); + } +} + +// Benchmark Container::assign(input-iter, input-iter) when the container already contains +// the same number of elements as we're assigning. The intent is to check whether the +// implementation creates a new container from scratch or manages to reuse the +// pre-existing storage. +template <typename Container, class Generator> +void BM_assign_input_iter_full(benchmark::State& st, Generator gen) { + using ValueType = typename Container::value_type; + auto size = st.range(0); + std::vector<ValueType> in1, in2; + std::generate_n(std::back_inserter(in1), size, gen); + std::generate_n(std::back_inserter(in2), size, gen); + DoNotOptimizeData(in1); + DoNotOptimizeData(in2); + + Container c(in1.begin(), in1.end()); + bool toggle = false; + for (auto _ : st) { + std::vector<ValueType>& in = toggle ? in1 : in2; + auto first = in.data(); + auto last = in.data() + in.size(); + c.assign(cpp17_input_iterator(first), cpp17_input_iterator(last)); + toggle = !toggle; + DoNotOptimizeData(c); + } +} + +template <class Container, class Generator> +void BM_insert_begin(benchmark::State& st, Generator gen) { + using ValueType = typename Container::value_type; + const int size = st.range(0); + std::vector<ValueType> in; + std::generate_n(std::back_inserter(in), size, gen); + DoNotOptimizeData(in); + + Container c(in.begin(), in.end()); + DoNotOptimizeData(c); + + ValueType value = gen(); + benchmark::DoNotOptimize(value); + + for (auto _ : st) { + c.insert(c.begin(), value); + DoNotOptimizeData(c); + + c.erase(std::prev(c.end())); // avoid growing indefinitely + } +} + +template <class Container, class Generator> + requires std::random_access_iterator<typename Container::iterator> +void BM_insert_middle(benchmark::State& st, Generator gen) { + using ValueType = typename Container::value_type; + const int size = st.range(0); + std::vector<ValueType> in; + std::generate_n(std::back_inserter(in), size, gen); + DoNotOptimizeData(in); + + Container c(in.begin(), in.end()); + DoNotOptimizeData(c); + + ValueType value = gen(); + benchmark::DoNotOptimize(value); + + for (auto _ : st) { + auto mid = c.begin() + (size / 2); // requires random-access iterators in order to make sense + c.insert(mid, value); + DoNotOptimizeData(c); + + c.erase(c.end() - 1); // avoid growing indefinitely + } +} + +// Insert at the start of a vector in a scenario where the vector already +// has enough capacity to hold all the elements we are inserting.
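+// We pre-fill the container with a few elements and erase back down to that size +// after each insertion, so the capacity reserved up front is never exceeded.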
+template <class Container, class Generator> +void BM_insert_begin_input_iter_with_reserve_no_realloc(benchmark::State& st, Generator gen) { + using ValueType = typename Container::value_type; + const int size = st.range(0); + std::vector<ValueType> in; + std::generate_n(std::back_inserter(in), size, gen); + DoNotOptimizeData(in); + auto first = in.data(); + auto last = in.data() + in.size(); + + const int small = 100; // arbitrary + Container c; + c.reserve(size + small); // ensure no reallocation + std::generate_n(std::back_inserter(c), small, gen); + + for (auto _ : st) { + c.insert(c.begin(), cpp17_input_iterator(first), cpp17_input_iterator(last)); + DoNotOptimizeData(c); + + st.PauseTiming(); + c.erase(c.begin() + small, c.end()); // avoid growing indefinitely + st.ResumeTiming(); + } +} + +// Insert at the start of a vector in a scenario where the vector already +// has almost enough capacity to hold all the elements we are inserting, +// but does need to reallocate. +template <class Container, class Generator> +void BM_insert_begin_input_iter_with_reserve_almost_no_realloc(benchmark::State& st, Generator gen) { + using ValueType = typename Container::value_type; + const int size = st.range(0); + std::vector<ValueType> in; + std::generate_n(std::back_inserter(in), size, gen); + DoNotOptimizeData(in); + auto first = in.data(); + auto last = in.data() + in.size(); + + const int overflow = size / 10; // 10% of elements won't fit in the vector when we insert + Container c; + for (auto _ : st) { + st.PauseTiming(); + c = Container(); + c.reserve(size); + std::generate_n(std::back_inserter(c), overflow, gen); + st.ResumeTiming(); + + c.insert(c.begin(), cpp17_input_iterator(first), cpp17_input_iterator(last)); + DoNotOptimizeData(c); + } +} + +// Insert at the start of a vector in a scenario where the vector can fit a few +// more elements, but needs to reallocate almost immediately to fit the remaining +// elements. 
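+// Concretely, the container starts out 90% full, leaving room for only 10% of the +// inserted elements before the vector must grow.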
+template <class Container, class Generator> +void BM_insert_begin_input_iter_with_reserve_near_full(benchmark::State& st, Generator gen) { + using ValueType = typename Container::value_type; + const int size = st.range(0); + std::vector<ValueType> in; + std::generate_n(std::back_inserter(in), size, gen); + DoNotOptimizeData(in); + auto first = in.data(); + auto last = in.data() + in.size(); + + const int overflow = 9 * (size / 10); // 90% of elements won't fit in the vector when we insert + Container c; + for (auto _ : st) { + st.PauseTiming(); + c = Container(); + c.reserve(size); + std::generate_n(std::back_inserter(c), overflow, gen); + st.ResumeTiming(); + + c.insert(c.begin(), cpp17_input_iterator(first), cpp17_input_iterator(last)); + DoNotOptimizeData(c); + } +} + +template <class Container, class Generator> +void BM_erase_begin(benchmark::State& st, Generator gen) { + using ValueType = typename Container::value_type; + const int size = st.range(0); + std::vector<ValueType> in; + std::generate_n(std::back_inserter(in), size, gen); + DoNotOptimizeData(in); + + Container c(in.begin(), in.end()); + DoNotOptimizeData(c); + + ValueType value = gen(); + benchmark::DoNotOptimize(value); + + for (auto _ : st) { + c.erase(c.begin()); + DoNotOptimizeData(c); + + c.insert(c.end(), value); // re-insert an element at the end to avoid needing a new container + } +} + +template <class Container, class Generator> + requires std::random_access_iterator<typename Container::iterator> +void BM_erase_middle(benchmark::State& st, Generator gen) { + using ValueType = typename Container::value_type; + const int size = st.range(0); + std::vector<ValueType> in; + std::generate_n(std::back_inserter(in), size, gen); + DoNotOptimizeData(in); + + Container c(in.begin(), in.end()); + DoNotOptimizeData(c); + + ValueType value = gen(); + benchmark::DoNotOptimize(value); + + for (auto _ : st) { + auto mid = c.begin() + (size / 2); + c.erase(mid); + DoNotOptimizeData(c); + + c.insert(c.end(), value); // re-insert an element at the end to avoid needing a new container + } +} + +template <class Container, class Generator> +void BM_push_back(benchmark::State& st, Generator gen) { + using ValueType = typename Container::value_type; + const int size = st.range(0); + std::vector<ValueType> in; + std::generate_n(std::back_inserter(in), size, gen); + DoNotOptimizeData(in); + + Container c; + DoNotOptimizeData(c); + while (st.KeepRunningBatch(size)) { + c.clear(); + for (int i = 0; i != size; ++i) { + c.push_back(in[i]); + } + DoNotOptimizeData(c); + } +} + +template <class Container, class Generator> +void BM_push_back_with_reserve(benchmark::State& st, Generator gen) { + using ValueType = typename Container::value_type; + const int size = st.range(0); + std::vector<ValueType> in; + std::generate_n(std::back_inserter(in), size, gen); + DoNotOptimizeData(in); + + Container c; + c.reserve(size); + DoNotOptimizeData(c); + while (st.KeepRunningBatch(size)) { + c.clear(); + for (int i = 0; i != size; ++i) { + c.push_back(in[i]); + } + DoNotOptimizeData(c); + } +} + +template <class Container> +void sequence_container_benchmarks(std::string container) { + using ValueType = typename Container::value_type; + + using Generator = ValueType (*)(); + Generator cheap = [] { return Generate<ValueType>::cheap(); }; + Generator expensive = [] { return Generate<ValueType>::expensive(); }; + auto tostr = [&](Generator gen) { return gen == cheap ? 
" (cheap elements)" : " (expensive elements)"; }; + std::vector<Generator> generators; + generators.push_back(cheap); + if constexpr (!std::is_integral_v<ValueType>) { + generators.push_back(expensive); + } + + // constructors + if constexpr (std::is_constructible_v<Container, std::size_t>) { + // not all containers provide this one + benchmark::RegisterBenchmark(container + "::ctor(size)", BM_ctor_size<Container>)->Arg(1024); + } + for (auto gen : generators) + benchmark::RegisterBenchmark(container + "::ctor(size, value_type)" + tostr(gen), [=](auto& st) { + BM_ctor_size_value<Container>(st, gen); + })->Arg(1024); + for (auto gen : generators) + benchmark::RegisterBenchmark(container + "::ctor(Iterator, Iterator)" + tostr(gen), [=](auto& st) { + BM_ctor_iter_iter<Container>(st, gen); + })->Arg(1024); +#if TEST_STD_VER >= 23 + for (auto gen : generators) + benchmark::RegisterBenchmark(container + "::ctor(Range)" + tostr(gen), [=](auto& st) { + BM_ctor_from_range<Container>(st, gen); + })->Arg(1024); +#endif + for (auto gen : generators) + benchmark::RegisterBenchmark(container + "::ctor(const&)" + tostr(gen), [=](auto& st) { + BM_ctor_copy<Container>(st, gen); + })->Arg(1024); + + // assignment + for (auto gen : generators) + benchmark::RegisterBenchmark(container + "::operator=(const&)" + tostr(gen), [=](auto& st) { + BM_assignment<Container>(st, gen); + })->Arg(1024); + for (auto gen : generators) + benchmark::RegisterBenchmark(container + "::assign(input-iter, input-iter) (full container)" + tostr(gen), + [=](auto& st) { BM_assign_input_iter_full<Container>(st, gen); }) + ->Arg(1024); + + // insert + for (auto gen : generators) + benchmark::RegisterBenchmark(container + "::insert(begin)" + tostr(gen), [=](auto& st) { + BM_insert_begin<Container>(st, gen); + })->Arg(1024); + if constexpr (std::random_access_iterator<typename Container::iterator>) { + for (auto gen : generators) + benchmark::RegisterBenchmark(container + "::insert(middle)" + tostr(gen), [=](auto& st) { + BM_insert_middle<Container>(st, gen); + })->Arg(1024); + } + if constexpr (requires(Container c) { c.reserve(0); }) { + for (auto gen : generators) + benchmark::RegisterBenchmark( + container + "::insert(begin, input-iter, input-iter) (no realloc)" + tostr(gen), + [=](auto& st) { BM_insert_begin_input_iter_with_reserve_no_realloc<Container>(st, gen); }) + ->Arg(1024); + for (auto gen : generators) + benchmark::RegisterBenchmark( + container + "::insert(begin, input-iter, input-iter) (half filled)" + tostr(gen), + [=](auto& st) { BM_insert_begin_input_iter_with_reserve_almost_no_realloc<Container>(st, gen); }) + ->Arg(1024); + for (auto gen : generators) + benchmark::RegisterBenchmark( + container + "::insert(begin, input-iter, input-iter) (near full)" + tostr(gen), + [=](auto& st) { BM_insert_begin_input_iter_with_reserve_near_full<Container>(st, gen); }) + ->Arg(1024); + } + + // erase + for (auto gen : generators) + benchmark::RegisterBenchmark(container + "::erase(begin)" + tostr(gen), [=](auto& st) { + BM_erase_begin<Container>(st, gen); + })->Arg(1024); + if constexpr (std::random_access_iterator<typename Container::iterator>) { + for (auto gen : generators) + benchmark::RegisterBenchmark(container + "::erase(middle)" + tostr(gen), [=](auto& st) { + BM_erase_middle<Container>(st, gen); + })->Arg(1024); + } + + // push_back (optional) + if constexpr (requires(Container c, ValueType v) { c.push_back(v); }) { + for (auto gen : generators) + benchmark::RegisterBenchmark(container + "::push_back()" + tostr(gen), 
[=](auto& st) { + BM_push_back<Container>(st, gen); + })->Arg(1024); + if constexpr (requires(Container c) { c.reserve(0); }) { + for (auto gen : generators) + benchmark::RegisterBenchmark(container + "::push_back() (with reserve)" + tostr(gen), [=](auto& st) { + BM_push_back_with_reserve<Container>(st, gen); + })->Arg(1024); + } + } +} + +// +// Misc operations +// +template <class Container, class GenInputs> +void BM_InsertValue(benchmark::State& st, Container c, GenInputs gen) { + auto in = gen(st.range(0)); + const auto end = in.end(); + while (st.KeepRunning()) { + c.clear(); + for (auto it = in.begin(); it != end; ++it) { + benchmark::DoNotOptimize(&(*c.insert(*it).first)); + } + benchmark::ClobberMemory(); + } +} + +template <class Container, class GenInputs> +void BM_InsertValueRehash(benchmark::State& st, Container c, GenInputs gen) { + auto in = gen(st.range(0)); + const auto end = in.end(); + while (st.KeepRunning()) { + c.clear(); + c.rehash(16); + for (auto it = in.begin(); it != end; ++it) { + benchmark::DoNotOptimize(&(*c.insert(*it).first)); + } + benchmark::ClobberMemory(); + } +} + +template <class Container, class GenInputs> +void BM_InsertDuplicate(benchmark::State& st, Container c, GenInputs gen) { + auto in = gen(st.range(0)); + const auto end = in.end(); + c.insert(in.begin(), in.end()); + benchmark::DoNotOptimize(c); + benchmark::DoNotOptimize(in); + while (st.KeepRunning()) { + for (auto it = in.begin(); it != end; ++it) { + benchmark::DoNotOptimize(&(*c.insert(*it).first)); + } + benchmark::ClobberMemory(); + } +} + +template <class Container, class GenInputs> +void BM_EmplaceDuplicate(benchmark::State& st, Container c, GenInputs gen) { + auto in = gen(st.range(0)); + const auto end = in.end(); + c.insert(in.begin(), in.end()); + benchmark::DoNotOptimize(c); + benchmark::DoNotOptimize(in); + while (st.KeepRunning()) { + for (auto it = in.begin(); it != end; ++it) { + benchmark::DoNotOptimize(&(*c.emplace(*it).first)); + } + benchmark::ClobberMemory(); + } +} + +template <class Container, class GenInputs> +void BM_Find(benchmark::State& st, Container c, GenInputs gen) { + auto in = gen(st.range(0)); + c.insert(in.begin(), in.end()); + benchmark::DoNotOptimize(&(*c.begin())); + const auto end = in.data() + in.size(); + while (st.KeepRunning()) { + for (auto it = in.data(); it != end; ++it) { + benchmark::DoNotOptimize(&(*c.find(*it))); + } + benchmark::ClobberMemory(); + } +} + +template <class Container, class GenInputs> +void BM_FindRehash(benchmark::State& st, Container c, GenInputs gen) { + c.rehash(8); + auto in = gen(st.range(0)); + c.insert(in.begin(), in.end()); + benchmark::DoNotOptimize(&(*c.begin())); + const auto end = in.data() + in.size(); + while (st.KeepRunning()) { + for (auto it = in.data(); it != end; ++it) { + benchmark::DoNotOptimize(&(*c.find(*it))); + } + benchmark::ClobberMemory(); + } +} + +template <class Container, class GenInputs> +void BM_Rehash(benchmark::State& st, Container c, GenInputs gen) { + auto in = gen(st.range(0)); + c.max_load_factor(3.0); + c.insert(in.begin(), in.end()); + benchmark::DoNotOptimize(c); + const auto bucket_count = c.bucket_count(); + while (st.KeepRunning()) { + c.rehash(bucket_count + 1); + c.rehash(bucket_count); + benchmark::ClobberMemory(); + } +} + +template <class Container, class GenInputs> +void BM_Compare_same_container(benchmark::State& st, Container, GenInputs gen) { + auto in = gen(st.range(0)); + Container c1(in.begin(), in.end()); + Container c2 = c1; + + 
benchmark::DoNotOptimize(&(*c1.begin())); + benchmark::DoNotOptimize(&(*c2.begin())); + while (st.KeepRunning()) { + bool res = c1 == c2; + benchmark::DoNotOptimize(&res); + benchmark::ClobberMemory(); + } +} + +template <class Container, class GenInputs> +void BM_Compare_different_containers(benchmark::State& st, Container, GenInputs gen) { + auto in1 = gen(st.range(0)); + auto in2 = gen(st.range(0)); + Container c1(in1.begin(), in1.end()); + Container c2(in2.begin(), in2.end()); + + benchmark::DoNotOptimize(&(*c1.begin())); + benchmark::DoNotOptimize(&(*c2.begin())); + while (st.KeepRunning()) { + bool res = c1 == c2; + benchmark::DoNotOptimize(&res); + benchmark::ClobberMemory(); + } +} + +} // namespace ContainerBenchmarks + +#endif // TEST_BENCHMARKS_CONTAINERS_CONTAINER_BENCHMARKS_H diff --git libcxx/test/benchmarks/containers/deque.bench.cpp libcxx/test/benchmarks/containers/deque.bench.cpp index 7ff1093a9391..6a650fa4dce2 100644 --- libcxx/test/benchmarks/containers/deque.bench.cpp +++ libcxx/test/benchmarks/containers/deque.bench.cpp @@ -6,50 +6,20 @@ // //===----------------------------------------------------------------------===// -// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 +// UNSUPPORTED: c++03, c++11, c++14, c++17 #include <deque> #include <string> +#include "container_benchmarks.h" #include "benchmark/benchmark.h" -#include "ContainerBenchmarks.h" -#include "../GenerateInput.h" +int main(int argc, char** argv) { + ContainerBenchmarks::sequence_container_benchmarks<std::deque<int>>("std::deque<int>"); + ContainerBenchmarks::sequence_container_benchmarks<std::deque<std::string>>("std::deque<std::string>"); -using namespace ContainerBenchmarks; - -constexpr std::size_t TestNumInputs = 1024; - -BENCHMARK_CAPTURE(BM_ConstructSize, deque_byte, std::deque<unsigned char>{})->Arg(5140480); - -BENCHMARK_CAPTURE(BM_ConstructSizeValue, deque_byte, std::deque<unsigned char>{}, 0)->Arg(5140480); - -BENCHMARK_CAPTURE(BM_ConstructIterIter, deque_char, std::deque<char>{}, getRandomIntegerInputs<char>) - ->Arg(TestNumInputs); - -BENCHMARK_CAPTURE(BM_ConstructIterIter, deque_size_t, std::deque<size_t>{}, getRandomIntegerInputs<size_t>) - ->Arg(TestNumInputs); - -BENCHMARK_CAPTURE(BM_ConstructIterIter, deque_string, std::deque<std::string>{}, getRandomStringInputs) - ->Arg(TestNumInputs); - -BENCHMARK_CAPTURE(BM_ConstructFromRange, deque_char, std::deque<char>{}, getRandomIntegerInputs<char>) - ->Arg(TestNumInputs); - -BENCHMARK_CAPTURE(BM_ConstructFromRange, deque_size_t, std::deque<size_t>{}, getRandomIntegerInputs<size_t>) - ->Arg(TestNumInputs); - -BENCHMARK_CAPTURE(BM_ConstructFromRange, deque_string, std::deque<std::string>{}, getRandomStringInputs) - ->Arg(TestNumInputs); - -BENCHMARK_CAPTURE(BM_erase_iter_in_middle, deque_int, std::deque<int>{}, getRandomIntegerInputs<int>) - ->Range(TestNumInputs, TestNumInputs * 10); -BENCHMARK_CAPTURE(BM_erase_iter_in_middle, deque_string, std::deque<std::string>{}, getRandomStringInputs) - ->Range(TestNumInputs, TestNumInputs * 10); - -BENCHMARK_CAPTURE(BM_erase_iter_at_start, deque_int, std::deque<int>{}, getRandomIntegerInputs<int>) - ->Range(TestNumInputs, TestNumInputs * 10); -BENCHMARK_CAPTURE(BM_erase_iter_at_start, deque_string, std::deque<std::string>{}, getRandomStringInputs) - ->Range(TestNumInputs, TestNumInputs * 10); - -BENCHMARK_MAIN(); + benchmark::Initialize(&argc, argv); + benchmark::RunSpecifiedBenchmarks(); + benchmark::Shutdown(); + return 0; +} diff --git libcxx/test/benchmarks/containers/list.bench.cpp 
libcxx/test/benchmarks/containers/list.bench.cpp new file mode 100644 index 000000000000..2212affa02ba --- /dev/null +++ libcxx/test/benchmarks/containers/list.bench.cpp @@ -0,0 +1,25 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 + +#include <list> +#include <string> + +#include "container_benchmarks.h" +#include "benchmark/benchmark.h" + +int main(int argc, char** argv) { + ContainerBenchmarks::sequence_container_benchmarks<std::list<int>>("std::list<int>"); + ContainerBenchmarks::sequence_container_benchmarks<std::list<std::string>>("std::list<std::string>"); + + benchmark::Initialize(&argc, argv); + benchmark::RunSpecifiedBenchmarks(); + benchmark::Shutdown(); + return 0; +} diff --git libcxx/test/benchmarks/containers/string.bench.cpp libcxx/test/benchmarks/containers/string.bench.cpp index 0b62c87acf7a..aeff6ad6f633 100644 --- libcxx/test/benchmarks/containers/string.bench.cpp +++ libcxx/test/benchmarks/containers/string.bench.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// UNSUPPORTED: c++03, c++11, c++14 +// UNSUPPORTED: c++03, c++11, c++14, c++17 #include <cstdint> #include <cstdlib> diff --git libcxx/test/benchmarks/containers/unordered_set_operations.bench.cpp libcxx/test/benchmarks/containers/unordered_set.bench.cpp similarity index 99% rename from libcxx/test/benchmarks/containers/unordered_set_operations.bench.cpp rename to libcxx/test/benchmarks/containers/unordered_set.bench.cpp index a8448ef5a0cf..ad8d0feaa043 100644 --- libcxx/test/benchmarks/containers/unordered_set_operations.bench.cpp +++ libcxx/test/benchmarks/containers/unordered_set.bench.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 +// UNSUPPORTED: c++03, c++11, c++14, c++17 #include <cstdint> #include <cstdlib> @@ -17,7 +17,7 @@ #include "benchmark/benchmark.h" -#include "ContainerBenchmarks.h" +#include "container_benchmarks.h" #include "../GenerateInput.h" #include "test_macros.h" diff --git libcxx/test/benchmarks/containers/vector.bench.cpp libcxx/test/benchmarks/containers/vector.bench.cpp new file mode 100644 index 000000000000..eef23d298164 --- /dev/null +++ libcxx/test/benchmarks/containers/vector.bench.cpp @@ -0,0 +1,25 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 + +#include <string> +#include <vector> + +#include "container_benchmarks.h" +#include "benchmark/benchmark.h" + +int main(int argc, char** argv) { + ContainerBenchmarks::sequence_container_benchmarks<std::vector<int>>("std::vector<int>"); + ContainerBenchmarks::sequence_container_benchmarks<std::vector<std::string>>("std::vector<std::string>"); + + benchmark::Initialize(&argc, argv); + benchmark::RunSpecifiedBenchmarks(); + benchmark::Shutdown(); + return 0; +} diff --git libcxx/test/benchmarks/containers/vector_operations.bench.cpp libcxx/test/benchmarks/containers/vector_operations.bench.cpp deleted file mode 100644 index 1cd754ca7e78..000000000000 --- libcxx/test/benchmarks/containers/vector_operations.bench.cpp +++ /dev/null @@ -1,108 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 - -#include <cstdint> -#include <cstdlib> -#include <cstring> -#include <deque> -#include <functional> -#include <memory> -#include <string> -#include <vector> - -#include "benchmark/benchmark.h" -#include "ContainerBenchmarks.h" -#include "../GenerateInput.h" - -using namespace ContainerBenchmarks; - -constexpr std::size_t TestNumInputs = 1024; - -BENCHMARK_CAPTURE(BM_ConstructSize, vector_byte, std::vector<unsigned char>{})->Arg(5140480); - -BENCHMARK_CAPTURE(BM_CopyConstruct, vector_int, std::vector<int>{})->Arg(5140480); - -BENCHMARK_CAPTURE(BM_Assignment, vector_int, std::vector<int>{})->Arg(5140480); - -BENCHMARK_CAPTURE(BM_ConstructSizeValue, vector_byte, std::vector<unsigned char>{}, 0)->Arg(5140480); - -BENCHMARK_CAPTURE(BM_ConstructIterIter, vector_char, std::vector<char>{}, getRandomIntegerInputs<char>) - ->Arg(TestNumInputs); - -BENCHMARK_CAPTURE(BM_ConstructIterIter, vector_size_t, std::vector<size_t>{}, getRandomIntegerInputs<size_t>) - ->Arg(TestNumInputs); - -BENCHMARK_CAPTURE(BM_ConstructIterIter, vector_string, std::vector<std::string>{}, getRandomStringInputs) - ->Arg(TestNumInputs); - -BENCHMARK_CAPTURE(BM_ConstructFromRange, vector_char, std::vector<char>{}, getRandomIntegerInputs<char>) - ->Arg(TestNumInputs); - -BENCHMARK_CAPTURE(BM_ConstructFromRange, vector_size_t, std::vector<size_t>{}, getRandomIntegerInputs<size_t>) - ->Arg(TestNumInputs); - -BENCHMARK_CAPTURE(BM_ConstructFromRange, vector_string, std::vector<std::string>{}, getRandomStringInputs) - ->Arg(TestNumInputs); - -BENCHMARK_CAPTURE(BM_Pushback_no_grow, vector_int, std::vector<int>{})->Arg(TestNumInputs); - -BENCHMARK_CAPTURE(BM_erase_iter_in_middle, vector_int, std::vector<int>{}, getRandomIntegerInputs<int>) - ->Range(TestNumInputs, TestNumInputs * 10); -BENCHMARK_CAPTURE(BM_erase_iter_in_middle, vector_string, std::vector<std::string>{}, getRandomStringInputs) - ->Range(TestNumInputs, TestNumInputs * 10); - -BENCHMARK_CAPTURE(BM_erase_iter_at_start, vector_int, std::vector<int>{}, getRandomIntegerInputs<int>) - ->Range(TestNumInputs, TestNumInputs * 10); -BENCHMARK_CAPTURE(BM_erase_iter_at_start, vector_string, std::vector<std::string>{}, 
getRandomStringInputs) - ->Range(TestNumInputs, TestNumInputs * 10); - -template <class T> -void bm_grow(benchmark::State& state) { - for (auto _ : state) { - std::vector<T> vec; - benchmark::DoNotOptimize(vec); - for (size_t i = 0; i != 2048; ++i) - vec.emplace_back(); - benchmark::DoNotOptimize(vec); - } -} -BENCHMARK(bm_grow<int>); -BENCHMARK(bm_grow<std::string>); -BENCHMARK(bm_grow<std::unique_ptr<int>>); -BENCHMARK(bm_grow<std::deque<int>>); - -BENCHMARK_CAPTURE(BM_AssignInputIterIter, vector_int, std::vector<int>{}, getRandomIntegerInputs<int>) - ->Args({TestNumInputs, TestNumInputs}); - -BENCHMARK_CAPTURE( - BM_AssignInputIterIter<32>, vector_string, std::vector<std::string>{}, getRandomStringInputsWithLength) - ->Args({TestNumInputs, TestNumInputs}); - -BENCHMARK_CAPTURE(BM_AssignInputIterIter<100>, - vector_vector_int, - std::vector<std::vector<int>>{}, - getRandomIntegerInputsWithLength<int>) - ->Args({TestNumInputs, TestNumInputs}); - -BENCHMARK_CAPTURE(BM_Insert_InputIterIter_NoRealloc, vector_int, std::vector<int>(100, 1), getRandomIntegerInputs<int>) - ->Arg(514048); -BENCHMARK_CAPTURE( - BM_Insert_InputIterIter_Realloc_HalfFilled, vector_int, std::vector<int>{}, getRandomIntegerInputs<int>) - ->Arg(514048); -BENCHMARK_CAPTURE(BM_Insert_InputIterIter_Realloc_NearFull, vector_int, std::vector<int>{}, getRandomIntegerInputs<int>) - ->Arg(514048); -BENCHMARK_CAPTURE( - BM_Insert_InputIterIter_Realloc_HalfFilled, vector_string, std::vector<std::string>{}, getSSORandomStringInputs) - ->Arg(514048); -BENCHMARK_CAPTURE( - BM_Insert_InputIterIter_Realloc_NearFull, vector_string, std::vector<std::string>{}, getSSORandomStringInputs) - ->Arg(514048); - -BENCHMARK_MAIN(); diff --git libcxx/test/benchmarks/filesystem.bench.cpp libcxx/test/benchmarks/filesystem.bench.cpp index 83a87c86d3de..dc6b0ac537f7 100644 --- libcxx/test/benchmarks/filesystem.bench.cpp +++ libcxx/test/benchmarks/filesystem.bench.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// UNSUPPORTED: c++03, c++11, c++14 +// UNSUPPORTED: c++03, c++11, c++14, c++17 #include <filesystem> diff --git libcxx/test/benchmarks/hash.bench.cpp libcxx/test/benchmarks/hash.bench.cpp index 1e1a0f36ec11..ca958765dc21 100644 --- libcxx/test/benchmarks/hash.bench.cpp +++ libcxx/test/benchmarks/hash.bench.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// UNSUPPORTED: c++03 +// UNSUPPORTED: c++03, c++11, c++14, c++17 #include <cstdint> #include <cstddef> diff --git libcxx/test/benchmarks/variant_visit_1.bench.cpp libcxx/test/benchmarks/variant_visit_1.bench.cpp index 42b22aabaee0..f1b702530bed 100644 --- libcxx/test/benchmarks/variant_visit_1.bench.cpp +++ libcxx/test/benchmarks/variant_visit_1.bench.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// UNSUPPORTED: c++03, c++11, c++14 +// UNSUPPORTED: c++03, c++11, c++14, c++17 #include "benchmark/benchmark.h" diff --git libcxx/test/benchmarks/variant_visit_2.bench.cpp libcxx/test/benchmarks/variant_visit_2.bench.cpp index 328048cabc44..7dd8d02b358b 100644 --- libcxx/test/benchmarks/variant_visit_2.bench.cpp +++ libcxx/test/benchmarks/variant_visit_2.bench.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// UNSUPPORTED: c++03, c++11, c++14 +// UNSUPPORTED: c++03, c++11, c++14, c++17 #include "benchmark/benchmark.h" diff --git 
libcxx/test/benchmarks/variant_visit_3.bench.cpp libcxx/test/benchmarks/variant_visit_3.bench.cpp index 40f8c1b5fa26..0fe42b0d8e00 100644 --- libcxx/test/benchmarks/variant_visit_3.bench.cpp +++ libcxx/test/benchmarks/variant_visit_3.bench.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// UNSUPPORTED: c++03, c++11, c++14 +// UNSUPPORTED: c++03, c++11, c++14, c++17 #include "benchmark/benchmark.h" diff --git libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy.pass.cpp libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy.pass.cpp index b5f0a32b986a..1ca397c92a33 100644 --- libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy.pass.cpp +++ libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy.pass.cpp @@ -14,6 +14,7 @@ #include <algorithm> #include <cassert> +#include <vector> #include "test_macros.h" #include "test_iterators.h" @@ -59,6 +60,26 @@ struct TestInIters { } }; +TEST_CONSTEXPR_CXX20 bool test_vector_bool(std::size_t N) { + std::vector<bool> in(N, false); + for (std::size_t i = 0; i < N; i += 2) + in[i] = true; + + { // Test copy with aligned bytes + std::vector<bool> out(N); + std::copy(in.begin(), in.end(), out.begin()); + assert(in == out); + } + { // Test copy with unaligned bytes + std::vector<bool> out(N + 8); + std::copy(in.begin(), in.end(), out.begin() + 4); + for (std::size_t i = 0; i < N; ++i) + assert(out[i + 4] == in[i]); + } + + return true; +} + TEST_CONSTEXPR_CXX20 bool test() { types::for_each(types::cpp17_input_iterator_list<int*>(), TestInIters()); @@ -78,13 +99,23 @@ TEST_CONSTEXPR_CXX20 bool test() { assert(std::equal(a, a + 10, expected)); } + { // Test vector<bool>::iterator optimization + assert(test_vector_bool(8)); + assert(test_vector_bool(19)); + assert(test_vector_bool(32)); + assert(test_vector_bool(49)); + assert(test_vector_bool(64)); + assert(test_vector_bool(199)); + assert(test_vector_bool(256)); + } + return true; } int main(int, char**) { test(); -#if TEST_STD_VER > 17 +#if TEST_STD_VER >= 20 static_assert(test()); #endif diff --git libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy_backward.pass.cpp libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy_backward.pass.cpp index 928903de1ade..445c7718e111 100644 --- libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy_backward.pass.cpp +++ libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy_backward.pass.cpp @@ -15,6 +15,7 @@ #include <algorithm> #include <cassert> +#include <vector> #include "test_macros.h" #include "test_iterators.h" @@ -36,47 +37,63 @@ public: }; template <class InIter, class OutIter> -TEST_CONSTEXPR_CXX20 void -test_copy_backward() -{ +TEST_CONSTEXPR_CXX20 void test_copy_backward() { { const unsigned N = 1000; - int ia[N] = {}; + int ia[N] = {}; for (unsigned i = 0; i < N; ++i) - ia[i] = i; + ia[i] = i; int ib[N] = {0}; - OutIter r = std::copy_backward(InIter(ia), InIter(ia+N), OutIter(ib+N)); + OutIter r = std::copy_backward(InIter(ia), InIter(ia + N), OutIter(ib + N)); assert(base(r) == ib); for (unsigned i = 0; i < N; ++i) - assert(ia[i] == ib[i]); + assert(ia[i] == ib[i]); } } -TEST_CONSTEXPR_CXX20 bool -test() -{ - test_copy_backward<bidirectional_iterator<const int*>, bidirectional_iterator<int*> >(); - test_copy_backward<bidirectional_iterator<const int*>, random_access_iterator<int*> >(); - test_copy_backward<bidirectional_iterator<const int*>, int*>(); +TEST_CONSTEXPR_CXX20 bool test_vector_bool(std::size_t 
N) { + std::vector<bool> in(N, false); + for (std::size_t i = 0; i < N; i += 2) + in[i] = true; - test_copy_backward<random_access_iterator<const int*>, bidirectional_iterator<int*> >(); - test_copy_backward<random_access_iterator<const int*>, random_access_iterator<int*> >(); - test_copy_backward<random_access_iterator<const int*>, int*>(); + { // Test copy_backward with aligned bytes + std::vector<bool> out(N); + std::copy_backward(in.begin(), in.end(), out.end()); + assert(in == out); + } + { // Test copy_backward with unaligned bytes + std::vector<bool> out(N + 8); + std::copy_backward(in.begin(), in.end(), out.end() - 4); + for (std::size_t i = 0; i < N; ++i) + assert(out[i + 4] == in[i]); + } + + return true; +}; - test_copy_backward<const int*, bidirectional_iterator<int*> >(); - test_copy_backward<const int*, random_access_iterator<int*> >(); - test_copy_backward<const int*, int*>(); +TEST_CONSTEXPR_CXX20 bool test() { + test_copy_backward<bidirectional_iterator<const int*>, bidirectional_iterator<int*> >(); + test_copy_backward<bidirectional_iterator<const int*>, random_access_iterator<int*> >(); + test_copy_backward<bidirectional_iterator<const int*>, int*>(); + + test_copy_backward<random_access_iterator<const int*>, bidirectional_iterator<int*> >(); + test_copy_backward<random_access_iterator<const int*>, random_access_iterator<int*> >(); + test_copy_backward<random_access_iterator<const int*>, int*>(); + + test_copy_backward<const int*, bidirectional_iterator<int*> >(); + test_copy_backward<const int*, random_access_iterator<int*> >(); + test_copy_backward<const int*, int*>(); #if TEST_STD_VER > 17 - test_copy_backward<contiguous_iterator<const int*>, bidirectional_iterator<int*>>(); - test_copy_backward<contiguous_iterator<const int*>, random_access_iterator<int*>>(); - test_copy_backward<contiguous_iterator<const int*>, int*>(); - - test_copy_backward<bidirectional_iterator<const int*>, contiguous_iterator<int*>>(); - test_copy_backward<random_access_iterator<const int*>, contiguous_iterator<int*>>(); - test_copy_backward<contiguous_iterator<const int*>, contiguous_iterator<int*>>(); - test_copy_backward<const int*, contiguous_iterator<int*>>(); + test_copy_backward<contiguous_iterator<const int*>, bidirectional_iterator<int*>>(); + test_copy_backward<contiguous_iterator<const int*>, random_access_iterator<int*>>(); + test_copy_backward<contiguous_iterator<const int*>, int*>(); + + test_copy_backward<bidirectional_iterator<const int*>, contiguous_iterator<int*>>(); + test_copy_backward<random_access_iterator<const int*>, contiguous_iterator<int*>>(); + test_copy_backward<contiguous_iterator<const int*>, contiguous_iterator<int*>>(); + test_copy_backward<const int*, contiguous_iterator<int*>>(); #endif { // Make sure that padding bits aren't copied @@ -96,15 +113,24 @@ test() assert(std::equal(a, a + 10, expected)); } - return true; + { // Test vector<bool>::iterator optimization + assert(test_vector_bool(8)); + assert(test_vector_bool(19)); + assert(test_vector_bool(32)); + assert(test_vector_bool(49)); + assert(test_vector_bool(64)); + assert(test_vector_bool(199)); + assert(test_vector_bool(256)); + } + + return true; } -int main(int, char**) -{ - test(); +int main(int, char**) { + test(); #if TEST_STD_VER > 17 - static_assert(test()); + static_assert(test()); #endif return 0; diff --git libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy_n.pass.cpp libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy_n.pass.cpp index b0acc1060101..889e71f4eceb 
100644 --- libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy_n.pass.cpp +++ libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy_n.pass.cpp @@ -14,6 +14,7 @@ #include <algorithm> #include <cassert> +#include <vector> #include "test_macros.h" #include "test_iterators.h" @@ -37,20 +38,18 @@ public: }; template <class InIter, class OutIter> -TEST_CONSTEXPR_CXX20 void -test_copy_n() -{ +TEST_CONSTEXPR_CXX20 void test_copy_n() { { const unsigned N = 1000; - int ia[N] = {}; + int ia[N] = {}; for (unsigned i = 0; i < N; ++i) - ia[i] = i; + ia[i] = i; int ib[N] = {0}; - OutIter r = std::copy_n(InIter(ia), UDI(N/2), OutIter(ib)); - assert(base(r) == ib+N/2); - for (unsigned i = 0; i < N/2; ++i) - assert(ia[i] == ib[i]); + OutIter r = std::copy_n(InIter(ia), UDI(N / 2), OutIter(ib)); + assert(base(r) == ib + N / 2); + for (unsigned i = 0; i < N / 2; ++i) + assert(ia[i] == ib[i]); } { // Make sure that padding bits aren't copied @@ -70,53 +69,80 @@ test_copy_n() } } -TEST_CONSTEXPR_CXX20 bool -test() -{ - test_copy_n<cpp17_input_iterator<const int*>, cpp17_output_iterator<int*> >(); - test_copy_n<cpp17_input_iterator<const int*>, cpp17_input_iterator<int*> >(); - test_copy_n<cpp17_input_iterator<const int*>, forward_iterator<int*> >(); - test_copy_n<cpp17_input_iterator<const int*>, bidirectional_iterator<int*> >(); - test_copy_n<cpp17_input_iterator<const int*>, random_access_iterator<int*> >(); - test_copy_n<cpp17_input_iterator<const int*>, int*>(); - - test_copy_n<forward_iterator<const int*>, cpp17_output_iterator<int*> >(); - test_copy_n<forward_iterator<const int*>, cpp17_input_iterator<int*> >(); - test_copy_n<forward_iterator<const int*>, forward_iterator<int*> >(); - test_copy_n<forward_iterator<const int*>, bidirectional_iterator<int*> >(); - test_copy_n<forward_iterator<const int*>, random_access_iterator<int*> >(); - test_copy_n<forward_iterator<const int*>, int*>(); - - test_copy_n<bidirectional_iterator<const int*>, cpp17_output_iterator<int*> >(); - test_copy_n<bidirectional_iterator<const int*>, cpp17_input_iterator<int*> >(); - test_copy_n<bidirectional_iterator<const int*>, forward_iterator<int*> >(); - test_copy_n<bidirectional_iterator<const int*>, bidirectional_iterator<int*> >(); - test_copy_n<bidirectional_iterator<const int*>, random_access_iterator<int*> >(); - test_copy_n<bidirectional_iterator<const int*>, int*>(); - - test_copy_n<random_access_iterator<const int*>, cpp17_output_iterator<int*> >(); - test_copy_n<random_access_iterator<const int*>, cpp17_input_iterator<int*> >(); - test_copy_n<random_access_iterator<const int*>, forward_iterator<int*> >(); - test_copy_n<random_access_iterator<const int*>, bidirectional_iterator<int*> >(); - test_copy_n<random_access_iterator<const int*>, random_access_iterator<int*> >(); - test_copy_n<random_access_iterator<const int*>, int*>(); - - test_copy_n<const int*, cpp17_output_iterator<int*> >(); - test_copy_n<const int*, cpp17_input_iterator<int*> >(); - test_copy_n<const int*, forward_iterator<int*> >(); - test_copy_n<const int*, bidirectional_iterator<int*> >(); - test_copy_n<const int*, random_access_iterator<int*> >(); - test_copy_n<const int*, int*>(); +TEST_CONSTEXPR_CXX20 bool test_vector_bool(std::size_t N) { + std::vector<bool> in(N, false); + for (std::size_t i = 0; i < N; i += 2) + in[i] = true; + + { // Test copy with aligned bytes + std::vector<bool> out(N); + std::copy_n(in.begin(), N, out.begin()); + assert(in == out); + } + { // Test copy with unaligned bytes + std::vector<bool> out(N + 
8); + std::copy_n(in.begin(), N, out.begin() + 4); + for (std::size_t i = 0; i < N; ++i) + assert(out[i + 4] == in[i]); + } + + return true; +} + +TEST_CONSTEXPR_CXX20 bool test() { + test_copy_n<cpp17_input_iterator<const int*>, cpp17_output_iterator<int*> >(); + test_copy_n<cpp17_input_iterator<const int*>, cpp17_input_iterator<int*> >(); + test_copy_n<cpp17_input_iterator<const int*>, forward_iterator<int*> >(); + test_copy_n<cpp17_input_iterator<const int*>, bidirectional_iterator<int*> >(); + test_copy_n<cpp17_input_iterator<const int*>, random_access_iterator<int*> >(); + test_copy_n<cpp17_input_iterator<const int*>, int*>(); + + test_copy_n<forward_iterator<const int*>, cpp17_output_iterator<int*> >(); + test_copy_n<forward_iterator<const int*>, cpp17_input_iterator<int*> >(); + test_copy_n<forward_iterator<const int*>, forward_iterator<int*> >(); + test_copy_n<forward_iterator<const int*>, bidirectional_iterator<int*> >(); + test_copy_n<forward_iterator<const int*>, random_access_iterator<int*> >(); + test_copy_n<forward_iterator<const int*>, int*>(); + + test_copy_n<bidirectional_iterator<const int*>, cpp17_output_iterator<int*> >(); + test_copy_n<bidirectional_iterator<const int*>, cpp17_input_iterator<int*> >(); + test_copy_n<bidirectional_iterator<const int*>, forward_iterator<int*> >(); + test_copy_n<bidirectional_iterator<const int*>, bidirectional_iterator<int*> >(); + test_copy_n<bidirectional_iterator<const int*>, random_access_iterator<int*> >(); + test_copy_n<bidirectional_iterator<const int*>, int*>(); + + test_copy_n<random_access_iterator<const int*>, cpp17_output_iterator<int*> >(); + test_copy_n<random_access_iterator<const int*>, cpp17_input_iterator<int*> >(); + test_copy_n<random_access_iterator<const int*>, forward_iterator<int*> >(); + test_copy_n<random_access_iterator<const int*>, bidirectional_iterator<int*> >(); + test_copy_n<random_access_iterator<const int*>, random_access_iterator<int*> >(); + test_copy_n<random_access_iterator<const int*>, int*>(); + + test_copy_n<const int*, cpp17_output_iterator<int*> >(); + test_copy_n<const int*, cpp17_input_iterator<int*> >(); + test_copy_n<const int*, forward_iterator<int*> >(); + test_copy_n<const int*, bidirectional_iterator<int*> >(); + test_copy_n<const int*, random_access_iterator<int*> >(); + test_copy_n<const int*, int*>(); + + { // Test vector<bool>::iterator optimization + assert(test_vector_bool(8)); + assert(test_vector_bool(19)); + assert(test_vector_bool(32)); + assert(test_vector_bool(49)); + assert(test_vector_bool(64)); + assert(test_vector_bool(199)); + assert(test_vector_bool(256)); + } return true; } -int main(int, char**) -{ - test(); +int main(int, char**) { + test(); #if TEST_STD_VER > 17 - static_assert(test()); + static_assert(test()); #endif return 0; diff --git libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/ranges.copy.pass.cpp libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/ranges.copy.pass.cpp index 2507e594fe94..68356c80ba7f 100644 --- libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/ranges.copy.pass.cpp +++ libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/ranges.copy.pass.cpp @@ -26,6 +26,7 @@ #include "almost_satisfies_types.h" #include "test_iterators.h" +#include "test_macros.h" #include "type_algorithms.h" template <class In, class Out = In, class Sent = sentinel_wrapper<In>> @@ -99,6 +100,28 @@ constexpr void test_iterators() { } // clang-format on +#if TEST_STD_VER >= 23 +constexpr bool test_vector_bool(std::size_t N) { + 
std::vector<bool> in(N, false); + for (std::size_t i = 0; i < N; i += 2) + in[i] = true; + + { // Test copy with aligned bytes + std::vector<bool> out(N); + std::ranges::copy(in, out.begin()); + assert(in == out); + } + { // Test copy with unaligned bytes + std::vector<bool> out(N + 8); + std::ranges::copy(in, out.begin() + 4); + for (std::size_t i = 0; i < N; ++i) + assert(out[i + 4] == in[i]); + } + + return true; +} +#endif + constexpr bool test() { types::for_each(types::forward_iterator_list<int*>{}, []<class Out>() { test_iterators<cpp20_input_iterator<int*>, Out, sentinel_wrapper<cpp20_input_iterator<int*>>>(); @@ -204,6 +227,18 @@ constexpr bool test() { } } +#if TEST_STD_VER >= 23 + { // Test vector<bool>::iterator optimization + assert(test_vector_bool(8)); + assert(test_vector_bool(19)); + assert(test_vector_bool(32)); + assert(test_vector_bool(49)); + assert(test_vector_bool(64)); + assert(test_vector_bool(199)); + assert(test_vector_bool(256)); + } +#endif + return true; } diff --git libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/ranges.copy_backward.pass.cpp libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/ranges.copy_backward.pass.cpp index 343447446ab2..a7fa3db23e6b 100644 --- libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/ranges.copy_backward.pass.cpp +++ libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/ranges.copy_backward.pass.cpp @@ -29,6 +29,7 @@ #include "almost_satisfies_types.h" #include "test_iterators.h" +#include "test_macros.h" template <class In, class Out = In, class Sent = sentinel_wrapper<In>> concept HasCopyBackwardIt = requires(In in, Sent sent, Out out) { std::ranges::copy_backward(in, sent, out); }; @@ -61,16 +62,16 @@ template <class In, class Out, class Sent> constexpr void test_iterators() { { // simple test { - std::array in {1, 2, 3, 4}; + std::array in{1, 2, 3, 4}; std::array<int, 4> out; std::same_as<std::ranges::in_out_result<In, Out>> auto ret = - std::ranges::copy_backward(In(in.data()), Sent(In(in.data() + in.size())), Out(out.data() + out.size())); + std::ranges::copy_backward(In(in.data()), Sent(In(in.data() + in.size())), Out(out.data() + out.size())); assert(in == out); assert(base(ret.in) == in.data() + in.size()); assert(base(ret.out) == out.data()); } { - std::array in {1, 2, 3, 4}; + std::array in{1, 2, 3, 4}; std::array<int, 4> out; auto range = std::ranges::subrange(In(in.data()), Sent(In(in.data() + in.size()))); std::same_as<std::ranges::in_out_result<In, Out>> auto ret = @@ -94,7 +95,7 @@ constexpr void test_iterators() { std::array<int, 0> in; std::array<int, 0> out; auto range = std::ranges::subrange(In(in.data()), Sent(In(in.data() + in.size()))); - auto ret = std::ranges::copy_backward(range, Out(out.data())); + auto ret = std::ranges::copy_backward(range, Out(out.data())); assert(base(ret.in) == in.data() + in.size()); assert(base(ret.out) == out.data()); } @@ -104,16 +105,16 @@ constexpr void test_iterators() { template <class InContainer, class OutContainer, class In, class Out, class Sent = In> constexpr void test_containers() { { - InContainer in {1, 2, 3, 4}; + InContainer in{1, 2, 3, 4}; OutContainer out(4); std::same_as<std::ranges::in_out_result<In, Out>> auto ret = - std::ranges::copy_backward(In(in.begin()), Sent(In(in.end())), Out(out.end())); + std::ranges::copy_backward(In(in.begin()), Sent(In(in.end())), Out(out.end())); assert(std::ranges::equal(in, out)); assert(base(ret.in) == in.end()); assert(base(ret.out) == out.begin()); } { - InContainer in {1, 2, 3, 
diff --git libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/ranges.copy_backward.pass.cpp libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/ranges.copy_backward.pass.cpp
index 343447446ab2..a7fa3db23e6b 100644
--- libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/ranges.copy_backward.pass.cpp
+++ libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/ranges.copy_backward.pass.cpp
@@ -29,6 +29,7 @@

 #include "almost_satisfies_types.h"
 #include "test_iterators.h"
+#include "test_macros.h"

 template <class In, class Out = In, class Sent = sentinel_wrapper<In>>
 concept HasCopyBackwardIt = requires(In in, Sent sent, Out out) { std::ranges::copy_backward(in, sent, out); };
@@ -61,16 +62,16 @@ template <class In, class Out, class Sent>
 constexpr void test_iterators() {
   { // simple test
     {
-      std::array in {1, 2, 3, 4};
+      std::array in{1, 2, 3, 4};
       std::array<int, 4> out;
       std::same_as<std::ranges::in_out_result<In, Out>> auto ret =
-        std::ranges::copy_backward(In(in.data()), Sent(In(in.data() + in.size())), Out(out.data() + out.size()));
+          std::ranges::copy_backward(In(in.data()), Sent(In(in.data() + in.size())), Out(out.data() + out.size()));
       assert(in == out);
       assert(base(ret.in) == in.data() + in.size());
       assert(base(ret.out) == out.data());
     }
     {
-      std::array in {1, 2, 3, 4};
+      std::array in{1, 2, 3, 4};
       std::array<int, 4> out;
       auto range = std::ranges::subrange(In(in.data()), Sent(In(in.data() + in.size())));
       std::same_as<std::ranges::in_out_result<In, Out>> auto ret =
@@ -94,7 +95,7 @@ constexpr void test_iterators() {
     std::array<int, 0> in;
     std::array<int, 0> out;
     auto range = std::ranges::subrange(In(in.data()), Sent(In(in.data() + in.size())));
-    auto ret = std::ranges::copy_backward(range, Out(out.data()));
+    auto ret   = std::ranges::copy_backward(range, Out(out.data()));
     assert(base(ret.in) == in.data() + in.size());
     assert(base(ret.out) == out.data());
   }
@@ -104,16 +105,16 @@ template <class InContainer, class OutContainer, class In, class Out, class Sen
 constexpr void test_containers() {
   {
-    InContainer in {1, 2, 3, 4};
+    InContainer in{1, 2, 3, 4};
     OutContainer out(4);
     std::same_as<std::ranges::in_out_result<In, Out>> auto ret =
-      std::ranges::copy_backward(In(in.begin()), Sent(In(in.end())), Out(out.end()));
+        std::ranges::copy_backward(In(in.begin()), Sent(In(in.end())), Out(out.end()));
     assert(std::ranges::equal(in, out));
     assert(base(ret.in) == in.end());
     assert(base(ret.out) == out.begin());
   }
   {
-    InContainer in {1, 2, 3, 4};
+    InContainer in{1, 2, 3, 4};
     OutContainer out(4);
     auto range = std::ranges::subrange(In(in.begin()), Sent(In(in.end())));
     std::same_as<std::ranges::in_out_result<In, Out>> auto ret = std::ranges::copy_backward(range, Out(out.end()));
@@ -125,13 +126,12 @@ constexpr void test_containers() {

 template <class Iter, class Sent>
 constexpr void test_join_view() {
-  auto to_subranges = std::views::transform([](auto& vec) {
-    return std::ranges::subrange(Iter(vec.begin()), Sent(Iter(vec.end())));
-  });
+  auto to_subranges =
+      std::views::transform([](auto& vec) { return std::ranges::subrange(Iter(vec.begin()), Sent(Iter(vec.end()))); });

   { // segmented -> contiguous
     std::vector<std::vector<int>> vectors = {};
-    auto range = vectors | to_subranges;
+    auto range                            = vectors | to_subranges;
     std::vector<std::ranges::subrange<Iter, Sent>> subrange_vector(range.begin(), range.end());
     std::array<int, 0> arr;

@@ -140,7 +140,7 @@ constexpr void test_join_view() {
   }
   { // segmented -> contiguous
     std::vector<std::vector<int>> vectors = {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10}, {}};
-    auto range = vectors | to_subranges;
+    auto range                            = vectors | to_subranges;
     std::vector<std::ranges::subrange<Iter, Sent>> subrange_vector(range.begin(), range.end());
     std::array<int, 10> arr;

@@ -149,7 +149,7 @@ constexpr void test_join_view() {
   }
   { // contiguous -> segmented
     std::vector<std::vector<int>> vectors = {{0, 0, 0, 0}, {0, 0}, {0, 0, 0, 0}, {}};
-    auto range = vectors | to_subranges;
+    auto range                            = vectors | to_subranges;
     std::vector<std::ranges::subrange<Iter, Sent>> subrange_vector(range.begin(), range.end());
     std::array arr = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};

@@ -158,10 +158,10 @@ constexpr void test_join_view() {
   }
   { // segmented -> segmented
     std::vector<std::vector<int>> vectors = {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10}, {}};
-    auto range1 = vectors | to_subranges;
+    auto range1                           = vectors | to_subranges;
     std::vector<std::ranges::subrange<Iter, Sent>> subrange_vector(range1.begin(), range1.end());
     std::vector<std::vector<int>> to_vectors = {{0, 0, 0, 0}, {0, 0, 0, 0}, {}, {0, 0}};
-    auto range2 = to_vectors | to_subranges;
+    auto range2                              = to_vectors | to_subranges;
     std::vector<std::ranges::subrange<Iter, Sent>> to_subrange_vector(range2.begin(), range2.end());

     std::ranges::copy_backward(subrange_vector | std::views::join, (to_subrange_vector | std::views::join).end());
@@ -224,6 +224,30 @@ constexpr void test_proxy_in_iterators() {
   test_sentinels<ProxyIterator, Out>();
 }

+#if TEST_STD_VER >= 23
+
+constexpr bool test_vector_bool(std::size_t N) {
+  std::vector<bool> in(N, false);
+  for (std::size_t i = 0; i < N; i += 2)
+    in[i] = true;
+
+  { // Test copy_backward with aligned bytes
+    std::vector<bool> out(N);
+    std::ranges::copy_backward(in, out.end());
+    assert(in == out);
+  }
+  { // Test copy_backward with unaligned bytes
+    std::vector<bool> out(N + 8);
+    std::ranges::copy_backward(in, out.end() - 4);
+    for (std::size_t i = 0; i < N; ++i)
+      assert(out[i + 4] == in[i]);
+  }
+
+  return true;
+}
+
+#endif
+
 constexpr bool test() {
   test_in_iterators<bidirectional_iterator>();
   test_in_iterators<random_access_iterator>();
@@ -237,13 +261,13 @@ constexpr bool test() {
   { // check that ranges::dangling is returned
     std::array<int, 4> out;
     std::same_as<std::ranges::in_out_result<std::ranges::dangling, int*>> auto ret =
-        std::ranges::copy_backward(std::array {1, 2, 3, 4}, out.data() + out.size());
+        std::ranges::copy_backward(std::array{1, 2, 3, 4}, out.data() + out.size());
     assert(ret.out == out.data());
     assert((out == std::array{1, 2, 3, 4}));
   }

   { // check that an iterator is returned with a borrowing range
-    std::array in {1, 2, 3, 4};
+    std::array in{1, 2, 3, 4};
     std::array<int, 4> out;
     std::same_as<std::ranges::in_out_result<std::array<int, 4>::iterator, int*>> auto ret =
         std::ranges::copy_backward(std::views::all(in), out.data() + out.size());
@@ -254,8 +278,8 @@ constexpr bool test() {

   { // check that every element is copied exactly once
     struct CopyOnce {
-      bool copied = false;
-      constexpr CopyOnce() = default;
+      bool copied          = false;
+      constexpr CopyOnce() = default;
       constexpr CopyOnce(const CopyOnce& other) = delete;
       constexpr CopyOnce& operator=(const CopyOnce& other) {
         assert(!other.copied);
@@ -264,16 +288,16 @@ constexpr bool test() {
       }
     };
     {
-      std::array<CopyOnce, 4> in {};
-      std::array<CopyOnce, 4> out {};
+      std::array<CopyOnce, 4> in{};
+      std::array<CopyOnce, 4> out{};
       auto ret = std::ranges::copy_backward(in.begin(), in.end(), out.end());
       assert(ret.in == in.end());
       assert(ret.out == out.begin());
       assert(std::all_of(out.begin(), out.end(), [](const auto& e) { return e.copied; }));
     }
     {
-      std::array<CopyOnce, 4> in {};
-      std::array<CopyOnce, 4> out {};
+      std::array<CopyOnce, 4> in{};
+      std::array<CopyOnce, 4> out{};
       auto ret = std::ranges::copy_backward(in, out.end());
       assert(ret.in == in.end());
       assert(ret.out == out.begin());
@@ -284,8 +308,8 @@ constexpr bool test() {

   { // check that the range is copied backwards
     struct OnlyBackwardsCopyable {
       OnlyBackwardsCopyable* next = nullptr;
-      bool canCopy = false;
-      OnlyBackwardsCopyable() = default;
+      bool canCopy                = false;
+      OnlyBackwardsCopyable()     = default;
       constexpr OnlyBackwardsCopyable& operator=(const OnlyBackwardsCopyable&) {
         assert(canCopy);
         if (next != nullptr)
@@ -294,12 +318,12 @@ constexpr bool test() {
       }
     };
     {
-      std::array<OnlyBackwardsCopyable, 3> in {};
-      std::array<OnlyBackwardsCopyable, 3> out {};
-      out[1].next = &out[0];
-      out[2].next = &out[1];
+      std::array<OnlyBackwardsCopyable, 3> in{};
+      std::array<OnlyBackwardsCopyable, 3> out{};
+      out[1].next    = &out[0];
+      out[2].next    = &out[1];
       out[2].canCopy = true;
-      auto ret = std::ranges::copy_backward(in, out.end());
+      auto ret       = std::ranges::copy_backward(in, out.end());
       assert(ret.in == in.end());
       assert(ret.out == out.begin());
       assert(out[0].canCopy);
@@ -307,12 +331,12 @@ constexpr bool test() {
       assert(out[2].canCopy);
     }
     {
-      std::array<OnlyBackwardsCopyable, 3> in {};
-      std::array<OnlyBackwardsCopyable, 3> out {};
-      out[1].next = &out[0];
-      out[2].next = &out[1];
+      std::array<OnlyBackwardsCopyable, 3> in{};
+      std::array<OnlyBackwardsCopyable, 3> out{};
+      out[1].next    = &out[0];
+      out[2].next    = &out[1];
       out[2].canCopy = true;
-      auto ret = std::ranges::copy_backward(in.begin(), in.end(), out.end());
+      auto ret       = std::ranges::copy_backward(in.begin(), in.end(), out.end());
       assert(ret.in == in.end());
       assert(ret.out == out.begin());
       assert(out[0].canCopy);
@@ -321,6 +345,18 @@ constexpr bool test() {
     }
   }

+#if TEST_STD_VER >= 23
+  { // Test vector<bool>::iterator optimization
+    assert(test_vector_bool(8));
+    assert(test_vector_bool(19));
+    assert(test_vector_bool(32));
+    assert(test_vector_bool(49));
+    assert(test_vector_bool(64));
+    assert(test_vector_bool(199));
+    assert(test_vector_bool(256));
+  }
+#endif
+
   return true;
 }
diff --git libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/ranges.copy_n.pass.cpp libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/ranges.copy_n.pass.cpp
index d2a2b7c48883..c7031f63a02f 100644
--- libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/ranges.copy_n.pass.cpp
+++ libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/ranges.copy_n.pass.cpp
@@ -19,8 +19,10 @@
 #include <array>
 #include <cassert>
 #include <ranges>
+#include <vector>

 #include "almost_satisfies_types.h"
+#include "test_macros.h"
 #include "test_iterators.h"

 template <class In, class Out = In, class Count = std::size_t>
@@ -41,10 +43,10 @@ static_assert(std::is_same_v<std::ranges::copy_result<int, long>, std::ranges::i
 template <class In, class Out, class Sent = In>
 constexpr void test_iterators() {
   { // simple test
-    std::array in {1, 2, 3, 4};
+    std::array in{1, 2, 3, 4};
     std::array<int, 4> out;
     std::same_as<std::ranges::in_out_result<In, Out>> auto ret =
-      std::ranges::copy_n(In(in.data()), in.size(), Out(out.data()));
+        std::ranges::copy_n(In(in.data()), in.size(), Out(out.data()));
     assert(in == out);
     assert(base(ret.in) == in.data() + in.size());
     assert(base(ret.out) == out.data() + out.size());
@@ -70,13 +72,37 @@ constexpr void test_in_iterators() {

 template <class Out>
 constexpr void test_proxy_in_iterators() {
-  test_iterators<ProxyIterator<cpp20_input_iterator<int*>>, Out, sentinel_wrapper<ProxyIterator<cpp20_input_iterator<int*>>>>();
+  test_iterators<ProxyIterator<cpp20_input_iterator<int*>>,
+                 Out,
+                 sentinel_wrapper<ProxyIterator<cpp20_input_iterator<int*>>>>();
   test_iterators<ProxyIterator<forward_iterator<int*>>, Out>();
   test_iterators<ProxyIterator<bidirectional_iterator<int*>>, Out>();
   test_iterators<ProxyIterator<random_access_iterator<int*>>, Out>();
   test_iterators<ProxyIterator<contiguous_iterator<int*>>, Out>();
 }

+#if TEST_STD_VER >= 23
+constexpr bool test_vector_bool(std::size_t N) {
+  std::vector<bool> in(N, false);
+  for (std::size_t i = 0; i < N; i += 2)
+    in[i] = true;
+
+  { // Test copy with aligned bytes
+    std::vector<bool> out(N);
+    std::ranges::copy_n(in.begin(), N, out.begin());
+    assert(in == out);
+  }
+  { // Test copy with unaligned bytes
+    std::vector<bool> out(N + 8);
+    std::ranges::copy_n(in.begin(), N, out.begin() + 4);
+    for (std::size_t i = 0; i < N; ++i)
+      assert(out[i + 4] == in[i]);
+  }
+
+  return true;
+}
+#endif
+
 constexpr bool test() {
   test_in_iterators<cpp20_input_iterator<int*>>();
   test_in_iterators<forward_iterator<int*>>();
@@ -92,8 +118,8 @@ constexpr bool test() {

   { // check that every element is copied exactly once
     struct CopyOnce {
-      bool copied = false;
-      constexpr CopyOnce() = default;
+      bool copied          = false;
+      constexpr CopyOnce() = default;
       constexpr CopyOnce(const CopyOnce& other) = delete;
       constexpr CopyOnce& operator=(const CopyOnce& other) {
         assert(!other.copied);
@@ -101,14 +127,26 @@ constexpr bool test() {
         return *this;
       }
     };
-    std::array<CopyOnce, 4> in {};
-    std::array<CopyOnce, 4> out {};
+    std::array<CopyOnce, 4> in{};
+    std::array<CopyOnce, 4> out{};
     auto ret = std::ranges::copy_n(in.begin(), in.size(), out.begin());
     assert(ret.in == in.end());
     assert(ret.out == out.end());
     assert(std::all_of(out.begin(), out.end(), [](const auto& e) { return e.copied; }));
   }

+#if TEST_STD_VER >= 23
+  { // Test vector<bool>::iterator optimization
+    assert(test_vector_bool(8));
+    assert(test_vector_bool(19));
+    assert(test_vector_bool(32));
+    assert(test_vector_bool(49));
+    assert(test_vector_bool(64));
+    assert(test_vector_bool(199));
+    assert(test_vector_bool(256));
+  }
+#endif
+
   return true;
 }
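The batch of container tests that follows checks the noexcept specification of move assignment, which since C++17 hinges on the allocator: it must either propagate on container move assignment and be nothrow-move-assignable, or compare always-equal, and any comparator must be nothrow-move-assignable as well. A compile-only sketch with an illustrative stateful allocator (our own type, not part of the test suite; assumes C++20 for the derived operator!=):

#include <map>
#include <memory>
#include <type_traits>

// Stateful, non-propagating, not always-equal: move-assigning a map that uses
// it may have to reallocate and copy, so the operation cannot be noexcept.
template <class T>
struct StatefulAlloc {
  using value_type = T;
  int id = 0;
  StatefulAlloc() = default;
  template <class U>
  StatefulAlloc(const StatefulAlloc<U>& other) : id(other.id) {}
  T* allocate(std::size_t n) { return std::allocator<T>().allocate(n); }
  void deallocate(T* p, std::size_t n) { std::allocator<T>().deallocate(p, n); }
  friend bool operator==(const StatefulAlloc& a, const StatefulAlloc& b) { return a.id == b.id; }
};

static_assert(std::is_nothrow_move_assignable<std::map<int, int>>::value,
              "std::allocator is always-equal");
static_assert(!std::is_nothrow_move_assignable<
                  std::map<int, int, std::less<int>, StatefulAlloc<std::pair<const int, int>>>>::value,
              "a stateful, non-propagating allocator makes move assignment potentially throwing");

int main() { return 0; }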
diff --git libcxx/test/std/containers/associative/map/map.cons/move_assign_noexcept.compile.pass.cpp libcxx/test/std/containers/associative/map/map.cons/move_assign_noexcept.compile.pass.cpp
new file mode 100644
index 000000000000..a4c8ef1c5b42
--- /dev/null
+++ libcxx/test/std/containers/associative/map/map.cons/move_assign_noexcept.compile.pass.cpp
@@ -0,0 +1,61 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <map>
+
+// map& operator=(map&& c)
+//     noexcept(
+//          allocator_type::propagate_on_container_move_assignment::value &&
+//          is_nothrow_move_assignable<allocator_type>::value &&
+//          is_nothrow_move_assignable<key_compare>::value);
+
+// This tests a conforming extension
+
+// UNSUPPORTED: c++03
+
+#include <map>
+
+#include "test_macros.h"
+#include "MoveOnly.h"
+#include "test_allocator.h"
+
+template <class T>
+struct some_comp {
+  using value_type = T;
+  some_comp& operator=(const some_comp&);
+  bool operator()(const T&, const T&) const { return false; }
+};
+
+template <class T>
+struct always_equal_alloc {
+  using value_type = T;
+  always_equal_alloc(const always_equal_alloc&);
+  void allocate(std::size_t);
+};
+
+template <class T>
+struct not_always_equal_alloc {
+  int i;
+  using value_type = T;
+  not_always_equal_alloc(const not_always_equal_alloc&);
+  void allocate(std::size_t);
+};
+
+template <template <class> class Alloc>
+using multimap_alloc = std::map<MoveOnly, MoveOnly, std::less<MoveOnly>, Alloc<std::pair<const MoveOnly, MoveOnly>>>;
+
+static_assert(std::is_nothrow_move_assignable<multimap_alloc<std::allocator>>::value, "");
+static_assert(!std::is_nothrow_move_assignable<multimap_alloc<test_allocator>>::value, "");
+#if TEST_STD_VER >= 17
+static_assert(std::is_nothrow_move_assignable<multimap_alloc<always_equal_alloc>>::value, "");
+#endif
+static_assert(!std::is_nothrow_move_assignable<multimap_alloc<not_always_equal_alloc>>::value, "");
+#if defined(_LIBCPP_VERSION)
+static_assert(std::is_nothrow_move_assignable<multimap_alloc<other_allocator>>::value, "");
+#endif // _LIBCPP_VERSION
+static_assert(!std::is_nothrow_move_assignable<std::map<int, int, some_comp<int>>>::value, "");
diff --git libcxx/test/std/containers/associative/map/map.cons/move_assign_noexcept.pass.cpp libcxx/test/std/containers/associative/map/map.cons/move_assign_noexcept.pass.cpp
deleted file mode 100644
index 82a5dbac1261..000000000000
--- libcxx/test/std/containers/associative/map/map.cons/move_assign_noexcept.pass.cpp
+++ /dev/null
@@ -1,59 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <map>
-
-// map& operator=(map&& c)
-//     noexcept(
-//          allocator_type::propagate_on_container_move_assignment::value &&
-//          is_nothrow_move_assignable<allocator_type>::value &&
-//          is_nothrow_move_assignable<key_compare>::value);
-
-// This tests a conforming extension
-
-// UNSUPPORTED: c++03
-
-#include <map>
-#include <cassert>
-
-#include "test_macros.h"
-#include "MoveOnly.h"
-#include "test_allocator.h"
-
-template <class T>
-struct some_comp
-{
-    typedef T value_type;
-    some_comp& operator=(const some_comp&);
-    bool operator()(const T&, const T&) const { return false; }
-};
-
-int main(int, char**)
-{
-    typedef std::pair<const MoveOnly, MoveOnly> V;
-    {
-        typedef std::map<MoveOnly, MoveOnly> C;
-        static_assert(std::is_nothrow_move_assignable<C>::value, "");
-    }
-    {
-        typedef std::map<MoveOnly, MoveOnly, std::less<MoveOnly>, test_allocator<V>> C;
-        static_assert(!std::is_nothrow_move_assignable<C>::value, "");
-    }
-#if defined(_LIBCPP_VERSION)
-    {
-        typedef std::map<MoveOnly, MoveOnly, std::less<MoveOnly>, other_allocator<V>> C;
-        static_assert(std::is_nothrow_move_assignable<C>::value, "");
-    }
-#endif // _LIBCPP_VERSION
-    {
-        typedef std::map<MoveOnly, MoveOnly, some_comp<MoveOnly>> C;
-        static_assert(!std::is_nothrow_move_assignable<C>::value, "");
-    }
-
-    return 0;
-}
diff --git libcxx/test/std/containers/associative/multimap/multimap.cons/move_assign_noexcept.compile.pass.cpp libcxx/test/std/containers/associative/multimap/multimap.cons/move_assign_noexcept.compile.pass.cpp
new file mode 100644
index 000000000000..dd4c6f76fbcb
--- /dev/null
+++ libcxx/test/std/containers/associative/multimap/multimap.cons/move_assign_noexcept.compile.pass.cpp
@@ -0,0 +1,62 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <map>
+
+// multimap& operator=(multimap&& c)
+//     noexcept(
+//          allocator_type::propagate_on_container_move_assignment::value &&
+//          is_nothrow_move_assignable<allocator_type>::value &&
+//          is_nothrow_move_assignable<key_compare>::value);
+
+// This tests a conforming extension
+
+// UNSUPPORTED: c++03
+
+#include <map>
+
+#include "test_macros.h"
+#include "MoveOnly.h"
+#include "test_allocator.h"
+
+template <class T>
+struct some_comp {
+  using value_type = T;
+  some_comp& operator=(const some_comp&);
+  bool operator()(const T&, const T&) const { return false; }
+};
+
+template <class T>
+struct always_equal_alloc {
+  using value_type = T;
+  always_equal_alloc(const always_equal_alloc&);
+  void allocate(std::size_t);
+};
+
+template <class T>
+struct not_always_equal_alloc {
+  int i;
+  using value_type = T;
+  not_always_equal_alloc(const not_always_equal_alloc&);
+  void allocate(std::size_t);
+};
+
+template <template <class> class Alloc>
+using multimap_alloc =
+    std::multimap<MoveOnly, MoveOnly, std::less<MoveOnly>, Alloc<std::pair<const MoveOnly, MoveOnly>>>;
+
+static_assert(std::is_nothrow_move_assignable<multimap_alloc<std::allocator>>::value, "");
+static_assert(!std::is_nothrow_move_assignable<multimap_alloc<test_allocator>>::value, "");
+#if TEST_STD_VER >= 17
+static_assert(std::is_nothrow_move_assignable<multimap_alloc<always_equal_alloc>>::value, "");
+#endif
+static_assert(!std::is_nothrow_move_assignable<multimap_alloc<not_always_equal_alloc>>::value, "");
+#if defined(_LIBCPP_VERSION)
+static_assert(std::is_nothrow_move_assignable<multimap_alloc<other_allocator>>::value, "");
+#endif // _LIBCPP_VERSION
+static_assert(!std::is_nothrow_move_assignable<std::multimap<int, int, some_comp<int>>>::value, "");
diff --git libcxx/test/std/containers/associative/multimap/multimap.cons/move_assign_noexcept.pass.cpp libcxx/test/std/containers/associative/multimap/multimap.cons/move_assign_noexcept.pass.cpp
deleted file mode 100644
index 4cd8c19fce5f..000000000000
--- libcxx/test/std/containers/associative/multimap/multimap.cons/move_assign_noexcept.pass.cpp
+++ /dev/null
@@ -1,59 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <map>
-
-// multimap& operator=(multimap&& c)
-//     noexcept(
-//          allocator_type::propagate_on_container_move_assignment::value &&
-//          is_nothrow_move_assignable<allocator_type>::value &&
-//          is_nothrow_move_assignable<key_compare>::value);
-
-// This tests a conforming extension
-
-// UNSUPPORTED: c++03
-
-#include <map>
-#include <cassert>
-
-#include "test_macros.h"
-#include "MoveOnly.h"
-#include "test_allocator.h"
-
-template <class T>
-struct some_comp
-{
-    typedef T value_type;
-    some_comp& operator=(const some_comp&);
-    bool operator()(const T&, const T&) const { return false; }
-};
-
-int main(int, char**)
-{
-    typedef std::pair<const MoveOnly, MoveOnly> V;
-    {
-        typedef std::multimap<MoveOnly, MoveOnly> C;
-        static_assert(std::is_nothrow_move_assignable<C>::value, "");
-    }
-    {
-        typedef std::multimap<MoveOnly, MoveOnly, std::less<MoveOnly>, test_allocator<V>> C;
-        static_assert(!std::is_nothrow_move_assignable<C>::value, "");
-    }
-#if defined(_LIBCPP_VERSION)
-    {
-        typedef std::multimap<MoveOnly, MoveOnly, std::less<MoveOnly>, other_allocator<V>> C;
-        static_assert(std::is_nothrow_move_assignable<C>::value, "");
-    }
-#endif // _LIBCPP_VERSION
-    {
-        typedef std::multimap<MoveOnly, MoveOnly, some_comp<MoveOnly>> C;
-        static_assert(!std::is_nothrow_move_assignable<C>::value, "");
-    }
-
-    return 0;
-}
diff --git libcxx/test/std/containers/associative/multiset/multiset.cons/move_assign_noexcept.compile.pass.cpp libcxx/test/std/containers/associative/multiset/multiset.cons/move_assign_noexcept.compile.pass.cpp
new file mode 100644
index 000000000000..930eae1bc908
--- /dev/null
+++ libcxx/test/std/containers/associative/multiset/multiset.cons/move_assign_noexcept.compile.pass.cpp
@@ -0,0 +1,61 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <set>
+
+// multiset& operator=(multiset&& c)
+//     noexcept(
+//          allocator_type::propagate_on_container_move_assignment::value &&
+//          is_nothrow_move_assignable<allocator_type>::value &&
+//          is_nothrow_move_assignable<key_compare>::value);
+
+// This tests a conforming extension
+
+// UNSUPPORTED: c++03
+
+#include <set>
+
+#include "test_macros.h"
+#include "MoveOnly.h"
+#include "test_allocator.h"
+
+template <class T>
+struct some_comp {
+  using value_type = T;
+  some_comp& operator=(const some_comp&);
+  bool operator()(const T&, const T&) const { return false; }
+};
+
+template <class T>
+struct always_equal_alloc {
+  using value_type = T;
+  always_equal_alloc(const always_equal_alloc&);
+  void allocate(std::size_t);
+};
+
+template <class T>
+struct not_always_equal_alloc {
+  int i;
+  using value_type = T;
+  not_always_equal_alloc(const not_always_equal_alloc&);
+  void allocate(std::size_t);
+};
+
+template <template <class> class Alloc>
+using unordered_set_alloc = std::multiset<MoveOnly, std::less<MoveOnly>, Alloc<MoveOnly>>;
+
+static_assert(std::is_nothrow_move_assignable<unordered_set_alloc<std::allocator>>::value, "");
+static_assert(!std::is_nothrow_move_assignable<unordered_set_alloc<test_allocator>>::value, "");
+#if TEST_STD_VER >= 17
+static_assert(std::is_nothrow_move_assignable<unordered_set_alloc<always_equal_alloc>>::value, "");
+#endif
+static_assert(!std::is_nothrow_move_assignable<unordered_set_alloc<not_always_equal_alloc>>::value, "");
+#if defined(_LIBCPP_VERSION)
+static_assert(std::is_nothrow_move_assignable<unordered_set_alloc<other_allocator>>::value, "");
+#endif // _LIBCPP_VERSION
+static_assert(!std::is_nothrow_move_assignable<std::multiset<int, some_comp<int>>>::value, "");
diff --git libcxx/test/std/containers/associative/multiset/multiset.cons/move_assign_noexcept.pass.cpp libcxx/test/std/containers/associative/multiset/multiset.cons/move_assign_noexcept.pass.cpp
deleted file mode 100644
index a94b5b1abf1e..000000000000
--- libcxx/test/std/containers/associative/multiset/multiset.cons/move_assign_noexcept.pass.cpp
+++ /dev/null
@@ -1,58 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <set>
-
-// multiset& operator=(multiset&& c)
-//     noexcept(
-//          allocator_type::propagate_on_container_move_assignment::value &&
-//          is_nothrow_move_assignable<allocator_type>::value &&
-//          is_nothrow_move_assignable<key_compare>::value);
-
-// This tests a conforming extension
-
-// UNSUPPORTED: c++03
-
-#include <set>
-#include <cassert>
-
-#include "test_macros.h"
-#include "MoveOnly.h"
-#include "test_allocator.h"
-
-template <class T>
-struct some_comp
-{
-    typedef T value_type;
-    some_comp& operator=(const some_comp&);
-    bool operator()(const T&, const T&) const { return false; }
-};
-
-int main(int, char**)
-{
-    {
-        typedef std::multiset<MoveOnly> C;
-        static_assert(std::is_nothrow_move_assignable<C>::value, "");
-    }
-    {
-        typedef std::multiset<MoveOnly, std::less<MoveOnly>, test_allocator<MoveOnly>> C;
-        static_assert(!std::is_nothrow_move_assignable<C>::value, "");
-    }
-#if defined(_LIBCPP_VERSION)
-    {
-        typedef std::multiset<MoveOnly, std::less<MoveOnly>, other_allocator<MoveOnly>> C;
-        static_assert(std::is_nothrow_move_assignable<C>::value, "");
-    }
-#endif // _LIBCPP_VERSION
-    {
-        typedef std::multiset<MoveOnly, some_comp<MoveOnly>> C;
-        static_assert(!std::is_nothrow_move_assignable<C>::value, "");
-    }
-
-    return 0;
-}
diff --git libcxx/test/std/containers/associative/set/set.cons/move_assign_noexcept.compile.pass.cpp libcxx/test/std/containers/associative/set/set.cons/move_assign_noexcept.compile.pass.cpp
new file mode 100644
index 000000000000..f8a0472fb069
--- /dev/null
+++ libcxx/test/std/containers/associative/set/set.cons/move_assign_noexcept.compile.pass.cpp
@@ -0,0 +1,61 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <set>
+
+// set& operator=(set&& c)
+//     noexcept(
+//          allocator_type::propagate_on_container_move_assignment::value &&
+//          is_nothrow_move_assignable<allocator_type>::value &&
+//          is_nothrow_move_assignable<key_compare>::value);
+
+// This tests a conforming extension
+
+// UNSUPPORTED: c++03
+
+#include <set>
+
+#include "test_macros.h"
+#include "MoveOnly.h"
+#include "test_allocator.h"
+
+template <class T>
+struct some_comp {
+  using value_type = T;
+  some_comp& operator=(const some_comp&);
+  bool operator()(const T&, const T&) const { return false; }
+};
+
+template <class T>
+struct always_equal_alloc {
+  using value_type = T;
+  always_equal_alloc(const always_equal_alloc&);
+  void allocate(std::size_t);
+};
+
+template <class T>
+struct not_always_equal_alloc {
+  int i;
+  using value_type = T;
+  not_always_equal_alloc(const not_always_equal_alloc&);
+  void allocate(std::size_t);
+};
+
+template <template <class> class Alloc>
+using unordered_map_alloc = std::set<MoveOnly, std::less<MoveOnly>, Alloc<MoveOnly>>;
+
+static_assert(std::is_nothrow_move_assignable<unordered_map_alloc<std::allocator>>::value, "");
+static_assert(!std::is_nothrow_move_assignable<unordered_map_alloc<test_allocator>>::value, "");
+#if TEST_STD_VER >= 17
+static_assert(std::is_nothrow_move_assignable<unordered_map_alloc<always_equal_alloc>>::value, "");
+#endif
+static_assert(!std::is_nothrow_move_assignable<unordered_map_alloc<not_always_equal_alloc>>::value, "");
+#if defined(_LIBCPP_VERSION)
+static_assert(std::is_nothrow_move_assignable<unordered_map_alloc<other_allocator>>::value, "");
+#endif // _LIBCPP_VERSION
+static_assert(!std::is_nothrow_move_assignable<std::set<int, some_comp<int>>>::value, "");
diff --git libcxx/test/std/containers/associative/set/set.cons/move_assign_noexcept.pass.cpp libcxx/test/std/containers/associative/set/set.cons/move_assign_noexcept.pass.cpp
deleted file mode 100644
index 9c177f219dc7..000000000000
--- libcxx/test/std/containers/associative/set/set.cons/move_assign_noexcept.pass.cpp
+++ /dev/null
@@ -1,58 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <set>
-
-// set& operator=(set&& c)
-//     noexcept(
-//          allocator_type::propagate_on_container_move_assignment::value &&
-//          is_nothrow_move_assignable<allocator_type>::value &&
-//          is_nothrow_move_assignable<key_compare>::value);
-
-// This tests a conforming extension
-
-// UNSUPPORTED: c++03
-
-#include <set>
-#include <cassert>
-
-#include "test_macros.h"
-#include "MoveOnly.h"
-#include "test_allocator.h"
-
-template <class T>
-struct some_comp
-{
-    typedef T value_type;
-    some_comp& operator=(const some_comp&);
-    bool operator()(const T&, const T&) const { return false; }
-};
-
-int main(int, char**)
-{
-    {
-        typedef std::set<MoveOnly> C;
-        static_assert(std::is_nothrow_move_assignable<C>::value, "");
-    }
-    {
-        typedef std::set<MoveOnly, std::less<MoveOnly>, test_allocator<MoveOnly>> C;
-        static_assert(!std::is_nothrow_move_assignable<C>::value, "");
-    }
-#if defined(_LIBCPP_VERSION)
-    {
-        typedef std::set<MoveOnly, std::less<MoveOnly>, other_allocator<MoveOnly>> C;
-        static_assert(std::is_nothrow_move_assignable<C>::value, "");
-    }
-#endif // _LIBCPP_VERSION
-    {
-        typedef std::set<MoveOnly, some_comp<MoveOnly>> C;
-        static_assert(!std::is_nothrow_move_assignable<C>::value, "");
-    }
-
-    return 0;
-}
diff --git libcxx/test/std/containers/sequences/deque/deque.cons/move_assign_noexcept.compile.pass.cpp libcxx/test/std/containers/sequences/deque/deque.cons/move_assign_noexcept.compile.pass.cpp
new file mode 100644
index 000000000000..417f44922d37
--- /dev/null
+++ libcxx/test/std/containers/sequences/deque/deque.cons/move_assign_noexcept.compile.pass.cpp
@@ -0,0 +1,49 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <deque>
+
+// deque& operator=(deque&& c)
+//     noexcept(
+//          allocator_type::propagate_on_container_move_assignment::value &&
+//          is_nothrow_move_assignable<allocator_type>::value);
+
+// This tests a conforming extension
+
+// UNSUPPORTED: c++03
+
+#include <deque>
+
+#include "test_macros.h"
+#include "MoveOnly.h"
+#include "test_allocator.h"
+
+template <class T>
+struct always_equal_alloc {
+  using value_type = T;
+  always_equal_alloc(const always_equal_alloc&);
+  void allocate(std::size_t);
+};
+
+template <class T>
+struct not_always_equal_alloc {
+  int i;
+  using value_type = T;
+  not_always_equal_alloc(const not_always_equal_alloc&);
+  void allocate(std::size_t);
+};
+
+static_assert(std::is_nothrow_move_assignable<std::deque<MoveOnly>>::value, "");
+static_assert(!std::is_nothrow_move_assignable<std::deque<MoveOnly, test_allocator<MoveOnly>>>::value, "");
+#if TEST_STD_VER >= 17
+static_assert(std::is_nothrow_move_assignable<std::deque<MoveOnly, always_equal_alloc<MoveOnly>>>::value, "");
+#endif
+static_assert(!std::is_nothrow_move_assignable<std::deque<MoveOnly, not_always_equal_alloc<MoveOnly>>>::value, "");
+#if defined(_LIBCPP_VERSION)
+static_assert(std::is_nothrow_move_assignable<std::deque<MoveOnly, other_allocator<MoveOnly>>>::value, "");
+#endif // _LIBCPP_VERSION
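always_equal_alloc and not_always_equal_alloc differ only in whether the type is empty, because std::allocator_traits derives is_always_equal from std::is_empty when the allocator does not provide it; an always-equal allocator never forces reallocation on move assignment, so the container can be nothrow-move-assignable even without POCMA. A sketch of just that trait mechanic (illustrative names, compile-only):

#include <memory>
#include <type_traits>

template <class T>
struct EmptyAlloc { // no state: std::is_empty is true
  using value_type = T;
  T* allocate(std::size_t);
  void deallocate(T*, std::size_t);
};

template <class T>
struct StatefulAlloc { // a single int of state flips the trait
  int id;
  using value_type = T;
  T* allocate(std::size_t);
  void deallocate(T*, std::size_t);
};

// Since C++17, allocator_traits<A>::is_always_equal defaults to is_empty<A>.
static_assert(std::allocator_traits<EmptyAlloc<int>>::is_always_equal::value, "");
static_assert(!std::allocator_traits<StatefulAlloc<int>>::is_always_equal::value, "");

int main() { return 0; }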
diff --git libcxx/test/std/containers/sequences/deque/deque.cons/move_assign_noexcept.pass.cpp libcxx/test/std/containers/sequences/deque/deque.cons/move_assign_noexcept.pass.cpp
deleted file mode 100644
index cbefbf6dae91..000000000000
--- libcxx/test/std/containers/sequences/deque/deque.cons/move_assign_noexcept.pass.cpp
+++ /dev/null
@@ -1,57 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <deque>
-
-// deque& operator=(deque&& c)
-//     noexcept(
-//          allocator_type::propagate_on_container_move_assignment::value &&
-//          is_nothrow_move_assignable<allocator_type>::value);
-
-// This tests a conforming extension
-
-// UNSUPPORTED: c++03
-
-#include <deque>
-#include <cassert>
-
-#include "test_macros.h"
-#include "MoveOnly.h"
-#include "test_allocator.h"
-
-template <class T>
-struct some_alloc
-{
-    typedef T value_type;
-    some_alloc(const some_alloc&);
-    void allocate(std::size_t);
-};
-
-int main(int, char**)
-{
-    {
-        typedef std::deque<MoveOnly> C;
-        static_assert(std::is_nothrow_move_assignable<C>::value, "");
-    }
-    {
-        typedef std::deque<MoveOnly, test_allocator<MoveOnly>> C;
-        static_assert(!std::is_nothrow_move_assignable<C>::value, "");
-    }
-#if defined(_LIBCPP_VERSION)
-    {
-        typedef std::deque<MoveOnly, other_allocator<MoveOnly>> C;
-        static_assert(std::is_nothrow_move_assignable<C>::value, "");
-    }
-    {
-        typedef std::deque<MoveOnly, some_alloc<MoveOnly>> C;
-        static_assert(!std::is_nothrow_move_assignable<C>::value, "");
-    }
-#endif // _LIBCPP_VERSION
-
-    return 0;
-}
diff --git libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/move_assign_noexcept.compile.pass.cpp libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/move_assign_noexcept.compile.pass.cpp
new file mode 100644
index 000000000000..b73177a00d46
--- /dev/null
+++ libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/move_assign_noexcept.compile.pass.cpp
@@ -0,0 +1,50 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <forward_list>
+
+// forward_list& operator=(forward_list&& c)
+//     noexcept(
+//          allocator_type::propagate_on_container_move_assignment::value &&
+//          is_nothrow_move_assignable<allocator_type>::value);
+
+// This tests a conforming extension
+
+// UNSUPPORTED: c++03
+
+#include <forward_list>
+
+#include "test_macros.h"
+#include "MoveOnly.h"
+#include "test_allocator.h"
+
+template <class T>
+struct always_equal_alloc {
+  using value_type = T;
+  always_equal_alloc(const always_equal_alloc&);
+  void allocate(std::size_t);
+};
+
+template <class T>
+struct not_always_equal_alloc {
+  int i;
+  using value_type = T;
+  not_always_equal_alloc(const not_always_equal_alloc&);
+  void allocate(std::size_t);
+};
+
+static_assert(std::is_nothrow_move_assignable<std::forward_list<MoveOnly>>::value, "");
+static_assert(!std::is_nothrow_move_assignable<std::forward_list<MoveOnly, test_allocator<MoveOnly>>>::value, "");
+#if TEST_STD_VER >= 17
+static_assert(std::is_nothrow_move_assignable<std::forward_list<MoveOnly, always_equal_alloc<MoveOnly>>>::value, "");
+#endif
+static_assert(!std::is_nothrow_move_assignable<std::forward_list<MoveOnly, not_always_equal_alloc<MoveOnly>>>::value,
              "");
+#if defined(_LIBCPP_VERSION)
+static_assert(std::is_nothrow_move_assignable<std::forward_list<MoveOnly, other_allocator<MoveOnly>>>::value, "");
+#endif // _LIBCPP_VERSION
diff --git libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/move_assign_noexcept.pass.cpp libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/move_assign_noexcept.pass.cpp
deleted file mode 100644
index f361e373d23f..000000000000
--- libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/move_assign_noexcept.pass.cpp
+++ /dev/null
@@ -1,57 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <forward_list>
-
-// forward_list& operator=(forward_list&& c)
-//     noexcept(
-//          allocator_type::propagate_on_container_move_assignment::value &&
-//          is_nothrow_move_assignable<allocator_type>::value);
-
-// This tests a conforming extension
-
-// UNSUPPORTED: c++03
-
-#include <forward_list>
-#include <cassert>
-
-#include "test_macros.h"
-#include "MoveOnly.h"
-#include "test_allocator.h"
-
-template <class T>
-struct some_alloc
-{
-    typedef T value_type;
-    some_alloc(const some_alloc&);
-    void allocate(std::size_t);
-};
-
-int main(int, char**)
-{
-    {
-        typedef std::forward_list<MoveOnly> C;
-        static_assert(std::is_nothrow_move_assignable<C>::value, "");
-    }
-    {
-        typedef std::forward_list<MoveOnly, test_allocator<MoveOnly>> C;
-        static_assert(!std::is_nothrow_move_assignable<C>::value, "");
-    }
-#if defined(_LIBCPP_VERSION)
-    {
-        typedef std::forward_list<MoveOnly, other_allocator<MoveOnly>> C;
-        static_assert(std::is_nothrow_move_assignable<C>::value, "");
-    }
-    {
-        typedef std::forward_list<MoveOnly, some_alloc<MoveOnly>> C;
-        static_assert(!std::is_nothrow_move_assignable<C>::value, "");
-    }
-#endif // _LIBCPP_VERSION
-
-    return 0;
-}
diff --git libcxx/test/std/containers/sequences/list/list.cons/move_assign_noexcept.compile.pass.cpp libcxx/test/std/containers/sequences/list/list.cons/move_assign_noexcept.compile.pass.cpp
new file mode 100644
index 000000000000..8da5cfc4716e
--- /dev/null
+++ libcxx/test/std/containers/sequences/list/list.cons/move_assign_noexcept.compile.pass.cpp
@@ -0,0 +1,49 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <list>
+
+// list& operator=(list&& c)
+//     noexcept(
+//          allocator_type::propagate_on_container_move_assignment::value &&
+//          is_nothrow_move_assignable<allocator_type>::value);
+
+// This tests a conforming extension
+
+// UNSUPPORTED: c++03
+
+#include <list>
+
+#include "test_macros.h"
+#include "MoveOnly.h"
+#include "test_allocator.h"
+
+template <class T>
+struct always_equal_alloc {
+  using value_type = T;
+  always_equal_alloc(const always_equal_alloc&);
+  void allocate(std::size_t);
+};
+
+template <class T>
+struct not_always_equal_alloc {
+  int i;
+  using value_type = T;
+  not_always_equal_alloc(const not_always_equal_alloc&);
+  void allocate(std::size_t);
+};
+
+static_assert(std::is_nothrow_move_assignable<std::list<MoveOnly>>::value, "");
+static_assert(!std::is_nothrow_move_assignable<std::list<MoveOnly, test_allocator<MoveOnly>>>::value, "");
+#if TEST_STD_VER >= 17
+static_assert(std::is_nothrow_move_assignable<std::list<MoveOnly, always_equal_alloc<MoveOnly>>>::value, "");
+#endif
+static_assert(!std::is_nothrow_move_assignable<std::list<MoveOnly, not_always_equal_alloc<MoveOnly>>>::value, "");
+#if defined(_LIBCPP_VERSION)
+static_assert(std::is_nothrow_move_assignable<std::list<MoveOnly, other_allocator<MoveOnly>>>::value, "");
+#endif // _LIBCPP_VERSION
diff --git libcxx/test/std/containers/sequences/list/list.cons/move_assign_noexcept.pass.cpp libcxx/test/std/containers/sequences/list/list.cons/move_assign_noexcept.pass.cpp
deleted file mode 100644
index 9df8413bb19c..000000000000
--- libcxx/test/std/containers/sequences/list/list.cons/move_assign_noexcept.pass.cpp
+++ /dev/null
@@ -1,57 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <list>
-
-// list& operator=(list&& c)
-//     noexcept(
-//          allocator_type::propagate_on_container_move_assignment::value &&
-//          is_nothrow_move_assignable<allocator_type>::value);
-
-// This tests a conforming extension
-
-// UNSUPPORTED: c++03
-
-#include <list>
-#include <cassert>
-
-#include "test_macros.h"
-#include "MoveOnly.h"
-#include "test_allocator.h"
-
-template <class T>
-struct some_alloc
-{
-    typedef T value_type;
-    some_alloc(const some_alloc&);
-    void allocate(std::size_t);
-};
-
-int main(int, char**)
-{
-    {
-        typedef std::list<MoveOnly> C;
-        static_assert(std::is_nothrow_move_assignable<C>::value, "");
-    }
-    {
-        typedef std::list<MoveOnly, test_allocator<MoveOnly>> C;
-        static_assert(!std::is_nothrow_move_assignable<C>::value, "");
-    }
-#if defined(_LIBCPP_VERSION)
-    {
-        typedef std::list<MoveOnly, other_allocator<MoveOnly>> C;
-        static_assert(std::is_nothrow_move_assignable<C>::value, "");
-    }
-    {
-        typedef std::list<MoveOnly, some_alloc<MoveOnly>> C;
-        static_assert(!std::is_nothrow_move_assignable<C>::value, "");
-    }
-#endif // _LIBCPP_VERSION
-
-    return 0;
-}
diff --git libcxx/test/std/containers/unord/unord.map/unord.map.cnstr/move_assign_noexcept.compile.pass.cpp libcxx/test/std/containers/unord/unord.map/unord.map.cnstr/move_assign_noexcept.compile.pass.cpp
new file mode 100644
index 000000000000..098e83d3521e
--- /dev/null
+++ libcxx/test/std/containers/unord/unord.map/unord.map.cnstr/move_assign_noexcept.compile.pass.cpp
@@ -0,0 +1,78 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <unordered_map>
+
+// unordered_map& operator=(unordered_map&& c)
+//     noexcept(
+//          allocator_type::propagate_on_container_move_assignment::value &&
+//          is_nothrow_move_assignable<allocator_type>::value &&
+//          is_nothrow_move_assignable<key_compare>::value);
+
+// This tests a conforming extension
+
+// UNSUPPORTED: c++03
+
+#include <unordered_map>
+
+#include "test_macros.h"
+#include "MoveOnly.h"
+#include "test_allocator.h"
+
+template <class T>
+struct some_hash {
+  using value_type = T;
+  some_hash();
+  some_hash(const some_hash&);
+  some_hash& operator=(const some_hash&);
+
+  std::size_t operator()(T const&) const;
+};
+
+template <class T>
+struct some_comp {
+  using value_type = T;
+  some_comp& operator=(const some_comp&);
+  bool operator()(const T&, const T&) const { return false; }
+};
+
+template <class T>
+struct always_equal_alloc {
+  using value_type = T;
+  always_equal_alloc(const always_equal_alloc&);
+  void allocate(std::size_t);
+};
+
+template <class T>
+struct not_always_equal_alloc {
+  int i;
+  using value_type = T;
+  not_always_equal_alloc(const not_always_equal_alloc&);
+  void allocate(std::size_t);
+};
+
+template <template <class> class Alloc>
+using unordered_set_alloc =
+    std::unordered_map<MoveOnly,
+                       MoveOnly,
+                       std::hash<MoveOnly>,
+                       std::equal_to<MoveOnly>,
+                       Alloc<std::pair<const MoveOnly, MoveOnly>>>;
+
+static_assert(std::is_nothrow_move_assignable<unordered_set_alloc<std::allocator>>::value, "");
+static_assert(!std::is_nothrow_move_assignable<unordered_set_alloc<test_allocator>>::value, "");
+#if TEST_STD_VER >= 17
+static_assert(std::is_nothrow_move_assignable<unordered_set_alloc<always_equal_alloc>>::value, "");
+#endif
+static_assert(!std::is_nothrow_move_assignable<unordered_set_alloc<not_always_equal_alloc>>::value, "");
+#if defined(_LIBCPP_VERSION)
+static_assert(std::is_nothrow_move_assignable<unordered_set_alloc<other_allocator>>::value, "");
+#endif // _LIBCPP_VERSION
+static_assert(!std::is_nothrow_move_assignable<std::unordered_map<int, int, some_hash<int>>>::value, "");
+static_assert(!std::is_nothrow_move_assignable<std::unordered_map<int, int, std::hash<int>, some_comp<int>>>::value,
+              "");
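For the unordered containers the inherited comment still names key_compare, but what actually enters the noexcept specification is the hasher and the key-equality predicate: both must be nothrow-move-assignable, which is why some_hash and some_comp above carry deliberately user-provided (hence potentially throwing) assignment operators. A compile-only sketch of the same effect with our own hasher type:

#include <type_traits>
#include <unordered_map>

struct ThrowingHash {
  ThrowingHash() = default;
  ThrowingHash& operator=(const ThrowingHash&); // user-provided, not noexcept
  std::size_t operator()(int) const { return 0; }
};

static_assert(std::is_nothrow_move_assignable<std::unordered_map<int, int>>::value, "");
static_assert(!std::is_nothrow_move_assignable<std::unordered_map<int, int, ThrowingHash>>::value,
              "a throwing hasher propagates into the container's noexcept specification");

int main() { return 0; }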
diff --git libcxx/test/std/containers/unord/unord.map/unord.map.cnstr/move_assign_noexcept.pass.cpp libcxx/test/std/containers/unord/unord.map/unord.map.cnstr/move_assign_noexcept.pass.cpp
deleted file mode 100644
index 9d5053721671..000000000000
--- libcxx/test/std/containers/unord/unord.map/unord.map.cnstr/move_assign_noexcept.pass.cpp
+++ /dev/null
@@ -1,76 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <unordered_map>
-
-// unordered_map& operator=(unordered_map&& c)
-//     noexcept(
-//          allocator_type::propagate_on_container_move_assignment::value &&
-//          is_nothrow_move_assignable<allocator_type>::value &&
-//          is_nothrow_move_assignable<key_compare>::value);
-
-// This tests a conforming extension
-
-// UNSUPPORTED: c++03
-
-#include <unordered_map>
-#include <cassert>
-
-#include "test_macros.h"
-#include "MoveOnly.h"
-#include "test_allocator.h"
-
-template <class T>
-struct some_comp
-{
-    typedef T value_type;
-    some_comp& operator=(const some_comp&);
-    bool operator()(const T&, const T&) const { return false; }
-};
-
-template <class T>
-struct some_hash
-{
-    typedef T value_type;
-    some_hash();
-    some_hash(const some_hash&);
-    some_hash& operator=(const some_hash&);
-
-    std::size_t operator()(T const&) const;
-};
-
-int main(int, char**)
-{
-    {
-        typedef std::unordered_map<MoveOnly, MoveOnly> C;
-        static_assert(std::is_nothrow_move_assignable<C>::value, "");
-    }
-    {
-        typedef std::unordered_map<MoveOnly, MoveOnly, std::hash<MoveOnly>,
-                           std::equal_to<MoveOnly>, test_allocator<std::pair<const MoveOnly, MoveOnly>>> C;
-        static_assert(!std::is_nothrow_move_assignable<C>::value, "");
-    }
-#if defined(_LIBCPP_VERSION)
-    {
-        typedef std::unordered_map<MoveOnly, MoveOnly, std::hash<MoveOnly>,
-                           std::equal_to<MoveOnly>, other_allocator<std::pair<const MoveOnly, MoveOnly>>> C;
-        static_assert(std::is_nothrow_move_assignable<C>::value, "");
-    }
-#endif // _LIBCPP_VERSION
-    {
-        typedef std::unordered_map<MoveOnly, MoveOnly, some_hash<MoveOnly>> C;
-        static_assert(!std::is_nothrow_move_assignable<C>::value, "");
-    }
-    {
-        typedef std::unordered_map<MoveOnly, MoveOnly, std::hash<MoveOnly>,
-                           some_comp<MoveOnly>> C;
-        static_assert(!std::is_nothrow_move_assignable<C>::value, "");
-    }
-
-    return 0;
-}
diff --git libcxx/test/std/containers/unord/unord.multimap/unord.multimap.cnstr/move_assign_noexcept.compile.pass.cpp libcxx/test/std/containers/unord/unord.multimap/unord.multimap.cnstr/move_assign_noexcept.compile.pass.cpp
new file mode 100644
index 000000000000..1420250fad61
--- /dev/null
+++ libcxx/test/std/containers/unord/unord.multimap/unord.multimap.cnstr/move_assign_noexcept.compile.pass.cpp
@@ -0,0 +1,78 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <unordered_map>
+
+// unordered_multimap& operator=(unordered_multimap&& c)
+//     noexcept(
+//          allocator_type::propagate_on_container_move_assignment::value &&
+//          is_nothrow_move_assignable<allocator_type>::value &&
+//          is_nothrow_move_assignable<key_compare>::value);
+
+// This tests a conforming extension
+
+// UNSUPPORTED: c++03
+
+#include <unordered_map>
+
+#include "test_macros.h"
+#include "MoveOnly.h"
+#include "test_allocator.h"
+
+template <class T>
+struct some_hash {
+  using value_type = T;
+  some_hash();
+  some_hash(const some_hash&);
+  some_hash& operator=(const some_hash&);
+
+  std::size_t operator()(T const&) const;
+};
+
+template <class T>
+struct some_comp {
+  using value_type = T;
+  some_comp& operator=(const some_comp&);
+  bool operator()(const T&, const T&) const { return false; }
+};
+
+template <class T>
+struct always_equal_alloc {
+  using value_type = T;
+  always_equal_alloc(const always_equal_alloc&);
+  void allocate(std::size_t);
+};
+
+template <class T>
+struct not_always_equal_alloc {
+  int i;
+  using value_type = T;
+  not_always_equal_alloc(const not_always_equal_alloc&);
+  void allocate(std::size_t);
+};
+
+template <template <class> class Alloc>
+using unordered_set_alloc =
+    std::unordered_multimap<MoveOnly,
+                            MoveOnly,
+                            std::hash<MoveOnly>,
+                            std::equal_to<MoveOnly>,
+                            Alloc<std::pair<const MoveOnly, MoveOnly>>>;
+
+static_assert(std::is_nothrow_move_assignable<unordered_set_alloc<std::allocator>>::value, "");
+static_assert(!std::is_nothrow_move_assignable<unordered_set_alloc<test_allocator>>::value, "");
+#if TEST_STD_VER >= 17
+static_assert(std::is_nothrow_move_assignable<unordered_set_alloc<always_equal_alloc>>::value, "");
+#endif
+static_assert(!std::is_nothrow_move_assignable<unordered_set_alloc<not_always_equal_alloc>>::value, "");
+#if defined(_LIBCPP_VERSION)
+static_assert(std::is_nothrow_move_assignable<unordered_set_alloc<other_allocator>>::value, "");
+#endif // _LIBCPP_VERSION
+static_assert(!std::is_nothrow_move_assignable<std::unordered_multimap<int, int, some_hash<int>>>::value, "");
+static_assert(
+    !std::is_nothrow_move_assignable<std::unordered_multimap<int, int, std::hash<int>, some_comp<int>>>::value, "");
diff --git libcxx/test/std/containers/unord/unord.multimap/unord.multimap.cnstr/move_assign_noexcept.pass.cpp libcxx/test/std/containers/unord/unord.multimap/unord.multimap.cnstr/move_assign_noexcept.pass.cpp
deleted file mode 100644
index 2cd99546e5bd..000000000000
--- libcxx/test/std/containers/unord/unord.multimap/unord.multimap.cnstr/move_assign_noexcept.pass.cpp
+++ /dev/null
@@ -1,75 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <unordered_map>
-
-// unordered_multimap& operator=(unordered_multimap&& c)
-//     noexcept(
-//          allocator_type::propagate_on_container_move_assignment::value &&
-//          is_nothrow_move_assignable<allocator_type>::value &&
-//          is_nothrow_move_assignable<key_compare>::value);
-
-// This tests a conforming extension
-
-// UNSUPPORTED: c++03
-
-#include <unordered_map>
-#include <cassert>
-
-#include "test_macros.h"
-#include "MoveOnly.h"
-#include "test_allocator.h"
-
-template <class T>
-struct some_comp
-{
-    typedef T value_type;
-    some_comp& operator=(const some_comp&);
-    bool operator()(const T&, const T&) const { return false; }
-};
-
-template <class T>
-struct some_hash
-{
-    typedef T value_type;
-    some_hash();
-    some_hash(const some_hash&);
-    some_hash& operator=(const some_hash&);
-    std::size_t operator()(T const&) const;
-};
-
-int main(int, char**)
-{
-    {
-        typedef std::unordered_multimap<MoveOnly, MoveOnly> C;
-        static_assert(std::is_nothrow_move_assignable<C>::value, "");
-    }
-    {
-        typedef std::unordered_multimap<MoveOnly, MoveOnly, std::hash<MoveOnly>,
-                           std::equal_to<MoveOnly>, test_allocator<std::pair<const MoveOnly, MoveOnly>>> C;
-        static_assert(!std::is_nothrow_move_assignable<C>::value, "");
-    }
-#if defined(_LIBCPP_VERSION)
-    {
-        typedef std::unordered_multimap<MoveOnly, MoveOnly, std::hash<MoveOnly>,
-                           std::equal_to<MoveOnly>, other_allocator<std::pair<const MoveOnly, MoveOnly>>> C;
-        static_assert(std::is_nothrow_move_assignable<C>::value, "");
-    }
-#endif // _LIBCPP_VERSION
-    {
-        typedef std::unordered_multimap<MoveOnly, MoveOnly, some_hash<MoveOnly>> C;
-        static_assert(!std::is_nothrow_move_assignable<C>::value, "");
-    }
-    {
-        typedef std::unordered_multimap<MoveOnly, MoveOnly, std::hash<MoveOnly>,
-                           some_comp<MoveOnly>> C;
-        static_assert(!std::is_nothrow_move_assignable<C>::value, "");
-    }
-
-    return 0;
-}
diff --git libcxx/test/std/containers/unord/unord.multiset/unord.multiset.cnstr/move_assign_noexcept.compile.pass.cpp libcxx/test/std/containers/unord/unord.multiset/unord.multiset.cnstr/move_assign_noexcept.compile.pass.cpp
new file mode 100644
index 000000000000..814810d341e2
--- /dev/null
+++ libcxx/test/std/containers/unord/unord.multiset/unord.multiset.cnstr/move_assign_noexcept.compile.pass.cpp
@@ -0,0 +1,74 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <unordered_set>
+
+// unordered_multiset& operator=(unordered_multiset&& c)
+//     noexcept(
+//          allocator_type::propagate_on_container_move_assignment::value &&
+//          is_nothrow_move_assignable<allocator_type>::value &&
+//          is_nothrow_move_assignable<key_compare>::value);
+
+// This tests a conforming extension
+
+// UNSUPPORTED: c++03
+
+#include <unordered_set>
+
+#include "test_macros.h"
+#include "MoveOnly.h"
+#include "test_allocator.h"
+
+template <class T>
+struct some_hash {
+  using value_type = T;
+  some_hash();
+  some_hash(const some_hash&);
+  some_hash& operator=(const some_hash&);
+
+  std::size_t operator()(T const&) const;
+};
+
+template <class T>
+struct some_comp {
+  using value_type = T;
+  some_comp& operator=(const some_comp&);
+  bool operator()(const T&, const T&) const { return false; }
+};
+
+template <class T>
+struct always_equal_alloc {
+  using value_type = T;
+  always_equal_alloc(const always_equal_alloc&);
+  void allocate(std::size_t);
+};
+
+template <class T>
+struct not_always_equal_alloc {
+  int i;
+  using value_type = T;
+  not_always_equal_alloc(const not_always_equal_alloc&);
+  void allocate(std::size_t);
+};
+
+template <template <class> class Alloc>
+using unordered_set_alloc =
+    std::unordered_multiset<MoveOnly, std::hash<MoveOnly>, std::equal_to<MoveOnly>, Alloc<MoveOnly>>;
+
+static_assert(std::is_nothrow_move_assignable<unordered_set_alloc<std::allocator>>::value, "");
+static_assert(!std::is_nothrow_move_assignable<unordered_set_alloc<test_allocator>>::value, "");
+#if TEST_STD_VER >= 17
+static_assert(std::is_nothrow_move_assignable<unordered_set_alloc<always_equal_alloc>>::value, "");
+#endif
+static_assert(!std::is_nothrow_move_assignable<unordered_set_alloc<not_always_equal_alloc>>::value, "");
+#if defined(_LIBCPP_VERSION)
+static_assert(std::is_nothrow_move_assignable<unordered_set_alloc<other_allocator>>::value, "");
+#endif // _LIBCPP_VERSION
+static_assert(!std::is_nothrow_move_assignable<std::unordered_multiset<int, some_hash<int>>>::value, "");
+static_assert(!std::is_nothrow_move_assignable<std::unordered_multiset<int, std::hash<int>, some_comp<int>>>::value,
+              "");
diff --git libcxx/test/std/containers/unord/unord.multiset/unord.multiset.cnstr/move_assign_noexcept.pass.cpp libcxx/test/std/containers/unord/unord.multiset/unord.multiset.cnstr/move_assign_noexcept.pass.cpp
deleted file mode 100644
index 9dbca6e3e867..000000000000
--- libcxx/test/std/containers/unord/unord.multiset/unord.multiset.cnstr/move_assign_noexcept.pass.cpp
+++ /dev/null
@@ -1,75 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <unordered_set>
-
-// unordered_multiset& operator=(unordered_multiset&& c)
-//     noexcept(
-//          allocator_type::propagate_on_container_move_assignment::value &&
-//          is_nothrow_move_assignable<allocator_type>::value &&
-//          is_nothrow_move_assignable<key_compare>::value);
-
-// This tests a conforming extension
-
-// UNSUPPORTED: c++03
-
-#include <unordered_set>
-#include <cassert>
-
-#include "test_macros.h"
-#include "MoveOnly.h"
-#include "test_allocator.h"
-
-template <class T>
-struct some_comp
-{
-    typedef T value_type;
-    some_comp& operator=(const some_comp&);
-    bool operator()(const T&, const T&) const { return false; }
-};
-
-template <class T>
-struct some_hash
-{
-    typedef T value_type;
-    some_hash();
-    some_hash(const some_hash&);
-    some_hash& operator=(const some_hash&);
-    std::size_t operator()(T const&) const;
-};
-
-int main(int, char**)
-{
-    {
-        typedef std::unordered_multiset<MoveOnly> C;
-        static_assert(std::is_nothrow_move_assignable<C>::value, "");
-    }
-    {
-        typedef std::unordered_multiset<MoveOnly, std::hash<MoveOnly>,
-                           std::equal_to<MoveOnly>, test_allocator<MoveOnly>> C;
-        static_assert(!std::is_nothrow_move_assignable<C>::value, "");
-    }
-#if defined(_LIBCPP_VERSION)
-    {
-        typedef std::unordered_multiset<MoveOnly, std::hash<MoveOnly>,
-                           std::equal_to<MoveOnly>, other_allocator<MoveOnly>> C;
-        static_assert(std::is_nothrow_move_assignable<C>::value, "");
-    }
-#endif // _LIBCPP_VERSION
-    {
-        typedef std::unordered_multiset<MoveOnly, some_hash<MoveOnly>> C;
-        static_assert(!std::is_nothrow_move_assignable<C>::value, "");
-    }
-    {
-        typedef std::unordered_multiset<MoveOnly, std::hash<MoveOnly>,
-                           some_comp<MoveOnly>> C;
-        static_assert(!std::is_nothrow_move_assignable<C>::value, "");
-    }
-
-    return 0;
-}
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// <unordered_set> + +// unordered_set& operator=(unordered_set&& c) +// noexcept( +// allocator_type::propagate_on_container_move_assignment::value && +// is_nothrow_move_assignable<allocator_type>::value && +// is_nothrow_move_assignable<key_compare>::value); + +// This tests a conforming extension + +// UNSUPPORTED: c++03 + +#include <unordered_set> + +#include "test_macros.h" +#include "MoveOnly.h" +#include "test_allocator.h" + +template <class T> +struct some_hash { + using value_type = T; + some_hash(); + some_hash(const some_hash&); + some_hash& operator=(const some_hash&); + + std::size_t operator()(T const&) const; +}; + +template <class T> +struct some_comp { + using value_type = T; + some_comp& operator=(const some_comp&); + bool operator()(const T&, const T&) const { return false; } +}; + +template <class T> +struct always_equal_alloc { + using value_type = T; + always_equal_alloc(const always_equal_alloc&); + void allocate(std::size_t); +}; + +template <class T> +struct not_always_equal_alloc { + int i; + using value_type = T; + not_always_equal_alloc(const not_always_equal_alloc&); + void allocate(std::size_t); +}; + +template <template <class> class Alloc> +using unordered_set_alloc = std::unordered_set<MoveOnly, std::hash<MoveOnly>, std::equal_to<MoveOnly>, Alloc<MoveOnly>>; + +static_assert(std::is_nothrow_move_assignable<unordered_set_alloc<std::allocator>>::value, ""); +static_assert(!std::is_nothrow_move_assignable<unordered_set_alloc<test_allocator>>::value, ""); +#if TEST_STD_VER >= 17 +static_assert(std::is_nothrow_move_assignable<unordered_set_alloc<always_equal_alloc>>::value, ""); +#endif +static_assert(!std::is_nothrow_move_assignable<unordered_set_alloc<not_always_equal_alloc>>::value, ""); +#if defined(_LIBCPP_VERSION) +static_assert(std::is_nothrow_move_assignable<unordered_set_alloc<other_allocator>>::value, ""); +#endif // _LIBCPP_VERSION +static_assert(!std::is_nothrow_move_assignable<std::unordered_set<int, some_hash<int>>>::value, ""); +static_assert(!std::is_nothrow_move_assignable<std::unordered_set<int, std::hash<int>, some_comp<int>>>::value, ""); diff --git libcxx/test/std/containers/unord/unord.set/unord.set.cnstr/move_assign_noexcept.pass.cpp libcxx/test/std/containers/unord/unord.set/unord.set.cnstr/move_assign_noexcept.pass.cpp deleted file mode 100644 index 1ff2a7b471a1..000000000000 --- libcxx/test/std/containers/unord/unord.set/unord.set.cnstr/move_assign_noexcept.pass.cpp +++ /dev/null @@ -1,75 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// <unordered_set> - -// unordered_set& operator=(unordered_set&& c) -// noexcept( -// allocator_type::propagate_on_container_move_assignment::value && -// is_nothrow_move_assignable<allocator_type>::value && -// is_nothrow_move_assignable<key_compare>::value); - -// This tests a conforming extension - -// UNSUPPORTED: c++03 - -#include <unordered_set> -#include <cassert> - -#include "test_macros.h" -#include "MoveOnly.h" -#include "test_allocator.h" - -template <class T> -struct some_comp -{ - typedef T value_type; - some_comp& operator=(const some_comp&); - bool operator()(const T&, const T&) const { return false; } -}; - -template <class T> -struct some_hash -{ - typedef T value_type; - some_hash(); - some_hash(const some_hash&); - some_hash& operator=(const some_hash&); - std::size_t operator()(T const&) const; -}; - -int main(int, char**) -{ - { - typedef std::unordered_set<MoveOnly> C; - static_assert(std::is_nothrow_move_assignable<C>::value, ""); - } - { - typedef std::unordered_set<MoveOnly, std::hash<MoveOnly>, - std::equal_to<MoveOnly>, test_allocator<MoveOnly>> C; - static_assert(!std::is_nothrow_move_assignable<C>::value, ""); - } -#if defined(_LIBCPP_VERSION) - { - typedef std::unordered_set<MoveOnly, std::hash<MoveOnly>, - std::equal_to<MoveOnly>, other_allocator<MoveOnly>> C; - static_assert(std::is_nothrow_move_assignable<C>::value, ""); - } -#endif // _LIBCPP_VERSION - { - typedef std::unordered_set<MoveOnly, some_hash<MoveOnly>> C; - static_assert(!std::is_nothrow_move_assignable<C>::value, ""); - } - { - typedef std::unordered_set<MoveOnly, std::hash<MoveOnly>, - some_comp<MoveOnly>> C; - static_assert(!std::is_nothrow_move_assignable<C>::value, ""); - } - - return 0; -} diff --git libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array.pass.cpp libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array.pass.cpp index 1d763d6caba6..01387feed67b 100644 --- libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array.pass.cpp +++ libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array.pass.cpp @@ -14,6 +14,7 @@ // ADDITIONAL_COMPILE_FLAGS(clang-18): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(apple-clang-15): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(apple-clang-16): -fsized-deallocation +// ADDITIONAL_COMPILE_FLAGS(apple-clang-17): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(target=x86_64-w64-windows-gnu): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(target=i686-w64-windows-gnu): -fsized-deallocation diff --git libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete.pass.cpp libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete.pass.cpp index 462037e53374..06d3b0e5b3c3 100644 --- libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete.pass.cpp +++ libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete.pass.cpp @@ -14,6 +14,7 @@ // ADDITIONAL_COMPILE_FLAGS(clang-18): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(apple-clang-15): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(apple-clang-16): -fsized-deallocation +// ADDITIONAL_COMPILE_FLAGS(apple-clang-17): -fsized-deallocation // 
ADDITIONAL_COMPILE_FLAGS(target=x86_64-w64-windows-gnu): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(target=i686-w64-windows-gnu): -fsized-deallocation diff --git libcxx/test/std/numerics/c.math/signbit.pass.cpp libcxx/test/std/numerics/c.math/signbit.pass.cpp index 143baf1fec94..b5e63dedf136 100644 --- libcxx/test/std/numerics/c.math/signbit.pass.cpp +++ libcxx/test/std/numerics/c.math/signbit.pass.cpp @@ -12,7 +12,7 @@ // UNSUPPORTED: windows // These compilers don't support constexpr `__builtin_signbit` yet. -// UNSUPPORTED: clang-18, clang-19, apple-clang-15, apple-clang-16 +// UNSUPPORTED: clang-18, clang-19, apple-clang-15, apple-clang-16, apple-clang-17 // XFAIL: FROZEN-CXX03-HEADERS-FIXME diff --git libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.pass.cpp libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.pass.cpp index 24adec37431e..681ad13a07df 100644 --- libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.pass.cpp +++ libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // These compilers don't support __builtin_is_implicit_lifetime yet. -// UNSUPPORTED: clang-18, clang-19, gcc-14, apple-clang-15, apple-clang-16 +// UNSUPPORTED: clang-18, clang-19, gcc-14, apple-clang-15, apple-clang-16, apple-clang-17 // <type_traits> diff --git libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.verify.cpp libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.verify.cpp index 4bcb10d0b757..34462f9bf0ec 100644 --- libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.verify.cpp +++ libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.verify.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // These compilers don't support __builtin_is_implicit_lifetime yet. -// UNSUPPORTED: clang-18, clang-19, gcc-14, apple-clang-15, apple-clang-16 +// UNSUPPORTED: clang-18, clang-19, gcc-14, apple-clang-15, apple-clang-16, apple-clang-17 // <type_traits> diff --git lld/ELF/Driver.cpp lld/ELF/Driver.cpp index 9d0c992c1e85..acbc97b331b0 100644 --- lld/ELF/Driver.cpp +++ lld/ELF/Driver.cpp @@ -2413,7 +2413,7 @@ static void findKeepUniqueSections(Ctx &ctx, opt::InputArgList &args) { // or DSOs, so we conservatively mark them as address-significant. 
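Context for the Driver.cpp hunk that follows: under `--icf=safe`, any symbol that another module could compare by address must keep a unique address, so the section defining it is exempted from folding. A hedged, heavily simplified model of the criterion being switched from `includeInDynsym(ctx)` to the cached `isExported` bit (types invented, not lld's real API):

```cpp
#include <vector>

// Invented stand-ins for lld's symbol machinery.
struct Sym {
  bool isExported;         // will be visible in .dynsym
  bool addressSignificant; // its section must keep a unique address
};

// Under --icf=safe, anything another DSO could compare by address is
// conservatively marked address-significant, exempting it from folding.
void markExportedAddrsig(std::vector<Sym *> &symtab) {
  for (Sym *sym : symtab)
    if (sym->isExported)   // formerly: sym->includeInDynsym(ctx)
      sym->addressSignificant = true;
}
```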
bool icfSafe = ctx.arg.icf == ICFLevel::Safe; for (Symbol *sym : ctx.symtab->getSymbols()) - if (sym->includeInDynsym(ctx)) + if (sym->isExported) markAddrsig(icfSafe, sym); // Visit the address-significance table in each object file and mark each @@ -2554,7 +2554,8 @@ void LinkerDriver::compileBitcodeFiles(bool skipLinkedOutput) { for (Symbol *sym : obj->getGlobalSymbols()) { if (!sym->isDefined()) continue; - if (ctx.hasDynsym && sym->includeInDynsym(ctx)) + if (ctx.hasDynsym && ctx.arg.exportDynamic && + sym->computeBinding(ctx) != STB_LOCAL) sym->isExported = true; if (sym->hasVersionSuffix) sym->parseSymbolVersion(ctx); diff --git lld/ELF/InputFiles.cpp lld/ELF/InputFiles.cpp index 42d0e4c202ec..16943c484d96 100644 --- lld/ELF/InputFiles.cpp +++ lld/ELF/InputFiles.cpp @@ -1574,7 +1574,7 @@ template <class ELFT> void SharedFile::parse() { } Symbol *s = ctx.symtab->addSymbol( Undefined{this, name, sym.getBinding(), sym.st_other, sym.getType()}); - s->exportDynamic = true; + s->isExported = true; if (sym.getBinding() != STB_WEAK && ctx.arg.unresolvedSymbolsInShlib != UnresolvedPolicy::Ignore) requiredSymbols.push_back(s); @@ -1771,7 +1771,7 @@ static void createBitcodeSymbol(Ctx &ctx, Symbol *&sym, nullptr); // The definition can be omitted if all bitcode definitions satisfy // `canBeOmittedFromSymbolTable()` and isUsedInRegularObj is false. - // The latter condition is tested in Symbol::includeInDynsym. + // The latter condition is tested in parseVersionAndComputeIsPreemptible. sym->ltoCanOmit = objSym.canBeOmittedFromSymbolTable() && (!sym->isDefined() || sym->ltoCanOmit); sym->resolve(ctx, newSym); diff --git lld/ELF/ScriptLexer.cpp lld/ELF/ScriptLexer.cpp index e0adf2e5b0c8..8db44f55505f 100644 --- lld/ELF/ScriptLexer.cpp +++ lld/ELF/ScriptLexer.cpp @@ -105,7 +105,7 @@ void ScriptLexer::lex() { curBuf = buffers.pop_back_val(); continue; } - curTokState = inExpr; + curTokState = lexState; // Quoted token. Note that double-quote characters are parts of a token // because, in a glob match context, only unquoted tokens are interpreted @@ -142,7 +142,13 @@ void ScriptLexer::lex() { // C-like languages, so that you can write "file-name.cpp" as one bare // token. size_t pos; - if (inExpr) { + switch (lexState) { + case State::Script: + pos = s.find_first_not_of( + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" + "0123456789_.$/\\~=+[]*?-!^:"); + break; + case State::Expr: pos = s.find_first_not_of( "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" "0123456789_.$"); @@ -150,10 +156,7 @@ void ScriptLexer::lex() { ((s[0] == s[1] && strchr("<>&|", s[0])) || is_contained({"==", "!=", "<=", ">=", "<<", ">>"}, s.substr(0, 2)))) pos = 2; - } else { - pos = s.find_first_not_of( - "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" - "0123456789_.$/\\~=+[]*?-!^:"); + break; } if (pos == 0) @@ -208,8 +211,8 @@ StringRef ScriptLexer::next() { } StringRef ScriptLexer::peek() { - // curTok is invalid if curTokState and inExpr mismatch. - if (curTok.size() && curTokState != inExpr) { + // curTok is invalid if curTokState and lexState mismatch. + if (curTok.size() && curTokState != lexState) { curBuf.s = StringRef(curTok.data(), curBuf.s.end() - curTok.data()); curTok = {}; } diff --git lld/ELF/ScriptLexer.h lld/ELF/ScriptLexer.h index d689a7e108f9..be691022f538 100644 --- lld/ELF/ScriptLexer.h +++ lld/ELF/ScriptLexer.h @@ -41,6 +41,11 @@ protected: // Used to detect INCLUDE() cycles. 
llvm::DenseSet<StringRef> activeFilenames; + enum class State { + Script, + Expr, + }; + struct Token { StringRef str; explicit operator bool() const { return !str.empty(); } @@ -54,8 +59,9 @@ protected: // expression state changes. StringRef curTok; size_t prevTokLine = 1; - // The inExpr state when curTok is cached. - bool curTokState = false; + // The lex state when curTok is cached. + State curTokState = State::Script; + State lexState = State::Script; bool eof = false; public: @@ -75,7 +81,6 @@ public: MemoryBufferRef getCurrentMB(); std::vector<MemoryBufferRef> mbs; - bool inExpr = false; private: StringRef getLine(); diff --git lld/ELF/ScriptParser.cpp lld/ELF/ScriptParser.cpp index 06a22613ee93..a10af9565a1d 100644 --- lld/ELF/ScriptParser.cpp +++ lld/ELF/ScriptParser.cpp @@ -289,7 +289,7 @@ void ScriptParser::readLinkerScript() { void ScriptParser::readDefsym() { if (errCount(ctx)) return; - inExpr = true; + SaveAndRestore saved(lexState, State::Expr); StringRef name = readName(); expect("="); Expr e = readExpr(); @@ -954,8 +954,8 @@ bool ScriptParser::readSectionDirective(OutputSection *cmd, StringRef tok) { // https://sourceware.org/binutils/docs/ld/Output-Section-Type.html void ScriptParser::readSectionAddressType(OutputSection *cmd) { if (consume("(")) { - // Temporarily set inExpr to support TYPE=<value> without spaces. - SaveAndRestore saved(inExpr, true); + // Temporarily set lexState to support TYPE=<value> without spaces. + SaveAndRestore saved(lexState, State::Expr); if (readSectionDirective(cmd, peek())) return; cmd->addrExpr = readExpr(); @@ -965,7 +965,7 @@ void ScriptParser::readSectionAddressType(OutputSection *cmd) { } if (consume("(")) { - SaveAndRestore saved(inExpr, true); + SaveAndRestore saved(lexState, State::Expr); StringRef tok = peek(); if (!readSectionDirective(cmd, tok)) setError("unknown section directive: " + tok); @@ -1087,10 +1087,10 @@ OutputDesc *ScriptParser::readOutputSectionDescription(StringRef outSec) { osec->phdrs = readOutputSectionPhdrs(); if (peek() == "=" || peek().starts_with("=")) { - inExpr = true; + lexState = State::Expr; consume("="); osec->filler = readFill(); - inExpr = false; + lexState = State::Script; } // Consume optional comma following output section command. @@ -1162,7 +1162,7 @@ SymbolAssignment *ScriptParser::readAssignment(StringRef tok) { bool savedSeenRelroEnd = ctx.script->seenRelroEnd; const StringRef op = peek(); { - SaveAndRestore saved(inExpr, true); + SaveAndRestore saved(lexState, State::Expr); if (op.starts_with("=")) { // Support = followed by an expression without whitespace. cmd = readSymbolAssignment(unquote(tok)); @@ -1235,7 +1235,7 @@ SymbolAssignment *ScriptParser::readSymbolAssignment(StringRef name) { Expr ScriptParser::readExpr() { // Our lexer is context-aware. Set the in-expression bit so that // they apply different tokenization rules. 
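The `State::Script`/`State::Expr` split shown above controls which characters may continue a bare token: in script context, glob and file-name characters such as `-`, `*`, and `:` are token constituents, while in expression context they must terminate the token so they can act as operators. A standalone sketch of just the two character sets (illustrative only, not the real lexer):

```cpp
#include <cstddef>
#include <string>
#include <string_view>

enum class State { Script, Expr };

// Returns the index of the first character that cannot continue a bare
// token, or npos if the whole input is one token.
std::size_t tokenEnd(std::string_view s, State st) {
  static const std::string alnum =
      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_.$";
  if (st == State::Script) // file names and globs: '-', '*', '?' etc. allowed
    return s.find_first_not_of(alnum + "/\\~=+[]*?-!^:");
  return s.find_first_not_of(alnum); // Expr: '-' etc. become operators
}

// tokenEnd("libc-2.so", State::Script) == npos (one whole token), while
// tokenEnd("libc-2.so", State::Expr) == 4 (the token stops before '-').
```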
- SaveAndRestore saved(inExpr, true); + SaveAndRestore saved(lexState, State::Expr); Expr e = readExpr1(readPrimary(), 0); return e; } @@ -1452,12 +1452,11 @@ std::pair<uint64_t, uint64_t> ScriptParser::readInputSectionFlags() { StringRef ScriptParser::readParenName() { expect("("); - bool orig = inExpr; - inExpr = false; - StringRef tok = readName(); - inExpr = orig; + auto saved = std::exchange(lexState, State::Script); + StringRef name = readName(); + lexState = saved; expect(")"); - return tok; + return name; } static void checkIfExists(LinkerScript &script, const OutputSection &osec, diff --git lld/ELF/SymbolTable.cpp lld/ELF/SymbolTable.cpp index 975700505fac..b8a70d4e898f 100644 --- lld/ELF/SymbolTable.cpp +++ lld/ELF/SymbolTable.cpp @@ -203,7 +203,7 @@ void SymbolTable::handleDynamicList() { syms = findByVersion(ver); for (Symbol *sym : syms) - sym->exportDynamic = sym->inDynamicList = true; + sym->isExported = sym->inDynamicList = true; } } @@ -350,10 +350,8 @@ void SymbolTable::scanVersionScript() { assignAsterisk(pat, &v, true); } - // isPreemptible is false at this point. To correctly compute the binding of a - // Defined (which is used by includeInDynsym(ctx)), we need to know if it is - // VER_NDX_LOCAL or not. Compute symbol versions before handling - // --dynamic-list. + // Handle --dynamic-list. If a specified symbol is also matched by local: in a + // version script, the version script takes precedence. handleDynamicList(); } diff --git lld/ELF/Symbols.cpp lld/ELF/Symbols.cpp index b10391c65dfd..890877cb1bc0 100644 --- lld/ELF/Symbols.cpp +++ lld/ELF/Symbols.cpp @@ -268,16 +268,6 @@ uint8_t Symbol::computeBinding(Ctx &ctx) const { return binding; } -bool Symbol::includeInDynsym(Ctx &ctx) const { - if (computeBinding(ctx) == STB_LOCAL) - return false; - if (!isDefined() && !isCommon()) - return true; - - return exportDynamic || - (ctx.arg.exportDynamic && (isUsedInRegularObj || !ltoCanOmit)); -} - // Print out a log message for --trace-symbol. void elf::printTraceSymbol(const Symbol &sym, StringRef name) { std::string s; @@ -374,9 +364,18 @@ void elf::parseVersionAndComputeIsPreemptible(Ctx &ctx) { for (Symbol *sym : ctx.symtab->getSymbols()) { if (sym->hasVersionSuffix) sym->parseSymbolVersion(ctx); - if (hasDynsym) { - sym->isExported = sym->includeInDynsym(ctx); - sym->isPreemptible = sym->isExported && computeIsPreemptible(ctx, *sym); + if (!hasDynsym) + continue; + if (sym->computeBinding(ctx) == STB_LOCAL) { + sym->isExported = false; + continue; + } + if (!sym->isDefined() && !sym->isCommon()) { + sym->isPreemptible = computeIsPreemptible(ctx, *sym); + } else if (ctx.arg.exportDynamic && + (sym->isUsedInRegularObj || !sym->ltoCanOmit)) { + sym->isExported = true; + sym->isPreemptible = computeIsPreemptible(ctx, *sym); } } } @@ -655,7 +654,7 @@ void Symbol::resolve(Ctx &ctx, const LazySymbol &other) { } void Symbol::resolve(Ctx &ctx, const SharedSymbol &other) { - exportDynamic = true; + isExported = true; if (isPlaceholder()) { other.overwrite(*this); return; diff --git lld/ELF/Symbols.h lld/ELF/Symbols.h index 48df6f60db86..64f2f6eaa8d0 100644 --- lld/ELF/Symbols.h +++ lld/ELF/Symbols.h @@ -105,6 +105,9 @@ public: uint8_t partition; // True if this symbol is preemptible at load time. + // + // Primarily set in two locations, (a) parseVersionAndComputeIsPreemptible and + // (b) demoteSymbolsAndComputeIsPreemptible. 
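The rewritten loop in `parseVersionAndComputeIsPreemptible` above now distinguishes three cases instead of funneling everything through the removed `includeInDynsym`. A hedged restatement with invented field names, leaving lld's real preemptibility check as a stub:

```cpp
// Invented, simplified restatement of the per-symbol pass above.
struct SymModel {
  bool localBinding, definedOrCommon, usedInRegularObj, ltoCanOmit;
  bool isExported = false, isPreemptible = false;
};

// Stand-in for lld's real visibility/relocation checks.
static bool computeIsPreemptible(const SymModel &) { return true; }

void classify(SymModel &s, bool hasDynsym, bool exportDynamic) {
  if (!hasDynsym)
    return;
  if (s.localBinding) {     // STB_LOCAL: never exported or preemptible
    s.isExported = false;
    return;
  }
  if (!s.definedOrCommon) { // undefined: may be preemptible, not exported
    s.isPreemptible = computeIsPreemptible(s);
  } else if (exportDynamic && (s.usedInRegularObj || !s.ltoCanOmit)) {
    s.isExported = true;    // defined and requested via --export-dynamic
    s.isPreemptible = computeIsPreemptible(s);
  }
}
```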
LLVM_PREFERRED_TYPE(bool) uint8_t isPreemptible : 1; @@ -131,16 +134,9 @@ public: // - If -shared or --export-dynamic is specified, any symbol in an object // file/bitcode sets this property, unless suppressed by LTO // canBeOmittedFromSymbolTable(). - // - // Primarily set in two locations, (a) after parseSymbolVersion and - // (b) during demoteSymbols. LLVM_PREFERRED_TYPE(bool) uint8_t isExported : 1; - // Used to compute isExported. Set when defined or referenced by a SharedFile. - LLVM_PREFERRED_TYPE(bool) - uint8_t exportDynamic : 1; - LLVM_PREFERRED_TYPE(bool) uint8_t ltoCanOmit : 1; @@ -159,7 +155,6 @@ public: stOther = (stOther & ~3) | visibility; } - bool includeInDynsym(Ctx &) const; uint8_t computeBinding(Ctx &) const; bool isGlobal() const { return binding == llvm::ELF::STB_GLOBAL; } bool isWeak() const { return binding == llvm::ELF::STB_WEAK; } @@ -247,8 +242,8 @@ protected: Symbol(Kind k, InputFile *file, StringRef name, uint8_t binding, uint8_t stOther, uint8_t type) : file(file), nameData(name.data()), nameSize(name.size()), type(type), - binding(binding), stOther(stOther), symbolKind(k), exportDynamic(false), - ltoCanOmit(false), archSpecificBit(false) {} + binding(binding), stOther(stOther), symbolKind(k), ltoCanOmit(false), + archSpecificBit(false) {} void overwrite(Symbol &sym, Kind k) const { if (sym.traced) diff --git lld/ELF/SyntheticSections.cpp lld/ELF/SyntheticSections.cpp index eb07d82fc960..ffa6e3c008c4 100644 --- lld/ELF/SyntheticSections.cpp +++ lld/ELF/SyntheticSections.cpp @@ -4776,8 +4776,8 @@ template <class ELFT> void elf::createSyntheticSections(Ctx &ctx) { add(*part.buildId); } - // dynSymTab is always present to simplify sym->includeInDynsym(ctx) in - // finalizeSections. + // dynSymTab is always present to simplify several finalizeSections + // functions. part.dynStrTab = std::make_unique<StringTableSection>(ctx, ".dynstr", true); part.dynSymTab = std::make_unique<SymbolTableSection<ELFT>>(ctx, *part.dynStrTab); diff --git lld/ELF/Writer.cpp lld/ELF/Writer.cpp index 6c7bcee02047..3ba1cdbce572 100644 --- lld/ELF/Writer.cpp +++ lld/ELF/Writer.cpp @@ -296,13 +296,12 @@ static void demoteSymbolsAndComputeIsPreemptible(Ctx &ctx) { sym->type) .overwrite(*sym); sym->versionId = VER_NDX_GLOBAL; - if (hasDynsym && sym->includeInDynsym(ctx)) - sym->isExported = true; } } if (hasDynsym) - sym->isPreemptible = sym->isExported && computeIsPreemptible(ctx, *sym); + sym->isPreemptible = (sym->isUndefined() || sym->isExported) && + computeIsPreemptible(ctx, *sym); } } @@ -1841,9 +1840,10 @@ template <class ELFT> void Writer<ELFT>::finalizeSections() { // If the previous code block defines any non-hidden symbols (e.g. // __global_pointer$), they may be exported. - if (ctx.hasDynsym) + if (ctx.hasDynsym && ctx.arg.exportDynamic) for (Symbol *sym : ctx.synthesizedSymbols) - sym->isExported = sym->includeInDynsym(ctx); + if (sym->computeBinding(ctx) != STB_LOCAL) + sym->isExported = true; demoteSymbolsAndComputeIsPreemptible(ctx); @@ -1931,7 +1931,8 @@ template <class ELFT> void Writer<ELFT>::finalizeSections() { // computeBinding might localize a linker-synthesized hidden symbol // (e.g. __global_pointer$) that was considered exported. 
- if (sym->isExported && !sym->isLocal()) { + if (ctx.hasDynsym && (sym->isUndefined() || sym->isExported) && + !sym->isLocal()) { ctx.partitions[sym->partition - 1].dynSymTab->addSymbol(sym); if (auto *file = dyn_cast<SharedFile>(sym->file)) if (file->isNeeded && !sym->isUndefined()) diff --git lld/MachO/BPSectionOrderer.cpp lld/MachO/BPSectionOrderer.cpp index e2f7a387deeb..689afd67712a 100644 --- lld/MachO/BPSectionOrderer.cpp +++ lld/MachO/BPSectionOrderer.cpp @@ -26,7 +26,7 @@ struct BPOrdererMachO; } template <> struct lld::BPOrdererTraits<struct BPOrdererMachO> { using Section = macho::InputSection; - using Symbol = macho::Symbol; + using Defined = macho::Defined; }; namespace { struct BPOrdererMachO : lld::BPOrderer<BPOrdererMachO> { @@ -34,12 +34,8 @@ struct BPOrdererMachO : lld::BPOrderer<BPOrdererMachO> { static bool isCodeSection(const Section &sec) { return macho::isCodeSection(&sec); } - static SmallVector<Symbol *, 0> getSymbols(const Section &sec) { - SmallVector<Symbol *, 0> symbols; - for (auto *sym : sec.symbols) - if (auto *d = llvm::dyn_cast_or_null<Defined>(sym)) - symbols.emplace_back(d); - return symbols; + static ArrayRef<Defined *> getSymbols(const Section &sec) { + return sec.symbols; } // Linkage names can be prefixed with "_" or "l_" on Mach-O. See @@ -80,17 +76,11 @@ struct BPOrdererMachO : lld::BPOrderer<BPOrdererMachO> { hashes.erase(std::unique(hashes.begin(), hashes.end()), hashes.end()); } - static llvm::StringRef getSymName(const Symbol &sym) { return sym.getName(); } - static uint64_t getSymValue(const Symbol &sym) { - if (auto *d = dyn_cast<Defined>(&sym)) - return d->value; - return 0; - } - static uint64_t getSymSize(const Symbol &sym) { - if (auto *d = dyn_cast<Defined>(&sym)) - return d->size; - return 0; + static llvm::StringRef getSymName(const Defined &sym) { + return sym.getName(); } + static uint64_t getSymValue(const Defined &sym) { return sym.value; } + static uint64_t getSymSize(const Defined &sym) { return sym.size; } private: static uint64_t @@ -120,7 +110,7 @@ DenseMap<const InputSection *, int> lld::macho::runBalancedPartitioning( bool compressionSortStartupFunctions, bool verbose) { // Collect candidate sections and associated symbols. 
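The hunks below replace `DenseSet<unsigned>` with `std::set<unsigned>` (and, further down, a `DenseMap` with a `MapVector`). The patch does not state its motivation, but a plausible one is iteration-order determinism, which hash-based containers do not guarantee and which matters for reproducible section orderings. A trivial illustration:

```cpp
#include <cstdio>
#include <set>

int main() {
  // std::set visits keys in ascending order on every run, so the section
  // indices tied to a root symbol are processed deterministically. A
  // hash-based container such as llvm::DenseSet makes no such guarantee.
  std::set<unsigned> sectionIdxs = {42, 7, 19};
  for (unsigned idx : sectionIdxs)
    std::printf("%u\n", idx); // always prints 7, 19, 42
}
```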
SmallVector<InputSection *> sections; - DenseMap<CachedHashStringRef, DenseSet<unsigned>> rootSymbolToSectionIdxs; + DenseMap<CachedHashStringRef, std::set<unsigned>> rootSymbolToSectionIdxs; for (const auto *file : inputFiles) { for (auto *sec : file->sections) { for (auto &subsec : sec->subsections) { @@ -141,8 +131,8 @@ DenseMap<const InputSection *, int> lld::macho::runBalancedPartitioning( } } - return BPOrdererMachO::computeOrder(profilePath, forFunctionCompression, - forDataCompression, - compressionSortStartupFunctions, verbose, - sections, rootSymbolToSectionIdxs); + return BPOrdererMachO().computeOrder(profilePath, forFunctionCompression, + forDataCompression, + compressionSortStartupFunctions, verbose, + sections, rootSymbolToSectionIdxs); } diff --git lld/MachO/Relocations.cpp lld/MachO/Relocations.cpp index aac0e1bd3c9e..78cc13388d6e 100644 --- lld/MachO/Relocations.cpp +++ lld/MachO/Relocations.cpp @@ -32,7 +32,7 @@ InputSection *Reloc::getReferentInputSection() const { } StringRef Reloc::getReferentString() const { - if (auto *isec = referent.dyn_cast<InputSection *>()) { + if (auto *isec = dyn_cast<InputSection *>(referent)) { const auto *cisec = dyn_cast<CStringInputSection>(isec); assert(cisec && "referent must be a CStringInputSection"); return cisec->getStringRefAtOffset(addend); diff --git lld/include/lld/Common/BPSectionOrdererBase.inc lld/include/lld/Common/BPSectionOrdererBase.inc index 9a2ee4d50738..6c7c4a188d13 100644 --- lld/include/lld/Common/BPSectionOrdererBase.inc +++ lld/include/lld/Common/BPSectionOrdererBase.inc @@ -22,7 +22,7 @@ #include "lld/Common/ErrorHandler.h" #include "llvm/ADT/CachedHashString.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" @@ -35,6 +35,7 @@ #include "llvm/Support/VirtualFileSystem.h" #include <memory> #include <optional> +#include <set> #define DEBUG_TYPE "bp-section-orderer" @@ -46,7 +47,7 @@ template <class D> struct BPOrdererTraits; template <class D> struct BPOrderer { using Section = typename BPOrdererTraits<D>::Section; - using Symbol = typename BPOrdererTraits<D>::Symbol; + using Defined = typename BPOrdererTraits<D>::Defined; // Compute a section order using the Balanced Partitioning algorithm. // @@ -56,12 +57,12 @@ template <class D> struct BPOrderer { // program startup. // * compressionSortStartupFunctions: if profilePath is specified, allocate // extra utility vertices to prioritize nearby function similarity. 
- static auto - computeOrder(llvm::StringRef profilePath, bool forFunctionCompression, - bool forDataCompression, bool compressionSortStartupFunctions, - bool verbose, llvm::ArrayRef<Section *> sections, - const DenseMap<CachedHashStringRef, DenseSet<unsigned>> - &rootSymbolToSectionIdxs) + auto computeOrder(llvm::StringRef profilePath, bool forFunctionCompression, + bool forDataCompression, + bool compressionSortStartupFunctions, bool verbose, + llvm::ArrayRef<Section *> sections, + const DenseMap<CachedHashStringRef, std::set<unsigned>> + &rootSymbolToSectionIdxs) -> llvm::DenseMap<const Section *, int>; }; } // namespace lld @@ -73,7 +74,7 @@ static SmallVector<std::pair<unsigned, UtilityNodes>> getUnsForCompression( ArrayRef<const typename D::Section *> sections, const DenseMap<const void *, uint64_t> §ionToIdx, ArrayRef<unsigned> sectionIdxs, - DenseMap<unsigned, SmallVector<unsigned>> *duplicateSectionIdxs, + DenseMap<unsigned, SmallVector<unsigned, 0>> *duplicateSectionIdxs, BPFunctionNode::UtilityNodeT &maxUN) { TimeTraceScope timeScope("Build nodes for compression"); @@ -88,7 +89,7 @@ static SmallVector<std::pair<unsigned, UtilityNodes>> getUnsForCompression( hashes.clear(); } - DenseMap<uint64_t, unsigned> hashFrequency; + MapVector<uint64_t, unsigned> hashFrequency; for (auto &[sectionIdx, hashes] : sectionHashes) for (auto hash : hashes) ++hashFrequency[hash]; @@ -156,7 +157,7 @@ auto BPOrderer<D>::computeOrder( StringRef profilePath, bool forFunctionCompression, bool forDataCompression, bool compressionSortStartupFunctions, bool verbose, ArrayRef<Section *> sections, - const DenseMap<CachedHashStringRef, DenseSet<unsigned>> + const DenseMap<CachedHashStringRef, std::set<unsigned>> &rootSymbolToSectionIdxs) -> DenseMap<const Section *, int> { TimeTraceScope timeScope("Setup Balanced Partitioning"); DenseMap<const void *, uint64_t> sectionToIdx; @@ -257,7 +258,7 @@ auto BPOrderer<D>::computeOrder( // Map a section index (order directly) to a list of duplicate section indices // (not ordered directly). 
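The signature change above turns `computeOrder` from a static member into an instance member, leaning on the file's existing CRTP arrangement: shared code in `BPOrderer<D>` reaches port-specific hooks through the derived object (see the `static_cast<D *>(this)` dispatch in the next hunk). A hedged toy model of that pattern, with invented names:

```cpp
// Invented miniature of the BPOrderer/BPOrdererTraits CRTP pairing.
template <class D> struct OrdererTraits; // specialized by each port

template <class D> struct Orderer {
  using Section = typename OrdererTraits<D>::Section;
  // A member function (not static) can dispatch through the derived object.
  int order(const Section &sec) { return static_cast<D *>(this)->weight(sec); }
};

struct MachOOrderer; // forward-declared so the traits can name it
template <> struct OrdererTraits<MachOOrderer> { using Section = int; };

struct MachOOrderer : Orderer<MachOOrderer> {
  int weight(const int &sec) { return sec * 2; }
};

int main() { return MachOOrderer().order(21) == 42 ? 0 : 1; }
```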
- DenseMap<unsigned, SmallVector<unsigned>> duplicateSectionIdxs; + DenseMap<unsigned, SmallVector<unsigned, 0>> duplicateSectionIdxs; auto unsForFunctionCompression = getUnsForCompression<D>( sections, sectionToIdx, sectionIdxsForFunctionCompression, &duplicateSectionIdxs, maxUN); @@ -357,7 +358,7 @@ auto BPOrderer<D>::computeOrder( const uint64_t pageSize = (1 << 14); uint64_t currentAddress = 0; for (const auto *isec : orderedSections) { - for (auto *sym : D::getSymbols(*isec)) { + for (auto *sym : static_cast<D *>(this)->getSymbols(*isec)) { uint64_t startAddress = currentAddress + D::getSymValue(*sym); uint64_t endAddress = startAddress + D::getSymSize(*sym); uint64_t firstPage = startAddress / pageSize; diff --git lld/test/ELF/icf-safe.s lld/test/ELF/icf-safe.s index 96776feccbc6..538153260993 100644 --- lld/test/ELF/icf-safe.s +++ lld/test/ELF/icf-safe.s @@ -1,16 +1,19 @@ # REQUIRES: x86 +# RUN: llvm-mc -filetype=obj -triple=x86_64 %S/Inputs/shared.s -o %ta.o +# RUN: ld.lld -shared -soname=ta %ta.o -o %ta.so # RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t1.o # RUN: llvm-objcopy %t1.o %t1copy.o # RUN: llvm-objcopy --localize-symbol=h1 %t1.o %t1changed.o # RUN: ld.lld -r %t1.o -o %t1reloc.o # RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %S/Inputs/icf-safe.s -o %t2.o -# RUN: ld.lld %t1.o %t2.o -o %t2 --icf=safe --print-icf-sections | FileCheck %s +# RUN: ld.lld %t1.o %t2.o -o %t2 --icf=safe --print-icf-sections --export-dynamic | FileCheck %s # RUN: ld.lld %t1copy.o %t2.o -o %t2 --icf=safe --print-icf-sections | FileCheck %s # RUN: ld.lld %t1.o %t2.o -o %t3 --icf=safe --print-icf-sections -shared | FileCheck --check-prefix=EXPORT %s -# RUN: ld.lld %t1.o %t2.o -o %t3 --icf=safe --print-icf-sections --export-dynamic | FileCheck --check-prefix=EXPORT %s +## Exported symbols are suppressed for ICF when dynamic linking is enabled. +# RUN: ld.lld %t1.o %t2.o %ta.so -o %t3 --icf=safe --print-icf-sections --export-dynamic | FileCheck --check-prefix=EXPORT %s # RUN: ld.lld %t1.o %t2.o -o %t2 --icf=all --print-icf-sections | FileCheck --check-prefix=ALL %s -# RUN: ld.lld %t1.o %t2.o -o %t2 --icf=all --print-icf-sections --export-dynamic | FileCheck --check-prefix=ALL-EXPORT %s +# RUN: ld.lld %t1.o %t2.o %ta.so -o %t2 --icf=all --print-icf-sections --export-dynamic | FileCheck --check-prefix=ALL-EXPORT %s # RUN: ld.lld %t1changed.o -o %t4 --icf=safe 2>&1 | FileCheck --check-prefix=SH_LINK_0 %s # RUN: ld.lld %t1reloc.o -o %t4 --icf=safe 2>&1 | FileCheck --check-prefix=SH_LINK_0 %s diff --git lldb/bindings/interface/SBProcessDocstrings.i lldb/bindings/interface/SBProcessDocstrings.i index 1b98a79e4f6d..b5b22052a609 100644 --- lldb/bindings/interface/SBProcessDocstrings.i +++ lldb/bindings/interface/SBProcessDocstrings.i @@ -258,3 +258,14 @@ SBProcess supports thread iteration. For example (from test/lldbutil.py), :: Deallocates the block of memory (previously allocated using AllocateMemory) given in the argument." ) lldb::SBProcess::DeallocateMemory; + +%feature("docstring", " + Get a list of all the memory regions associated with this process. 
+ ``` + readable_regions = [] + for region in process.GetMemoryRegions(): + if region.IsReadable(): + readable_regions.append(region) + ``` +" +) lldb::SBProcess::GetMemoryRegions; diff --git lldb/docs/resources/test.rst lldb/docs/resources/test.rst index 5f1bd0d57383..1272aeeeb969 100644 --- lldb/docs/resources/test.rst +++ lldb/docs/resources/test.rst @@ -586,24 +586,60 @@ To run a specific test, pass a filter, for example: Running the Test Suite Remotely ``````````````````````````````` -Running the test-suite remotely is similar to the process of running a local -test suite, but there are two things to have in mind: - -1. You must have the lldb-server running on the remote system, ready to accept - multiple connections. For more information on how to setup remote debugging - see the Remote debugging page. -2. You must tell the test-suite how to connect to the remote system. This is - achieved using the ``LLDB_TEST_PLATFORM_URL``, ``LLDB_TEST_PLATFORM_WORKING_DIR`` - flags to cmake, and ``--platform-name`` parameter to ``dotest.py``. - These parameters correspond to the platform select and platform connect - LLDB commands. You will usually also need to specify the compiler and - architecture for the remote system. -3. Remote Shell tests execution is currently supported only for Linux target - platform. It's triggered when ``LLDB_TEST_SYSROOT`` is provided for building - test sources. It can be disabled by setting ``LLDB_TEST_SHELL_DISABLE_REMOTE=On``. - Shell tests are not guaranteed to pass against remote target if the compiler - being used is other than Clang. +1. Run lldb-server on the remote system, so that it can accept multiple connections. + This is called "platform" mode: + :: + + lldb-server platform --server --listen 0.0.0.0:<port A> --gdbserver-port <port B> + + This assumes that ``port A`` and ``port B`` on the remote system can be reached + from your host system. If your remote system is a simulator on your host machine, + you may need to forward these ports to the host when you start the simulator. + + For more information on how to set up remote debugging see :doc:`/use/remote`. + +2. Tell the test-suite how to connect to the remote system. This is done using the + ``LLDB_TEST_PLATFORM_URL`` and ``LLDB_TEST_PLATFORM_WORKING_DIR`` flags of CMake, + or the ``--platform-name``, ``--platform-url`` and ``--platform-working-dir`` + parameters of ``dotest.py``. These parameters are passed on to the ``platform select`` + and ``platform connect`` LLDB commands when the tests are run. + + You will usually need to specify the compiler and architecture for the + remote system. This is done with CMake options ``LLDB_TEST_COMPILER`` and + ``LLDB_TEST_ARCH``, or the ``dotest.py`` options ``--compiler`` and ``--arch``. + + .. note:: + Even in cases where the two systems are the same architecture and run the + same operating system, there may be version differences between the two + which require you to use a different compiler version for remote testing.
+ + For example, to run tests using ``dotest.py`` on a remote AArch64 Linux system + you might run: + + :: + + ./bin/lldb-dotest --platform-name remote-linux --arch aarch64 --compiler aarch64-none-linux-gnu-gcc --platform-url connect://<remote-ip>:<port A> --platform-working-dir /tmp/test_lldb -p <test-name>.py + + This is the equivalent of: + + * ``LLDB_TEST_ARCH`` = ``aarch64`` + * ``LLDB_TEST_COMPILER`` = ``aarch64-none-linux-gnu-gcc`` + * ``LLDB_TEST_PLATFORM_URL`` = ``connect://<remote-ip>:<port A>`` + * ``LLDB_TEST_PLATFORM_WORKING_DIR`` = ``/tmp/test_lldb`` + + Setting these values using CMake allows you to run ``ninja check-lldb`` to run + tests on the remote system. + + If you have a host build that you sometimes check on a remote system, but otherwise + test on the host, adding arguments to ``dotest.py`` manually is easier. + +.. note:: + Remote Shell test execution is currently supported only for Linux targets. + It is enabled when ``LLDB_TEST_SYSROOT`` is set. Remote Shell testing can + be disabled by setting ``LLDB_TEST_SHELL_DISABLE_REMOTE=On``. Shell tests + are not guaranteed to pass against a remote target if the test compiler is not + Clang. Running tests in QEMU System Emulation Environment `````````````````````````````````````````````````` diff --git lldb/docs/use/variable.rst lldb/docs/use/variable.rst index 22c2bee73fa5..3ad71cb93c51 100644 --- lldb/docs/use/variable.rst +++ lldb/docs/use/variable.rst @@ -1226,13 +1226,13 @@ By default, several categories are created in LLDB: - CoreServices: CS classes - VectorTypes: compact display for several vector types -If you want to use a custom category for your formatters, all the type ... add -provide a --category (-w) option, that names the category to add the formatter +If you want to use a custom category for your formatters, all the ``type ... add`` +commands provide a ``--category`` (``-w``) option that names the category to add the formatter to. To delete the formatter, you then have to specify the correct category. Categories can be in one of two states: enabled and disabled. A category is -initially disabled, and can be enabled using the type category enable command. -To disable an enabled category, the command to use is type category disable. +initially disabled, and can be enabled using the ``type category enable`` command. +To disable an enabled category, the command to use is ``type category disable``. The order in which categories are enabled or disabled is significant, in that LLDB uses that order when looking for formatters. Therefore, when you enable a diff --git lldb/include/lldb/Core/Value.h lldb/include/lldb/Core/Value.h index d0c338ffec0c..3714621b469e 100644 --- lldb/include/lldb/Core/Value.h +++ lldb/include/lldb/Core/Value.h @@ -109,8 +109,10 @@ public: Scalar &ResolveValue(ExecutionContext *exe_ctx, Module *module = nullptr); + /// See comment on m_value to understand what GetScalar returns. const Scalar &GetScalar() const { return m_value; } + /// See comment on m_value to understand what GetScalar returns. Scalar &GetScalar() { return m_value; } size_t ResizeData(size_t len); @@ -148,6 +150,32 @@ public: static ValueType GetValueTypeFromAddressType(AddressType address_type); protected: + /// Represents a value, which can be a scalar, a load address, a file address, + /// or a host address. + /// + /// The interpretation of `m_value` depends on `m_value_type`: + /// - Scalar: `m_value` contains the scalar value. + /// - Load Address: `m_value` contains the load address.
+ /// - File Address: `m_value` contains the file address. + /// - Host Address: `m_value` contains a pointer to the start of the buffer in + /// host memory. + /// Currently, this can point to either: + /// - The `m_data_buffer` of this Value instance (e.g., in DWARF + /// computations). + /// - The `m_data` of a Value Object containing this Value. + // TODO: the GetScalar() API relies on knowledge not codified by the type + // system, making it hard to understand and easy to misuse. + // - Separate the scalar from the variable that contains the address (be it a + // load, file or host address). + // - Rename GetScalar() to something more indicative of what the scalar is, + // like GetScalarOrAddress() for example. + // - Split GetScalar() into two functions, GetScalar() and GetAddress(), which + // verify (or assert) what m_value_type is to make sure users of the class are + // querying the right thing. + // TODO: It's confusing to point to multiple possible buffers when the + // ValueType is a host address. Value should probably always own its buffer. + // Perhaps as a shared pointer with a copy-on-write system if the same buffer + // can be shared by multiple classes. Scalar m_value; CompilerType m_compiler_type; void *m_context = nullptr; diff --git lldb/include/lldb/Expression/ExpressionVariable.h lldb/include/lldb/Expression/ExpressionVariable.h index fc36793b3a47..f5bd93892196 100644 --- lldb/include/lldb/Expression/ExpressionVariable.h +++ lldb/include/lldb/Expression/ExpressionVariable.h @@ -107,9 +107,18 @@ public: FlagType m_flags; // takes elements of Flags - // these should be private + /// These members should be private. + /// @{ + /// A value object whose value's data lives in host (lldb's) memory. lldb::ValueObjectSP m_frozen_sp; + /// The ValueObject counterpart to m_frozen_sp that tracks the value in + /// inferior memory. This object may not always exist; its presence depends on + /// whether it is logical for the value to exist in the inferior memory. For + /// example, when evaluating a C++ expression that generates an r-value, such + /// as a single function call, there is no memory address in the inferior to + /// track.
lldb::ValueObjectSP m_live_sp; + /// @} }; /// \class ExpressionVariableList ExpressionVariable.h diff --git lldb/include/lldb/Interpreter/CommandInterpreter.h lldb/include/lldb/Interpreter/CommandInterpreter.h index 2bafc30cc8e2..910c1d843033 100644 --- lldb/include/lldb/Interpreter/CommandInterpreter.h +++ lldb/include/lldb/Interpreter/CommandInterpreter.h @@ -100,8 +100,7 @@ public: LazyBool stop_on_error, LazyBool stop_on_crash, LazyBool echo_commands, LazyBool echo_comments, LazyBool print_results, LazyBool print_errors, - LazyBool add_to_history, - LazyBool handle_repeats) + LazyBool add_to_history, LazyBool handle_repeats) : m_stop_on_continue(stop_on_continue), m_stop_on_error(stop_on_error), m_stop_on_crash(stop_on_crash), m_echo_commands(echo_commands), m_echo_comment_commands(echo_comments), m_print_results(print_results), @@ -248,13 +247,13 @@ public: enum CommandTypes { eCommandTypesBuiltin = 0x0001, //< native commands such as "frame" eCommandTypesUserDef = 0x0002, //< scripted commands - eCommandTypesUserMW = 0x0004, //< multiword commands (command containers) + eCommandTypesUserMW = 0x0004, //< multiword commands (command containers) eCommandTypesAliases = 0x0008, //< aliases such as "po" - eCommandTypesHidden = 0x0010, //< commands prefixed with an underscore + eCommandTypesHidden = 0x0010, //< commands prefixed with an underscore eCommandTypesAllThem = 0xFFFF //< all commands }; - // The CommandAlias and CommandInterpreter both have a hand in + // The CommandAlias and CommandInterpreter both have a hand in // substituting for alias commands. They work by writing special tokens // in the template form of the Alias command, and then detecting them when the // command is executed. These are the special tokens: @@ -334,9 +333,8 @@ public: /// dummy "contains everything MWC, so we return null here, but /// in this case error.Success is true. - CommandObjectMultiword *VerifyUserMultiwordCmdPath(Args &path, - bool leaf_is_command, - Status &result); + CommandObjectMultiword * + VerifyUserMultiwordCmdPath(Args &path, bool leaf_is_command, Status &result); CommandAlias *AddAlias(llvm::StringRef alias_name, lldb::CommandObjectSP &command_obj_sp, @@ -596,7 +594,7 @@ public: void SetEchoCommentCommands(bool enable); bool GetRepeatPreviousCommand() const; - + bool GetRequireCommandOverwrite() const; const CommandObject::CommandMap &GetUserCommands() const { diff --git lldb/include/lldb/Interpreter/CommandReturnObject.h lldb/include/lldb/Interpreter/CommandReturnObject.h index 9fef59337016..f96da34889a3 100644 --- lldb/include/lldb/Interpreter/CommandReturnObject.h +++ lldb/include/lldb/Interpreter/CommandReturnObject.h @@ -32,9 +32,9 @@ public: ~CommandReturnObject() = default; /// Format any inline diagnostics with an indentation of \c indent. - std::string GetInlineDiagnosticString(unsigned indent); + std::string GetInlineDiagnosticString(unsigned indent) const; - llvm::StringRef GetOutputString() { + llvm::StringRef GetOutputString() const { lldb::StreamSP stream_sp(m_out_stream.GetStreamAtIndex(eStreamStringIndex)); if (stream_sp) return std::static_pointer_cast<StreamString>(stream_sp)->GetString(); @@ -46,7 +46,7 @@ public: /// If \c with_diagnostics is true, all diagnostics are also /// rendered into the string. Otherwise the expectation is that they /// are fetched with \ref GetInlineDiagnosticString(). 
- std::string GetErrorString(bool with_diagnostics = true); + std::string GetErrorString(bool with_diagnostics = true) const; StructuredData::ObjectSP GetErrorData(); Stream &GetOutputStream() { @@ -95,11 +95,11 @@ public: m_err_stream.SetStreamAtIndex(eImmediateStreamIndex, stream_sp); } - lldb::StreamSP GetImmediateOutputStream() { + lldb::StreamSP GetImmediateOutputStream() const { return m_out_stream.GetStreamAtIndex(eImmediateStreamIndex); } - lldb::StreamSP GetImmediateErrorStream() { + lldb::StreamSP GetImmediateErrorStream() const { return m_err_stream.GetStreamAtIndex(eImmediateStreamIndex); } diff --git lldb/include/lldb/Interpreter/Options.h lldb/include/lldb/Interpreter/Options.h index 9a6a17c2793f..864bda6f24c8 100644 --- lldb/include/lldb/Interpreter/Options.h +++ lldb/include/lldb/Interpreter/Options.h @@ -76,12 +76,12 @@ public: // This gets passed the short option as an integer... void OptionSeen(int short_option); - bool VerifyOptions(CommandReturnObject &result); + llvm::Error VerifyOptions(); // Verify that the options given are in the options table and can be used // together, but there may be some required options that are missing (used to // verify options that get folded into command aliases). - bool VerifyPartialOptions(CommandReturnObject &result); + llvm::Error VerifyPartialOptions(); void OutputFormattedUsageText(Stream &strm, const OptionDefinition &option_def, diff --git lldb/include/lldb/Target/Thread.h lldb/include/lldb/Target/Thread.h index ef66fa11574d..9749fd8d575a 100644 --- lldb/include/lldb/Target/Thread.h +++ lldb/include/lldb/Target/Thread.h @@ -470,6 +470,26 @@ public: virtual void ClearStackFrames(); + /// Sets the thread that is backed by this thread. + /// If backed_thread.GetBackingThread() is null, this method also calls + /// backed_thread.SetBackingThread(this). + /// If backed_thread.GetBackingThread() is non-null, asserts that it is equal + /// to `this`. + void SetBackedThread(Thread &backed_thread) { + m_backed_thread = backed_thread.shared_from_this(); + + // Ensure the bidirectional relationship is preserved. Thread *backing_thread = backed_thread.GetBackingThread().get(); assert(backing_thread == nullptr || backing_thread == this); if (backing_thread == nullptr) backed_thread.SetBackingThread(shared_from_this()); + } + + void ClearBackedThread() { m_backed_thread.reset(); } + + /// Returns the thread that is backed by this thread, if any. + lldb::ThreadSP GetBackedThread() const { return m_backed_thread.lock(); } + virtual bool SetBackingThread(const lldb::ThreadSP &thread_sp) { return false; } @@ -1349,6 +1369,9 @@ protected: LazyBool m_override_should_notify; mutable std::unique_ptr<ThreadPlanStack> m_null_plan_stack_up; + /// The Thread backed by this thread, if any. + lldb::ThreadWP m_backed_thread; + private: bool m_extended_info_fetched; // Have we tried to retrieve the m_extended_info // for this thread?
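The new `SetBackedThread` above maintains a bidirectional pairing: the backing (OS) thread records the thread it backs via a weak pointer, and guarantees the backed thread points back at it. A minimal sketch of that invariant, assuming a toy `Thread` type (this is not lldb's real class):

```cpp
#include <cassert>
#include <memory>

// Toy model of the backed/backing thread pairing added above.
struct Thread : std::enable_shared_from_this<Thread> {
  std::weak_ptr<Thread> backedThread;    // thread whose data I provide
  std::shared_ptr<Thread> backingThread; // thread that provides my data

  void setBackedThread(Thread &backed) {
    backedThread = backed.shared_from_this();
    // Mirror of the assert in the patch: an existing backing thread for
    // `backed` must already be us.
    assert(!backed.backingThread || backed.backingThread.get() == this);
    if (!backed.backingThread)
      backed.backingThread = shared_from_this(); // keep the link two-way
  }
};

int main() {
  auto os = std::make_shared<Thread>();      // backing (real OS) thread
  auto logical = std::make_shared<Thread>(); // backed (e.g. synthetic) thread
  os->setBackedThread(*logical);
  assert(logical->backingThread == os);
  assert(os->backedThread.lock() == logical);
}
```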
diff --git lldb/include/lldb/Target/ThreadList.h lldb/include/lldb/Target/ThreadList.h index f931bb83a8ce..bca01f5fe208 100644 --- lldb/include/lldb/Target/ThreadList.h +++ lldb/include/lldb/Target/ThreadList.h @@ -101,8 +101,6 @@ public: lldb::ThreadSP GetThreadSPForThreadPtr(Thread *thread_ptr); - lldb::ThreadSP GetBackingThread(const lldb::ThreadSP &real_thread); - bool ShouldStop(Event *event_ptr); Vote ShouldReportStop(Event *event_ptr); diff --git lldb/include/lldb/Utility/StreamTee.h lldb/include/lldb/Utility/StreamTee.h index 5695586171f3..571548e2e23f 100644 --- lldb/include/lldb/Utility/StreamTee.h +++ lldb/include/lldb/Utility/StreamTee.h @@ -85,7 +85,7 @@ public: return result; } - lldb::StreamSP GetStreamAtIndex(uint32_t idx) { + lldb::StreamSP GetStreamAtIndex(uint32_t idx) const { lldb::StreamSP stream_sp; std::lock_guard<std::recursive_mutex> guard(m_streams_mutex); if (idx < m_streams.size()) diff --git lldb/include/lldb/ValueObject/ValueObjectConstResultImpl.h lldb/include/lldb/ValueObject/ValueObjectConstResultImpl.h index dbd68160acb4..5509886a8965 100644 --- lldb/include/lldb/ValueObject/ValueObjectConstResultImpl.h +++ lldb/include/lldb/ValueObject/ValueObjectConstResultImpl.h @@ -66,6 +66,10 @@ public: private: ValueObject *m_impl_backend; + /// The memory address in the inferior process that this ValueObject tracks. + /// This address is used to request additional memory when the actual data + /// size exceeds the initial local buffer size, such as when a dynamic type + /// resolution results in a type larger than its statically determined type. lldb::addr_t m_live_address; AddressType m_live_address_type; lldb::ValueObjectSP m_address_of_backend; diff --git lldb/packages/Python/lldbsuite/test/gdbclientutils.py lldb/packages/Python/lldbsuite/test/gdbclientutils.py index 1784487323ad..4b782b3b470f 100644 --- lldb/packages/Python/lldbsuite/test/gdbclientutils.py +++ lldb/packages/Python/lldbsuite/test/gdbclientutils.py @@ -126,6 +126,9 @@ class MockGDBServerResponder: if packet[0] == "m": addr, length = [int(x, 16) for x in packet[1:].split(",")] return self.readMemory(addr, length) + if packet[0] == "x": + addr, length = [int(x, 16) for x in packet[1:].split(",")] + return self.x(addr, length) if packet[0] == "M": location, encoded_data = packet[1:].split(":") addr, length = [int(x, 16) for x in location.split(",")] @@ -267,6 +270,9 @@ class MockGDBServerResponder: def readMemory(self, addr, length): return "00" * length + def x(self, addr, length): + return "" + def writeMemory(self, addr, data_hex): return "OK" diff --git lldb/packages/Python/lldbsuite/test/make/Makefile.rules lldb/packages/Python/lldbsuite/test/make/Makefile.rules index 2da6ff226b61..06959f226066 100644 --- lldb/packages/Python/lldbsuite/test/make/Makefile.rules +++ lldb/packages/Python/lldbsuite/test/make/Makefile.rules @@ -207,6 +207,10 @@ else override ARCH := override ARCHFLAG := endif + ifeq "$(ARCH)" "riscv" + override ARCH := + override ARCHFLAG := + endif ifeq "$(findstring mips,$(ARCH))" "mips" override ARCHFLAG := - endif diff --git lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py index c29992ce9c78..043d82e2e2c7 100644 --- lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py +++ lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py @@ -612,6 +612,28 @@ class DebugCommunication(object): command_dict = {"command": "attach", "type": "request", "arguments": args_dict} return 
self.send_recv(command_dict) + def request_breakpointLocations( + self, file_path, line, end_line=None, column=None, end_column=None + ): + (dir, base) = os.path.split(file_path) + source_dict = {"name": base, "path": file_path} + args_dict = {} + args_dict["source"] = source_dict + if line is not None: + args_dict["line"] = line + if end_line is not None: + args_dict["endLine"] = end_line + if column is not None: + args_dict["column"] = column + if end_column is not None: + args_dict["endColumn"] = end_column + command_dict = { + "command": "breakpointLocations", + "type": "request", + "arguments": args_dict, + } + return self.send_recv(command_dict) + def request_configurationDone(self): command_dict = { "command": "configurationDone", @@ -851,6 +873,8 @@ class DebugCommunication(object): def request_stepIn(self, threadId, targetId, granularity="statement"): if self.exit_status is not None: raise ValueError("request_stepIn called after process exited") + if threadId is None: + threadId = self.get_thread_id() args_dict = { "threadId": threadId, "targetId": targetId, @@ -911,18 +935,14 @@ class DebugCommunication(object): breakpoint_data = data[i] bp = {"line": line} if breakpoint_data is not None: - if "condition" in breakpoint_data and breakpoint_data["condition"]: + if breakpoint_data.get("condition"): bp["condition"] = breakpoint_data["condition"] - if ( - "hitCondition" in breakpoint_data - and breakpoint_data["hitCondition"] - ): + if breakpoint_data.get("hitCondition"): bp["hitCondition"] = breakpoint_data["hitCondition"] - if ( - "logMessage" in breakpoint_data - and breakpoint_data["logMessage"] - ): + if breakpoint_data.get("logMessage"): bp["logMessage"] = breakpoint_data["logMessage"] + if breakpoint_data.get("column"): + bp["column"] = breakpoint_data["column"] breakpoints.append(bp) args_dict["breakpoints"] = breakpoints diff --git lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py index a25466f07fa5..34e9b96dbcc3 100644 --- lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py +++ lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py @@ -238,9 +238,10 @@ class DAPTestCaseBase(TestBase): def stepIn( self, threadId=None, targetId=None, waitForStop=True, granularity="statement" ): - self.dap_server.request_stepIn( + response = self.dap_server.request_stepIn( threadId=threadId, targetId=targetId, granularity=granularity ) + self.assertTrue(response["success"]) if waitForStop: return self.dap_server.wait_for_stopped() return None diff --git lldb/source/Breakpoint/BreakpointSite.cpp lldb/source/Breakpoint/BreakpointSite.cpp index 9700a57d3346..8b964c571146 100644 --- lldb/source/Breakpoint/BreakpointSite.cpp +++ lldb/source/Breakpoint/BreakpointSite.cpp @@ -12,6 +12,7 @@ #include "lldb/Breakpoint/Breakpoint.h" #include "lldb/Breakpoint/BreakpointLocation.h" +#include "lldb/Target/Thread.h" #include "lldb/Utility/Stream.h" using namespace lldb; @@ -161,6 +162,8 @@ BreakpointLocationSP BreakpointSite::GetConstituentAtIndex(size_t index) { bool BreakpointSite::ValidForThisThread(Thread &thread) { std::lock_guard<std::recursive_mutex> guard(m_constituents_mutex); + if (ThreadSP backed_thread = thread.GetBackedThread()) + return m_constituents.ValidForThisThread(*backed_thread); return m_constituents.ValidForThisThread(thread); } diff --git lldb/source/Breakpoint/WatchpointList.cpp lldb/source/Breakpoint/WatchpointList.cpp index f7564483e6f1..57369b76c03a 100644 
--- lldb/source/Breakpoint/WatchpointList.cpp +++ lldb/source/Breakpoint/WatchpointList.cpp @@ -236,7 +236,7 @@ void WatchpointList::RemoveAll(bool notify) { wp_collection::iterator pos, end = m_watchpoints.end(); for (pos = m_watchpoints.begin(); pos != end; ++pos) { if ((*pos)->GetTarget().EventTypeHasListeners( - Target::eBroadcastBitBreakpointChanged)) { + Target::eBroadcastBitWatchpointChanged)) { auto data_sp = std::make_shared<Watchpoint::WatchpointEventData>( eWatchpointEventTypeRemoved, *pos); (*pos)->GetTarget().BroadcastEvent( diff --git lldb/source/Core/Module.cpp lldb/source/Core/Module.cpp index 9601c834d9b8..33668c5d20dd 100644 --- lldb/source/Core/Module.cpp +++ lldb/source/Core/Module.cpp @@ -919,9 +919,8 @@ void Module::FindFunctions(const RegularExpression ®ex, const SymbolContext &sc = sc_list[i]; if (sc.block) continue; - file_addr_to_index[sc.function->GetAddressRange() - .GetBaseAddress() - .GetFileAddress()] = i; + file_addr_to_index[sc.function->GetAddress().GetFileAddress()] = + i; } FileAddrToIndexMap::const_iterator end = file_addr_to_index.end(); diff --git lldb/source/Expression/Materializer.cpp lldb/source/Expression/Materializer.cpp index 8cd050f9fdb7..13a72a9921e1 100644 --- lldb/source/Expression/Materializer.cpp +++ lldb/source/Expression/Materializer.cpp @@ -1187,6 +1187,9 @@ public: private: CompilerType m_type; + /// This is used both to control whether this result entity can (and should) + /// track the value in inferior memory, as well as to control whether LLDB + /// needs to allocate memory for the variable during materialization. bool m_is_program_reference; bool m_keep_in_memory; diff --git lldb/source/Interpreter/CommandAlias.cpp lldb/source/Interpreter/CommandAlias.cpp index c5971b52f837..b45fcca358a5 100644 --- lldb/source/Interpreter/CommandAlias.cpp +++ lldb/source/Interpreter/CommandAlias.cpp @@ -10,6 +10,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormatAdapters.h" #include "lldb/Interpreter/CommandInterpreter.h" #include "lldb/Interpreter/CommandObject.h" @@ -20,20 +21,17 @@ using namespace lldb; using namespace lldb_private; -static bool ProcessAliasOptionsArgs(lldb::CommandObjectSP &cmd_obj_sp, - llvm::StringRef options_args, - OptionArgVectorSP &option_arg_vector_sp) { - bool success = true; +static llvm::Error +ProcessAliasOptionsArgs(lldb::CommandObjectSP &cmd_obj_sp, + llvm::StringRef options_args, + OptionArgVectorSP &option_arg_vector_sp) { OptionArgVector *option_arg_vector = option_arg_vector_sp.get(); if (options_args.size() < 1) - return true; + return llvm::Error::success(); Args args(options_args); std::string options_string(options_args); - // TODO: Find a way to propagate errors in this CommandReturnObject up the - // stack. - CommandReturnObject result(false); // Check to see if the command being aliased can take any command options. 
Options *options = cmd_obj_sp->GetOptions(); if (options) { @@ -45,34 +43,30 @@ static bool ProcessAliasOptionsArgs(lldb::CommandObjectSP &cmd_obj_sp, llvm::Expected<Args> args_or = options->ParseAlias(args, option_arg_vector, options_string); - if (!args_or) { - result.AppendError(toString(args_or.takeError())); - result.AppendError("Unable to create requested alias.\n"); - return false; - } + if (!args_or) + return llvm::createStringError( + llvm::formatv("unable to create alias: {0}", + llvm::fmt_consume(args_or.takeError()))); args = std::move(*args_or); - options->VerifyPartialOptions(result); - if (!result.Succeeded() && - result.GetStatus() != lldb::eReturnStatusStarted) { - result.AppendError("Unable to create requested alias.\n"); - return false; - } + if (llvm::Error error = options->VerifyPartialOptions()) + return error; } if (!options_string.empty()) { - if (cmd_obj_sp->WantsRawCommandString()) - option_arg_vector->emplace_back(CommandInterpreter::g_argument, - -1, options_string); - else { + if (cmd_obj_sp->WantsRawCommandString()) { + option_arg_vector->emplace_back(CommandInterpreter::g_argument, -1, + options_string); + } else { for (auto &entry : args.entries()) { if (!entry.ref().empty()) - option_arg_vector->emplace_back(std::string(CommandInterpreter::g_argument), -1, - std::string(entry.ref())); + option_arg_vector->emplace_back( + std::string(CommandInterpreter::g_argument), -1, + std::string(entry.ref())); } } } - return success; + return llvm::Error::success(); } CommandAlias::CommandAlias(CommandInterpreter &interpreter, @@ -85,10 +79,15 @@ CommandAlias::CommandAlias(CommandInterpreter &interpreter, m_option_args_sp(new OptionArgVector), m_is_dashdash_alias(eLazyBoolCalculate), m_did_set_help(false), m_did_set_help_long(false) { - if (ProcessAliasOptionsArgs(cmd_sp, options_args, m_option_args_sp)) { + if (llvm::Error error = + ProcessAliasOptionsArgs(cmd_sp, options_args, m_option_args_sp)) { + // FIXME: Find a way to percolate this error up. + LLDB_LOG_ERROR(GetLog(LLDBLog::Host), std::move(error), + "ProcessAliasOptionsArgs failed: {0}"); + } else { m_underlying_command_sp = cmd_sp; for (int i = 0; - auto cmd_entry = m_underlying_command_sp->GetArgumentEntryAtIndex(i); + auto *cmd_entry = m_underlying_command_sp->GetArgumentEntryAtIndex(i); i++) { m_arguments.push_back(*cmd_entry); } diff --git lldb/source/Interpreter/CommandInterpreter.cpp lldb/source/Interpreter/CommandInterpreter.cpp index 764dcfd1903b..4869b811f99e 100644 --- lldb/source/Interpreter/CommandInterpreter.cpp +++ lldb/source/Interpreter/CommandInterpreter.cpp @@ -113,7 +113,6 @@ const char *CommandInterpreter::g_no_argument = "<no-argument>"; const char *CommandInterpreter::g_need_argument = "<need-argument>"; const char *CommandInterpreter::g_argument = "<argument>"; - #define LLDB_PROPERTIES_interpreter #include "InterpreterProperties.inc" @@ -285,8 +284,6 @@ bool CommandInterpreter::GetRequireCommandOverwrite() const { void CommandInterpreter::Initialize() { LLDB_SCOPED_TIMER(); - CommandReturnObject result(m_debugger.GetUseColor()); - LoadCommandDictionary(); // An alias arguments vector to reuse - reset it before use... 
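The `ProcessAliasOptionsArgs` rewrite above is an instance of the standard LLVM migration from `bool` plus out-parameter reporting to `llvm::Error`. A minimal hedged sketch of the shape of that idiom (function names invented):

```cpp
#include "llvm/Support/Error.h"

// Hedged sketch of the pattern the alias code now follows: report failure
// through llvm::Error instead of a bool plus CommandReturnObject.
static llvm::Error processAliasArgs(bool parseSucceeded) {
  if (!parseSucceeded)
    return llvm::createStringError(llvm::inconvertibleErrorCode(),
                                   "unable to create alias");
  return llvm::Error::success();
}

void buildAlias() {
  if (llvm::Error err = processAliasArgs(/*parseSucceeded=*/false)) {
    // The constructor in the patch logs because it cannot return the error;
    // consumeError stands in for LLDB_LOG_ERROR in this sketch.
    llvm::consumeError(std::move(err));
  }
}
```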
@@ -447,16 +444,17 @@ void CommandInterpreter::Initialize() { CommandAlias *parray_alias = AddAlias("parray", cmd_obj_sp, "--element-count %1 --"); if (parray_alias) { - parray_alias->SetHelp - ("parray <COUNT> <EXPRESSION> -- lldb will evaluate EXPRESSION " - "to get a typed-pointer-to-an-array in memory, and will display " - "COUNT elements of that type from the array."); - parray_alias->SetHelpLong(""); - } - CommandAlias *poarray_alias = AddAlias("poarray", cmd_obj_sp, - "--object-description --element-count %1 --"); + parray_alias->SetHelp( + "parray <COUNT> <EXPRESSION> -- lldb will evaluate EXPRESSION " + "to get a typed-pointer-to-an-array in memory, and will display " + "COUNT elements of that type from the array."); + parray_alias->SetHelpLong(""); + } + CommandAlias *poarray_alias = AddAlias( + "poarray", cmd_obj_sp, "--object-description --element-count %1 --"); if (poarray_alias) { - poarray_alias->SetHelp("poarray <COUNT> <EXPRESSION> -- lldb will " + poarray_alias->SetHelp( + "poarray <COUNT> <EXPRESSION> -- lldb will " "evaluate EXPRESSION to get the address of an array of COUNT " "objects in memory, and will call po on them."); poarray_alias->SetHelpLong(""); @@ -536,9 +534,7 @@ void CommandInterpreter::Initialize() { } } -void CommandInterpreter::Clear() { - m_command_io_handler_sp.reset(); -} +void CommandInterpreter::Clear() { m_command_io_handler_sp.reset(); } const char *CommandInterpreter::ProcessEmbeddedScriptCommands(const char *arg) { // This function has not yet been implemented. @@ -851,9 +847,12 @@ void CommandInterpreter::LoadCommandDictionary() { // now "bt 3" is the preferred form, in line with gdb. if (bt_regex_cmd_up->AddRegexCommand("^([[:digit:]]+)[[:space:]]*$", "thread backtrace -c %1") && - bt_regex_cmd_up->AddRegexCommand("^(-[^[:space:]].*)$", "thread backtrace %1") && - bt_regex_cmd_up->AddRegexCommand("^all[[:space:]]*$", "thread backtrace all") && - bt_regex_cmd_up->AddRegexCommand("^[[:space:]]*$", "thread backtrace")) { + bt_regex_cmd_up->AddRegexCommand("^(-[^[:space:]].*)$", + "thread backtrace %1") && + bt_regex_cmd_up->AddRegexCommand("^all[[:space:]]*$", + "thread backtrace all") && + bt_regex_cmd_up->AddRegexCommand("^[[:space:]]*$", + "thread backtrace")) { CommandObjectSP command_sp(bt_regex_cmd_up.release()); m_command_dict[std::string(command_sp->GetCommandName())] = command_sp; } @@ -954,13 +953,14 @@ int CommandInterpreter::GetCommandNamesMatchingPartialString( return matches.GetSize(); } -CommandObjectMultiword *CommandInterpreter::VerifyUserMultiwordCmdPath( - Args &path, bool leaf_is_command, Status &result) { +CommandObjectMultiword * +CommandInterpreter::VerifyUserMultiwordCmdPath(Args &path, bool leaf_is_command, + Status &result) { result.Clear(); auto get_multi_or_report_error = [&result](CommandObjectSP cmd_sp, - const char *name) -> CommandObjectMultiword * { + const char *name) -> CommandObjectMultiword * { if (!cmd_sp) { result = Status::FromErrorStringWithFormat( "Path component: '%s' not found", name); @@ -1265,8 +1265,8 @@ CommandInterpreter::GetCommandObject(llvm::StringRef cmd_str, // Try to find a match among commands and aliases. Allowing inexact matches, // but preferring exact matches.
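For illustration, the bt regex aliases reformatted above expand user input as follows (behavior unchanged by the reflow):

(lldb) bt 3     # matches ^([[:digit:]]+)[[:space:]]*$ -> thread backtrace -c 3
(lldb) bt -c 3  # matches ^(-[^[:space:]].*)$          -> thread backtrace -c 3
(lldb) bt all   # matches ^all[[:space:]]*$            -> thread backtrace all
(lldb) bt       # matches ^[[:space:]]*$               -> thread backtrace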
return GetCommandSP(cmd_str, /*include_aliases=*/true, /*exact=*/false, - matches, descriptions) - .get(); + matches, descriptions) + .get(); } CommandObject *CommandInterpreter::GetUserCommandObject( @@ -1299,8 +1299,8 @@ CommandObject *CommandInterpreter::GetUserCommandObject( StringList tmp_list; StringList *matches_ptr = matches ? matches : &tmp_list; AddNamesMatchingPartialString(GetUserCommands(), cmd_str, *matches_ptr); - AddNamesMatchingPartialString(GetUserMultiwordCommands(), - cmd_str, *matches_ptr); + AddNamesMatchingPartialString(GetUserMultiwordCommands(), cmd_str, + *matches_ptr); return {}; } @@ -1798,8 +1798,7 @@ Status CommandInterpreter::PreprocessCommand(std::string &command) { return error; } -Status -CommandInterpreter::PreprocessToken(std::string &expr_str) { +Status CommandInterpreter::PreprocessToken(std::string &expr_str) { Status error; ExecutionContext exe_ctx(GetExecutionContext()); @@ -1819,9 +1818,8 @@ CommandInterpreter::PreprocessToken(std::string &expr_str) { options.SetTryAllThreads(true); options.SetTimeout(std::nullopt); - ExpressionResults expr_result = - target.EvaluateExpression(expr_str.c_str(), exe_ctx.GetFramePtr(), - expr_result_valobj_sp, options); + ExpressionResults expr_result = target.EvaluateExpression( + expr_str.c_str(), exe_ctx.GetFramePtr(), expr_result_valobj_sp, options); if (expr_result == eExpressionCompleted) { Scalar scalar; @@ -1890,7 +1888,7 @@ bool CommandInterpreter::HandleCommand(const char *command_line, Log *log = GetLog(LLDBLog::Commands); llvm::PrettyStackTraceFormat stack_trace("HandleCommand(command = \"%s\")", - command_line); + command_line); LLDB_LOGF(log, "Processing command: %s", command_line); LLDB_SCOPED_TIMERF("Processing command: %s.", command_line); @@ -2011,7 +2009,8 @@ bool CommandInterpreter::HandleCommand(const char *command_line, // has the command expanded to the full name. For example, if the input was // "br s -n main", command_string is now "breakpoint set -n main". if (log) { - llvm::StringRef command_name = cmd_obj ? cmd_obj->GetCommandName() : "<not found>"; + llvm::StringRef command_name = + cmd_obj ? 
cmd_obj->GetCommandName() : "<not found>"; LLDB_LOGF(log, "HandleCommand, cmd_obj : '%s'", command_name.str().c_str()); LLDB_LOGF(log, "HandleCommand, (revised) command_string: '%s'", command_string.c_str()); @@ -2216,11 +2215,15 @@ CommandInterpreter::GetAlias(llvm::StringRef alias_name) const { return nullptr; } -bool CommandInterpreter::HasCommands() const { return (!m_command_dict.empty()); } +bool CommandInterpreter::HasCommands() const { + return (!m_command_dict.empty()); +} bool CommandInterpreter::HasAliases() const { return (!m_alias_dict.empty()); } -bool CommandInterpreter::HasUserCommands() const { return (!m_user_dict.empty()); } +bool CommandInterpreter::HasUserCommands() const { + return (!m_user_dict.empty()); +} bool CommandInterpreter::HasUserMultiwordCommands() const { return (!m_user_mw_dict.empty()); @@ -2578,20 +2581,18 @@ bool CommandInterpreter::DidProcessStopAbnormally() const { return false; } -void -CommandInterpreter::HandleCommands(const StringList &commands, - const ExecutionContext &override_context, - const CommandInterpreterRunOptions &options, - CommandReturnObject &result) { +void CommandInterpreter::HandleCommands( + const StringList &commands, const ExecutionContext &override_context, + const CommandInterpreterRunOptions &options, CommandReturnObject &result) { OverrideExecutionContext(override_context); HandleCommands(commands, options, result); RestoreExecutionContext(); } -void CommandInterpreter::HandleCommands(const StringList &commands, - const CommandInterpreterRunOptions &options, - CommandReturnObject &result) { +void CommandInterpreter::HandleCommands( + const StringList &commands, const CommandInterpreterRunOptions &options, + CommandReturnObject &result) { size_t num_lines = commands.GetSize(); // If we are going to continue past a "continue" then we need to run the @@ -2728,8 +2729,9 @@ void CommandInterpreter::HandleCommandsFromFile( RestoreExecutionContext(); } -void CommandInterpreter::HandleCommandsFromFile(FileSpec &cmd_file, - const CommandInterpreterRunOptions &options, CommandReturnObject &result) { +void CommandInterpreter::HandleCommandsFromFile( + FileSpec &cmd_file, const CommandInterpreterRunOptions &options, + CommandReturnObject &result) { if (!FileSystem::Instance().Exists(cmd_file)) { result.AppendErrorWithFormat( "Error reading commands from file %s - file not found.\n", @@ -3134,9 +3136,9 @@ bool CommandInterpreter::EchoCommandNonInteractive( void CommandInterpreter::IOHandlerInputComplete(IOHandler &io_handler, std::string &line) { - // If we were interrupted, bail out... - if (WasInterrupted()) - return; + // If we were interrupted, bail out... 
+ if (WasInterrupted()) + return; const bool is_interactive = io_handler.GetIsInteractive(); const bool allow_repeats = diff --git lldb/source/Interpreter/CommandObject.cpp lldb/source/Interpreter/CommandObject.cpp index 6b044a28eb37..7008253e32bc 100644 --- lldb/source/Interpreter/CommandObject.cpp +++ lldb/source/Interpreter/CommandObject.cpp @@ -120,17 +120,24 @@ bool CommandObject::ParseOptions(Args &args, CommandReturnObject &result) { if (args_or) { args = std::move(*args_or); error = options->NotifyOptionParsingFinished(&exe_ctx); - } else + } else { error = Status::FromError(args_or.takeError()); + } - if (error.Success()) { - if (options->VerifyOptions(result)) - return true; - } else { + if (error.Fail()) { result.SetError(error.takeError()); + result.SetStatus(eReturnStatusFailed); + return false; } - result.SetStatus(eReturnStatusFailed); - return false; + + if (llvm::Error error = options->VerifyOptions()) { + result.SetError(std::move(error)); + result.SetStatus(eReturnStatusFailed); + return false; + } + + result.SetStatus(eReturnStatusSuccessFinishNoResult); + return true; } return true; } @@ -278,7 +285,6 @@ void CommandObject::HandleCompletion(CompletionRequest &request) { } else { // Can we do anything generic with the options? Options *cur_options = GetOptions(); - CommandReturnObject result(m_interpreter.GetDebugger().GetUseColor()); OptionElementVector opt_element_vector; if (cur_options != nullptr) { diff --git lldb/source/Interpreter/CommandReturnObject.cpp lldb/source/Interpreter/CommandReturnObject.cpp index b99b2bc7b36c..0a2948e8e6ca 100644 --- lldb/source/Interpreter/CommandReturnObject.cpp +++ lldb/source/Interpreter/CommandReturnObject.cpp @@ -147,7 +147,8 @@ void CommandReturnObject::SetError(llvm::Error error) { } } -std::string CommandReturnObject::GetInlineDiagnosticString(unsigned indent) { +std::string +CommandReturnObject::GetInlineDiagnosticString(unsigned indent) const { StreamString diag_stream(m_colors); RenderDiagnosticDetails(diag_stream, indent, true, m_diagnostics); // Duplex the diagnostics to the secondary stream (but not inlined). @@ -157,7 +158,7 @@ std::string CommandReturnObject::GetInlineDiagnosticString(unsigned indent) { return diag_stream.GetString().str(); } -std::string CommandReturnObject::GetErrorString(bool with_diagnostics) { +std::string CommandReturnObject::GetErrorString(bool with_diagnostics) const { StreamString stream(m_colors); if (with_diagnostics) RenderDiagnosticDetails(stream, std::nullopt, false, m_diagnostics); diff --git lldb/source/Interpreter/Options.cpp lldb/source/Interpreter/Options.cpp index 893a3b71604b..fdadba62987d 100644 --- lldb/source/Interpreter/Options.cpp +++ lldb/source/Interpreter/Options.cpp @@ -138,46 +138,6 @@ void Options::OptionsSetUnion(const OptionSet &set_a, const OptionSet &set_b, } } -bool Options::VerifyOptions(CommandReturnObject &result) { - bool options_are_valid = false; - - int num_levels = GetRequiredOptions().size(); - if (num_levels) { - for (int i = 0; i < num_levels && !options_are_valid; ++i) { - // This is the correct set of options if: 1). m_seen_options contains - // all of m_required_options[i] (i.e. all the required options at this - // level are a subset of m_seen_options); AND 2). { m_seen_options - - // m_required_options[i] is a subset of m_options_options[i] (i.e. all - // the rest of m_seen_options are in the set of optional options at this - // level. 
- - // Check to see if all of m_required_options[i] are a subset of - // m_seen_options - if (IsASubset(GetRequiredOptions()[i], m_seen_options)) { - // Construct the set difference: remaining_options = {m_seen_options} - - // {m_required_options[i]} - OptionSet remaining_options; - OptionsSetDiff(m_seen_options, GetRequiredOptions()[i], - remaining_options); - // Check to see if remaining_options is a subset of - // m_optional_options[i] - if (IsASubset(remaining_options, GetOptionalOptions()[i])) - options_are_valid = true; - } - } - } else { - options_are_valid = true; - } - - if (options_are_valid) { - result.SetStatus(eReturnStatusSuccessFinishNoResult); - } else { - result.AppendError("invalid combination of options for the given command"); - } - - return options_are_valid; -} - // This is called in the Options constructor, though we could call it lazily if // that ends up being a performance problem. @@ -590,13 +550,50 @@ void Options::GenerateOptionUsage(Stream &strm, CommandObject &cmd, strm.SetIndentLevel(save_indent_level); } +llvm::Error Options::VerifyOptions() { + bool options_are_valid = false; + + int num_levels = GetRequiredOptions().size(); + if (num_levels) { + for (int i = 0; i < num_levels && !options_are_valid; ++i) { + // This is the correct set of options if: 1). m_seen_options contains + // all of m_required_options[i] (i.e. all the required options at this + // level are a subset of m_seen_options); AND 2). {m_seen_options} - + // {m_required_options[i]} is a subset of m_optional_options[i] (i.e. all + // the rest of m_seen_options are in the set of optional options at this + // level). + + // Check to see if all of m_required_options[i] are a subset of + // m_seen_options + if (IsASubset(GetRequiredOptions()[i], m_seen_options)) { + // Construct the set difference: remaining_options = {m_seen_options} - + // {m_required_options[i]} + OptionSet remaining_options; + OptionsSetDiff(m_seen_options, GetRequiredOptions()[i], + remaining_options); + // Check to see if remaining_options is a subset of + // m_optional_options[i] + if (IsASubset(remaining_options, GetOptionalOptions()[i])) + options_are_valid = true; + } + } + } else { + options_are_valid = true; + } + + if (!options_are_valid) + return llvm::createStringError( + "invalid combination of options for the given command"); + + return llvm::Error::success(); +} + // This function is called when we have been given a potentially incomplete set // of options, such as when an alias has been defined (more options might be // added at the time the alias is invoked). We need to verify that the // options in the set m_seen_options are all part of a set that may be used // together, but m_seen_options may be missing some of the "required" options.
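Restated, VerifyOptions above accepts the seen options when, for some level i, every option in required[i] was seen and everything seen beyond required[i] appears in optional[i]. A self-contained sketch of that check, with std::set standing in for LLDB's OptionSet and its IsASubset/OptionsSetDiff helpers (names here are illustrative):

#include <algorithm>
#include <iterator>
#include <set>

using OptionSet = std::set<int>;

// True when a is a subset of b (std::set iterates in sorted order).
static bool IsSubsetOf(const OptionSet &a, const OptionSet &b) {
  return std::includes(b.begin(), b.end(), a.begin(), a.end());
}

static bool LevelAccepts(const OptionSet &seen, const OptionSet &required,
                         const OptionSet &optional) {
  if (!IsSubsetOf(required, seen))
    return false; // a required option is missing
  OptionSet rest; // rest = seen - required
  std::set_difference(seen.begin(), seen.end(), required.begin(),
                      required.end(), std::inserter(rest, rest.begin()));
  return IsSubsetOf(rest, optional); // leftovers must all be optional
}

VerifyPartialOptions, whose body follows, loosens this as the comment above describes: the seen options must all belong to some level's required-or-optional set, but required options may still be missing.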
- -bool Options::VerifyPartialOptions(CommandReturnObject &result) { +llvm::Error Options::VerifyPartialOptions() { bool options_are_valid = false; int num_levels = GetRequiredOptions().size(); @@ -613,7 +610,11 @@ bool Options::VerifyPartialOptions(CommandReturnObject &result) { } } - return options_are_valid; + if (!options_are_valid) + return llvm::createStringError( + "invalid combination of options for the given command"); + + return llvm::Error::success(); } bool Options::HandleOptionCompletion(CompletionRequest &request, diff --git lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp index 2bf574e97768..4b045d12ad49 100644 --- lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp +++ lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp @@ -641,7 +641,7 @@ static void LoadLibCxxFormatters(lldb::TypeCategoryImplSP cpp_category_sp) { .SetSkipPointers(false) .SetSkipReferences(false) .SetDontShowChildren(true) - .SetDontShowValue(true) + .SetDontShowValue(false) .SetShowMembersOneLiner(false) .SetHideItemNames(false); @@ -1204,7 +1204,7 @@ static void LoadLibStdcppFormatters(lldb::TypeCategoryImplSP cpp_category_sp) { .SetSkipPointers(false) .SetSkipReferences(false) .SetDontShowChildren(true) - .SetDontShowValue(true) + .SetDontShowValue(false) .SetShowMembersOneLiner(false) .SetHideItemNames(false); diff --git lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp index 6d0ccdbbe4a7..2aa8fdba7063 100644 --- lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp +++ lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp @@ -430,12 +430,6 @@ size_t lldb_private::formatters::LibcxxUniquePtrSyntheticFrontEnd:: bool lldb_private::formatters::LibcxxContainerSummaryProvider( ValueObject &valobj, Stream &stream, const TypeSummaryOptions &options) { - if (valobj.IsPointerType()) { - uint64_t value = valobj.GetValueAsUnsigned(0); - if (!value) - return false; - stream.Printf("0x%016" PRIx64 " ", value); - } return FormatEntity::FormatStringRef("size=${svar%#}", stream, nullptr, nullptr, nullptr, &valobj, false, false); } diff --git lldb/source/Plugins/Process/Utility/ThreadMemory.h lldb/source/Plugins/Process/Utility/ThreadMemory.h index d124f5780ea9..1e309671e85c 100644 --- lldb/source/Plugins/Process/Utility/ThreadMemory.h +++ lldb/source/Plugins/Process/Utility/ThreadMemory.h @@ -72,12 +72,17 @@ public: void ClearStackFrames() override; - void ClearBackingThread() override { m_backing_thread_sp.reset(); } + void ClearBackingThread() override { + if (m_backing_thread_sp) + m_backing_thread_sp->ClearBackedThread(); + m_backing_thread_sp.reset(); + } bool SetBackingThread(const lldb::ThreadSP &thread_sp) override { // printf ("Thread 0x%llx is being backed by thread 0x%llx\n", GetID(), // thread_sp->GetID()); m_backing_thread_sp = thread_sp; + thread_sp->SetBackedThread(*this); return (bool)thread_sp; } diff --git lldb/source/Plugins/Process/Windows/Common/NativeProcessWindows.cpp lldb/source/Plugins/Process/Windows/Common/NativeProcessWindows.cpp index 9c330ff11867..79dd46ba319d 100644 --- lldb/source/Plugins/Process/Windows/Common/NativeProcessWindows.cpp +++ lldb/source/Plugins/Process/Windows/Common/NativeProcessWindows.cpp @@ -292,7 +292,8 @@ NativeProcessWindows::GetAuxvData() const { llvm::Expected<llvm::ArrayRef<uint8_t>> NativeProcessWindows::GetSoftwareBreakpointTrapOpcode(size_t size_hint) { - static const uint8_t g_aarch64_opcode[] = {0x00, 0x00, 0x3e, 0xd4}; // brk 
#0xf000 + static const uint8_t g_aarch64_opcode[] = {0x00, 0x00, 0x3e, + 0xd4}; // brk #0xf000 static const uint8_t g_thumb_opcode[] = {0xfe, 0xde}; // udf #0xfe switch (GetArchitecture().GetMachine()) { @@ -309,9 +310,9 @@ NativeProcessWindows::GetSoftwareBreakpointTrapOpcode(size_t size_hint) { } size_t NativeProcessWindows::GetSoftwareBreakpointPCOffset() { - // Windows always reports an incremented PC after a breakpoint is hit, - // even on ARM. - return cantFail(GetSoftwareBreakpointTrapOpcode(0)).size(); + // Windows always reports an incremented PC after a breakpoint is hit, + // even on ARM. + return cantFail(GetSoftwareBreakpointTrapOpcode(0)).size(); } bool NativeProcessWindows::FindSoftwareBreakpoint(lldb::addr_t addr) { @@ -463,6 +464,7 @@ NativeProcessWindows::OnDebugException(bool first_chance, switch (record.GetExceptionCode()) { case DWORD(STATUS_SINGLE_STEP): case STATUS_WX86_SINGLE_STEP: { +#ifndef __aarch64__ uint32_t wp_id = LLDB_INVALID_INDEX32; if (NativeThreadWindows *thread = GetThreadByID(record.GetThreadID())) { NativeRegisterContextWindows ®_ctx = thread->GetRegisterContext(); @@ -483,6 +485,7 @@ NativeProcessWindows::OnDebugException(bool first_chance, } } if (wp_id == LLDB_INVALID_INDEX32) +#endif StopThread(record.GetThreadID(), StopReason::eStopReasonTrace); SetState(eStateStopped, true); @@ -492,23 +495,50 @@ NativeProcessWindows::OnDebugException(bool first_chance, } case DWORD(STATUS_BREAKPOINT): case STATUS_WX86_BREAKPOINT: - if (FindSoftwareBreakpoint(record.GetExceptionAddress())) { - LLDB_LOG(log, "Hit non-loader breakpoint at address {0:x}.", - record.GetExceptionAddress()); - StopThread(record.GetThreadID(), StopReason::eStopReasonBreakpoint); + if (NativeThreadWindows *stop_thread = + GetThreadByID(record.GetThreadID())) { + auto ®_ctx = stop_thread->GetRegisterContext(); + const auto exception_addr = record.GetExceptionAddress(); + const auto thread_id = record.GetThreadID(); - if (NativeThreadWindows *stop_thread = - GetThreadByID(record.GetThreadID())) { - auto ®ister_context = stop_thread->GetRegisterContext(); - uint32_t breakpoint_size = GetSoftwareBreakpointPCOffset(); + if (FindSoftwareBreakpoint(exception_addr)) { + LLDB_LOG(log, "Hit non-loader breakpoint at address {0:x}.", + exception_addr); // The current PC is AFTER the BP opcode, on all architectures. - uint64_t pc = register_context.GetPC() - breakpoint_size; - register_context.SetPC(pc); + reg_ctx.SetPC(reg_ctx.GetPC() - GetSoftwareBreakpointPCOffset()); + StopThread(thread_id, StopReason::eStopReasonBreakpoint); + SetState(eStateStopped, true); + return ExceptionResult::MaskException; + } else { + // This block of code will only be entered in case of a hardware + // watchpoint or breakpoint hit on AArch64. However, we only handle + // hardware watchpoints below as breakpoints are not yet supported. + const std::vector<ULONG_PTR> &args = record.GetExceptionArguments(); + // Check that the ExceptionInformation array of EXCEPTION_RECORD + // contains at least two elements: the first is a read-write flag + // indicating the type of data access operation (read or write) while + // the second contains the virtual address of the accessed data. 
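The two ExceptionInformation slots being described are part of the Win32 EXCEPTION_RECORD itself; LLDB's wrapper surfaces them through GetExceptionArguments(). A hedged sketch against the raw structure (field names are from winnt.h, the helper name is illustrative):

#include <windows.h>

// For a debug data-access exception, slot 0 is a read/write flag and slot 1
// is the virtual address of the accessed data, as noted above.
bool GetAccessedDataAddress(const EXCEPTION_RECORD &record,
                            ULONG_PTR &addr_out) {
  if (record.NumberParameters < 2)
    return false;
  // record.ExceptionInformation[0]: 0 = read, 1 = write
  addr_out = record.ExceptionInformation[1];
  return true;
}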
+ if (args.size() >= 2) { + uint32_t hw_id = LLDB_INVALID_INDEX32; + Status error = reg_ctx.GetWatchpointHitIndex(hw_id, args[1]); + if (error.Fail()) + LLDB_LOG(log, + "received error while checking for watchpoint hits, pid = " + "{0}, error = {1}", + thread_id, error); + + if (hw_id != LLDB_INVALID_INDEX32) { + std::string desc = + formatv("{0} {1} {2}", reg_ctx.GetWatchpointAddress(hw_id), + hw_id, exception_addr) + .str(); + StopThread(thread_id, StopReason::eStopReasonWatchpoint, desc); + SetState(eStateStopped, true); + return ExceptionResult::MaskException; + } + } } - - SetState(eStateStopped, true); - return ExceptionResult::MaskException; } if (!initial_stop) { diff --git lldb/source/Plugins/Process/Windows/Common/NativeRegisterContextWindows.cpp lldb/source/Plugins/Process/Windows/Common/NativeRegisterContextWindows.cpp index 9128363eaa57..effe6df36b57 100644 --- lldb/source/Plugins/Process/Windows/Common/NativeRegisterContextWindows.cpp +++ lldb/source/Plugins/Process/Windows/Common/NativeRegisterContextWindows.cpp @@ -18,10 +18,6 @@ using namespace lldb; using namespace lldb_private; -NativeRegisterContextWindows::NativeRegisterContextWindows( - NativeThreadProtocol &thread, RegisterInfoInterface *reg_info_interface_p) - : NativeRegisterContextRegisterInfo(thread, reg_info_interface_p) {} - lldb::thread_t NativeRegisterContextWindows::GetThreadHandle() const { auto wthread = static_cast<NativeThreadWindows *>(&m_thread); return wthread->GetHostThread().GetNativeThread().GetSystemHandle(); diff --git lldb/source/Plugins/Process/Windows/Common/NativeRegisterContextWindows.h lldb/source/Plugins/Process/Windows/Common/NativeRegisterContextWindows.h index 841b8547f3e9..2b71f639d562 100644 --- lldb/source/Plugins/Process/Windows/Common/NativeRegisterContextWindows.h +++ lldb/source/Plugins/Process/Windows/Common/NativeRegisterContextWindows.h @@ -17,16 +17,17 @@ namespace lldb_private { class NativeThreadWindows; -class NativeRegisterContextWindows : public NativeRegisterContextRegisterInfo { +class NativeRegisterContextWindows + : public virtual NativeRegisterContextRegisterInfo { public: - NativeRegisterContextWindows( - NativeThreadProtocol &native_thread, - RegisterInfoInterface *reg_info_interface_p); - static std::unique_ptr<NativeRegisterContextWindows> CreateHostNativeRegisterContextWindows(const ArchSpec &target_arch, NativeThreadProtocol &native_thread); + // MSVC compiler deletes the default constructor due to virtual inheritance. + // Explicitly defining it ensures the class remains constructible. 
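The mixin's explicit default constructor appears just below; for context, here is a compilable sketch of the diamond this patch assembles (names abridged, constructor arguments simplified). With a shared virtual base, the most-derived class initializes that base exactly once, which is why the per-architecture constructors in the following hunks now call NativeRegisterContextRegisterInfo directly.

// Illustrative shape only; the real classes take thread/register-info args.
struct RegInfo {                        // ~ NativeRegisterContextRegisterInfo
  RegInfo() = default;
  explicit RegInfo(int tag) : tag(tag) {}
  int tag = 0;
};
struct WindowsSide : virtual RegInfo {  // ~ NativeRegisterContextWindows
  WindowsSide() {}                      // explicit, per the MSVC note above
};
struct DBRegSide : virtual RegInfo {};  // ~ NativeRegisterContextDBReg_arm64
struct Arm64 : WindowsSide, DBRegSide { // ~ NativeRegisterContextWindows_arm64
  Arm64() : RegInfo(64) {}              // the virtual base is initialized here
};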
+ NativeRegisterContextWindows() {} + protected: lldb::thread_t GetThreadHandle() const; }; diff --git lldb/source/Plugins/Process/Windows/Common/NativeRegisterContextWindows_WoW64.cpp lldb/source/Plugins/Process/Windows/Common/NativeRegisterContextWindows_WoW64.cpp index a9642d1c5e48..069c327ee2f9 100644 --- lldb/source/Plugins/Process/Windows/Common/NativeRegisterContextWindows_WoW64.cpp +++ lldb/source/Plugins/Process/Windows/Common/NativeRegisterContextWindows_WoW64.cpp @@ -88,8 +88,8 @@ static Status SetWoW64ThreadContextHelper(lldb::thread_t thread_handle, NativeRegisterContextWindows_WoW64::NativeRegisterContextWindows_WoW64( const ArchSpec &target_arch, NativeThreadProtocol &native_thread) - : NativeRegisterContextWindows(native_thread, - CreateRegisterInfoInterface(target_arch)) {} + : NativeRegisterContextRegisterInfo( + native_thread, CreateRegisterInfoInterface(target_arch)) {} bool NativeRegisterContextWindows_WoW64::IsGPR(uint32_t reg_index) const { return (reg_index >= k_first_gpr_i386 && reg_index < k_first_alias_i386); diff --git lldb/source/Plugins/Process/Windows/Common/NativeRegisterContextWindows_arm.cpp lldb/source/Plugins/Process/Windows/Common/NativeRegisterContextWindows_arm.cpp index 4209fdf3c710..fd8a0c05c1b2 100644 --- lldb/source/Plugins/Process/Windows/Common/NativeRegisterContextWindows_arm.cpp +++ lldb/source/Plugins/Process/Windows/Common/NativeRegisterContextWindows_arm.cpp @@ -128,8 +128,8 @@ NativeRegisterContextWindows::CreateHostNativeRegisterContextWindows( NativeRegisterContextWindows_arm::NativeRegisterContextWindows_arm( const ArchSpec &target_arch, NativeThreadProtocol &native_thread) - : NativeRegisterContextWindows(native_thread, - CreateRegisterInfoInterface(target_arch)) {} + : NativeRegisterContextRegisterInfo( + native_thread, CreateRegisterInfoInterface(target_arch)) {} bool NativeRegisterContextWindows_arm::IsGPR(uint32_t reg_index) const { return (reg_index >= k_first_gpr_arm && reg_index <= k_last_gpr_arm); diff --git lldb/source/Plugins/Process/Windows/Common/NativeRegisterContextWindows_arm64.cpp lldb/source/Plugins/Process/Windows/Common/NativeRegisterContextWindows_arm64.cpp index 080a9140e36a..d065ad6957f7 100644 --- lldb/source/Plugins/Process/Windows/Common/NativeRegisterContextWindows_arm64.cpp +++ lldb/source/Plugins/Process/Windows/Common/NativeRegisterContextWindows_arm64.cpp @@ -10,7 +10,6 @@ #include "NativeRegisterContextWindows_arm64.h" #include "NativeThreadWindows.h" -#include "Plugins/Process/Utility/RegisterInfoPOSIX_arm64.h" #include "ProcessWindowsLog.h" #include "lldb/Host/HostInfo.h" #include "lldb/Host/HostThread.h" @@ -143,8 +142,13 @@ NativeRegisterContextWindows::CreateHostNativeRegisterContextWindows( NativeRegisterContextWindows_arm64::NativeRegisterContextWindows_arm64( const ArchSpec &target_arch, NativeThreadProtocol &native_thread) - : NativeRegisterContextWindows(native_thread, - CreateRegisterInfoInterface(target_arch)) {} + : NativeRegisterContextRegisterInfo( + native_thread, CreateRegisterInfoInterface(target_arch)) { + // Currently, there is no API to query the maximum supported hardware + // breakpoints and watchpoints on Windows. The values set below are based + // on tests conducted on Windows 11 with Snapdragon Elite X hardware. 
+ m_max_hwp_supported = 1; +} bool NativeRegisterContextWindows_arm64::IsGPR(uint32_t reg_index) const { return (reg_index >= k_first_gpr_arm64 && reg_index <= k_last_gpr_arm64); @@ -709,48 +713,37 @@ Status NativeRegisterContextWindows_arm64::WriteAllRegisterValues( return SetThreadContextHelper(GetThreadHandle(), &tls_context); } -Status NativeRegisterContextWindows_arm64::IsWatchpointHit(uint32_t wp_index, - bool &is_hit) { - return Status::FromErrorString("unimplemented"); -} - -Status NativeRegisterContextWindows_arm64::GetWatchpointHitIndex( - uint32_t &wp_index, lldb::addr_t trap_addr) { - return Status::FromErrorString("unimplemented"); -} - -Status NativeRegisterContextWindows_arm64::IsWatchpointVacant(uint32_t wp_index, - bool &is_vacant) { - return Status::FromErrorString("unimplemented"); -} - -Status NativeRegisterContextWindows_arm64::SetHardwareWatchpointWithIndex( - lldb::addr_t addr, size_t size, uint32_t watch_flags, uint32_t wp_index) { - return Status::FromErrorString("unimplemented"); -} +llvm::Error NativeRegisterContextWindows_arm64::ReadHardwareDebugInfo() { + ::CONTEXT tls_context; + Status error = GetThreadContextHelper(GetThreadHandle(), &tls_context, + CONTEXT_DEBUG_REGISTERS); + if (error.Fail()) + return error.ToError(); -bool NativeRegisterContextWindows_arm64::ClearHardwareWatchpoint( - uint32_t wp_index) { - return false; -} + for (uint32_t i = 0; i < m_max_hwp_supported; i++) { + m_hwp_regs[i].address = tls_context.Wvr[i]; + m_hwp_regs[i].control = tls_context.Wcr[i]; + } -Status NativeRegisterContextWindows_arm64::ClearAllHardwareWatchpoints() { - return Status::FromErrorString("unimplemented"); + return llvm::Error::success(); } -uint32_t NativeRegisterContextWindows_arm64::SetHardwareWatchpoint( - lldb::addr_t addr, size_t size, uint32_t watch_flags) { - return LLDB_INVALID_INDEX32; -} +llvm::Error +NativeRegisterContextWindows_arm64::WriteHardwareDebugRegs(DREGType hwbType) { + ::CONTEXT tls_context; + Status error = GetThreadContextHelper(GetThreadHandle(), &tls_context, + CONTEXT_DEBUG_REGISTERS); + if (error.Fail()) + return error.ToError(); -lldb::addr_t -NativeRegisterContextWindows_arm64::GetWatchpointAddress(uint32_t wp_index) { - return LLDB_INVALID_ADDRESS; -} + if (hwbType == eDREGTypeWATCH) { + for (uint32_t i = 0; i < m_max_hwp_supported; i++) { + tls_context.Wvr[i] = m_hwp_regs[i].address; + tls_context.Wcr[i] = m_hwp_regs[i].control; + } + } -uint32_t NativeRegisterContextWindows_arm64::NumSupportedHardwareWatchpoints() { - // Not implemented - return 0; + return SetThreadContextHelper(GetThreadHandle(), &tls_context).ToError(); } #endif // defined(__aarch64__) || defined(_M_ARM64) diff --git lldb/source/Plugins/Process/Windows/Common/NativeRegisterContextWindows_arm64.h lldb/source/Plugins/Process/Windows/Common/NativeRegisterContextWindows_arm64.h index 88afc1e7b18a..e73a6af4cbf8 100644 --- lldb/source/Plugins/Process/Windows/Common/NativeRegisterContextWindows_arm64.h +++ lldb/source/Plugins/Process/Windows/Common/NativeRegisterContextWindows_arm64.h @@ -10,6 +10,8 @@ #ifndef liblldb_NativeRegisterContextWindows_arm64_h_ #define liblldb_NativeRegisterContextWindows_arm64_h_ +#include "Plugins/Process/Utility/NativeRegisterContextDBReg_arm64.h" +#include "Plugins/Process/Utility/RegisterInfoPOSIX_arm64.h" #include "Plugins/Process/Utility/lldb-arm64-register-enums.h" #include "NativeRegisterContextWindows.h" @@ -18,7 +20,9 @@ namespace lldb_private { class NativeThreadWindows; -class NativeRegisterContextWindows_arm64 : public 
NativeRegisterContextWindows { +class NativeRegisterContextWindows_arm64 + : public NativeRegisterContextWindows, + public NativeRegisterContextDBReg_arm64 { public: NativeRegisterContextWindows_arm64(const ArchSpec &target_arch, NativeThreadProtocol &native_thread); @@ -37,28 +41,6 @@ public: Status WriteAllRegisterValues(const lldb::DataBufferSP &data_sp) override; - Status IsWatchpointHit(uint32_t wp_index, bool &is_hit) override; - - Status GetWatchpointHitIndex(uint32_t &wp_index, - lldb::addr_t trap_addr) override; - - Status IsWatchpointVacant(uint32_t wp_index, bool &is_vacant) override; - - bool ClearHardwareWatchpoint(uint32_t wp_index) override; - - Status ClearAllHardwareWatchpoints() override; - - Status SetHardwareWatchpointWithIndex(lldb::addr_t addr, size_t size, - uint32_t watch_flags, - uint32_t wp_index); - - uint32_t SetHardwareWatchpoint(lldb::addr_t addr, size_t size, - uint32_t watch_flags) override; - - lldb::addr_t GetWatchpointAddress(uint32_t wp_index) override; - - uint32_t NumSupportedHardwareWatchpoints() override; - protected: Status GPRRead(const uint32_t reg, RegisterValue ®_value); @@ -72,6 +54,10 @@ private: bool IsGPR(uint32_t reg_index) const; bool IsFPR(uint32_t reg_index) const; + + llvm::Error ReadHardwareDebugInfo() override; + + llvm::Error WriteHardwareDebugRegs(DREGType hwbType) override; }; } // namespace lldb_private diff --git lldb/source/Plugins/Process/Windows/Common/NativeRegisterContextWindows_i386.cpp lldb/source/Plugins/Process/Windows/Common/NativeRegisterContextWindows_i386.cpp index 53df98667939..7c13759df462 100644 --- lldb/source/Plugins/Process/Windows/Common/NativeRegisterContextWindows_i386.cpp +++ lldb/source/Plugins/Process/Windows/Common/NativeRegisterContextWindows_i386.cpp @@ -92,8 +92,8 @@ NativeRegisterContextWindows::CreateHostNativeRegisterContextWindows( NativeRegisterContextWindows_i386::NativeRegisterContextWindows_i386( const ArchSpec &target_arch, NativeThreadProtocol &native_thread) - : NativeRegisterContextWindows(native_thread, - CreateRegisterInfoInterface(target_arch)) {} + : NativeRegisterContextRegisterInfo( + native_thread, CreateRegisterInfoInterface(target_arch)) {} bool NativeRegisterContextWindows_i386::IsGPR(uint32_t reg_index) const { return (reg_index < k_first_alias_i386); diff --git lldb/source/Plugins/Process/Windows/Common/NativeRegisterContextWindows_x86_64.cpp lldb/source/Plugins/Process/Windows/Common/NativeRegisterContextWindows_x86_64.cpp index 4c59273b845a..c54c31e48e4e 100644 --- lldb/source/Plugins/Process/Windows/Common/NativeRegisterContextWindows_x86_64.cpp +++ lldb/source/Plugins/Process/Windows/Common/NativeRegisterContextWindows_x86_64.cpp @@ -110,8 +110,8 @@ NativeRegisterContextWindows::CreateHostNativeRegisterContextWindows( NativeRegisterContextWindows_x86_64::NativeRegisterContextWindows_x86_64( const ArchSpec &target_arch, NativeThreadProtocol &native_thread) - : NativeRegisterContextWindows(native_thread, - CreateRegisterInfoInterface(target_arch)) {} + : NativeRegisterContextRegisterInfo( + native_thread, CreateRegisterInfoInterface(target_arch)) {} bool NativeRegisterContextWindows_x86_64::IsGPR(uint32_t reg_index) const { return (reg_index >= k_first_gpr_x86_64 && reg_index < k_first_alias_x86_64); diff --git lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp index b3f1c6f05295..581dd8f8e0b6 100644 --- lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp +++ 
lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp @@ -275,7 +275,6 @@ void GDBRemoteCommunicationClient::ResetDiscoverableSettings(bool did_exec) { m_supports_vCont_s = eLazyBoolCalculate; m_supports_vCont_S = eLazyBoolCalculate; m_supports_p = eLazyBoolCalculate; - m_supports_x = eLazyBoolCalculate; m_supports_QSaveRegisterState = eLazyBoolCalculate; m_qHostInfo_is_valid = eLazyBoolCalculate; m_curr_pid_is_valid = eLazyBoolCalculate; @@ -295,6 +294,7 @@ void GDBRemoteCommunicationClient::ResetDiscoverableSettings(bool did_exec) { m_supports_qXfer_siginfo_read = eLazyBoolCalculate; m_supports_augmented_libraries_svr4_read = eLazyBoolCalculate; m_uses_native_signals = eLazyBoolCalculate; + m_x_packet_state.reset(); m_supports_qProcessInfoPID = true; m_supports_qfProcessInfo = true; m_supports_qUserName = true; @@ -348,6 +348,7 @@ void GDBRemoteCommunicationClient::GetRemoteQSupported() { m_supports_memory_tagging = eLazyBoolNo; m_supports_qSaveCore = eLazyBoolNo; m_uses_native_signals = eLazyBoolNo; + m_x_packet_state.reset(); m_max_packet_size = UINT64_MAX; // It's supposed to always be there, but if // not, we assume no limit @@ -401,6 +402,8 @@ void GDBRemoteCommunicationClient::GetRemoteQSupported() { m_supports_qSaveCore = eLazyBoolYes; else if (x == "native-signals+") m_uses_native_signals = eLazyBoolYes; + else if (x == "binary-upload+") + m_x_packet_state = xPacketState::Prefixed; // Look for a list of compressions in the features list e.g. // qXfer:features:read+;PacketSize=20000;qEcho+;SupportedCompressions=zlib- // deflate,lzma @@ -715,19 +718,20 @@ Status GDBRemoteCommunicationClient::WriteMemoryTags( return status; } -bool GDBRemoteCommunicationClient::GetxPacketSupported() { - if (m_supports_x == eLazyBoolCalculate) { +GDBRemoteCommunicationClient::xPacketState +GDBRemoteCommunicationClient::GetxPacketState() { + if (!m_x_packet_state) + GetRemoteQSupported(); + if (!m_x_packet_state) { StringExtractorGDBRemote response; - m_supports_x = eLazyBoolNo; - char packet[256]; - snprintf(packet, sizeof(packet), "x0,0"); - if (SendPacketAndWaitForResponse(packet, response) == + m_x_packet_state = xPacketState::Unimplemented; + if (SendPacketAndWaitForResponse("x0,0", response) == PacketResult::Success) { if (response.IsOKResponse()) - m_supports_x = eLazyBoolYes; + m_x_packet_state = xPacketState::Bare; } } - return m_supports_x; + return *m_x_packet_state; } lldb::pid_t GDBRemoteCommunicationClient::GetCurrentProcessID(bool allow_lazy) { diff --git lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h index 898d176abc34..1118a76d7211 100644 --- lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h +++ lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h @@ -218,7 +218,14 @@ public: bool GetpPacketSupported(lldb::tid_t tid); - bool GetxPacketSupported(); + enum class xPacketState { + Unimplemented, + Prefixed, // Successful responses start with a 'b' character. This is the + // style used by GDB. + Bare, // No prefix, packets start with the memory being read. This is + // LLDB's original style.
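Concretely, for a request x1000,6 against memory holding "foobar", a Prefixed (GDB-style) stub replies bfoobar while a Bare (old LLDB-style) stub replies foobar. A sketch of the payload extraction this implies; the helper is hypothetical, not the actual ProcessGDBRemote code:

#include "llvm/ADT/StringRef.h"

// Returns false if a required 'b' prefix is missing; otherwise strips it
// (when expected) and hands back the raw payload bytes.
bool ExtractXPayload(llvm::StringRef reply, bool prefixed,
                     llvm::StringRef &payload) {
  if (prefixed && !reply.consume_front("b"))
    return false; // malformed reply under the GDB convention
  payload = reply;
  return true;
}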
+ }; + xPacketState GetxPacketState(); bool GetVAttachOrWaitSupported(); @@ -541,7 +548,6 @@ protected: LazyBool m_attach_or_wait_reply = eLazyBoolCalculate; LazyBool m_prepare_for_reg_writing_reply = eLazyBoolCalculate; LazyBool m_supports_p = eLazyBoolCalculate; - LazyBool m_supports_x = eLazyBoolCalculate; LazyBool m_avoid_g_packets = eLazyBoolCalculate; LazyBool m_supports_QSaveRegisterState = eLazyBoolCalculate; LazyBool m_supports_qXfer_auxv_read = eLazyBoolCalculate; @@ -561,6 +567,7 @@ protected: LazyBool m_supports_memory_tagging = eLazyBoolCalculate; LazyBool m_supports_qSaveCore = eLazyBoolCalculate; LazyBool m_uses_native_signals = eLazyBoolCalculate; + std::optional<xPacketState> m_x_packet_state; bool m_supports_qProcessInfoPID : 1, m_supports_qfProcessInfo : 1, m_supports_qUserName : 1, m_supports_qGroupName : 1, diff --git lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp index 538c86801400..07b4470d0619 100644 --- lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp +++ lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp @@ -1726,10 +1726,6 @@ ThreadSP ProcessGDBRemote::SetThreadStopInfo( if (!thread_sp->StopInfoIsUpToDate()) { thread_sp->SetStopInfo(StopInfoSP()); - // If there's a memory thread backed by this thread, we need to use it to - // calculate StopInfo. - if (ThreadSP memory_thread_sp = m_thread_list.GetBackingThread(thread_sp)) - thread_sp = memory_thread_sp; if (exc_type != 0) { // For thread plan async interrupt, creating stop info on the @@ -2609,11 +2605,15 @@ void ProcessGDBRemote::WillPublicStop() { // Process Memory size_t ProcessGDBRemote::DoReadMemory(addr_t addr, void *buf, size_t size, Status &error) { + using xPacketState = GDBRemoteCommunicationClient::xPacketState; + GetMaxMemorySize(); - bool binary_memory_read = m_gdb_comm.GetxPacketSupported(); + xPacketState x_state = m_gdb_comm.GetxPacketState(); + // M and m packets take 2 bytes for 1 byte of memory - size_t max_memory_size = - binary_memory_read ? m_max_memory_size : m_max_memory_size / 2; + size_t max_memory_size = x_state != xPacketState::Unimplemented + ? m_max_memory_size + : m_max_memory_size / 2; if (size > max_memory_size) { // Keep memory read sizes down to a sane limit. This function will be // called multiple times in order to complete the task by @@ -2624,8 +2624,8 @@ size_t ProcessGDBRemote::DoReadMemory(addr_t addr, void *buf, size_t size, char packet[64]; int packet_len; packet_len = ::snprintf(packet, sizeof(packet), "%c%" PRIx64 ",%" PRIx64, - binary_memory_read ? 'x' : 'm', (uint64_t)addr, - (uint64_t)size); + x_state != xPacketState::Unimplemented ? 'x' : 'm', + (uint64_t)addr, (uint64_t)size); assert(packet_len + 1 < (int)sizeof(packet)); UNUSED_IF_ASSERT_DISABLED(packet_len); StringExtractorGDBRemote response; @@ -2634,19 +2634,25 @@ size_t ProcessGDBRemote::DoReadMemory(addr_t addr, void *buf, size_t size, GDBRemoteCommunication::PacketResult::Success) { if (response.IsNormalResponse()) { error.Clear(); - if (binary_memory_read) { + if (x_state != xPacketState::Unimplemented) { // The lower level GDBRemoteCommunication packet receive layer has // already de-quoted any 0x7d character escaping that was present in // the packet - size_t data_received_size = response.GetBytesLeft(); - if (data_received_size > size) { - // Don't write past the end of BUF if the remote debug server gave us - // too much data for some reason. 
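The halving above is pure encoding arithmetic: an 'm' reply spends two ASCII hex characters per byte of memory, while an 'x' reply carries binary data (modulo 0x7d escaping, handled a layer below). With a 4096-character budget, 'x' can move 4096 bytes per packet but 'm' only 2048. A one-line restatement with a hypothetical helper name:

// Usable memory bytes per reply, given the packet payload budget in chars.
size_t UsableReadSize(size_t max_packet_chars, bool binary_x_supported) {
  return binary_x_supported ? max_packet_chars : max_packet_chars / 2;
}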
- data_received_size = size; + llvm::StringRef data_received = response.GetStringRef(); + if (x_state == xPacketState::Prefixed && + !data_received.consume_front("b")) { + error = Status::FromErrorStringWithFormatv( + "unexpected response to GDB server memory read packet '{0}': " + "'{1}'", + packet, data_received); + return 0; } - memcpy(buf, response.GetStringRef().data(), data_received_size); - return data_received_size; + // Don't write past the end of BUF if the remote debug server gave us + // too much data for some reason. + size_t memcpy_size = std::min(size, data_received.size()); + memcpy(buf, data_received.data(), memcpy_size); + return memcpy_size; } else { return response.GetHexBytes( llvm::MutableArrayRef<uint8_t>((uint8_t *)buf, size), '\xdd'); diff --git lldb/source/Plugins/StructuredData/DarwinLog/StructuredDataDarwinLog.cpp lldb/source/Plugins/StructuredData/DarwinLog/StructuredDataDarwinLog.cpp index 4ca8bd2f9085..82f18c5fe37a 100644 --- lldb/source/Plugins/StructuredData/DarwinLog/StructuredDataDarwinLog.cpp +++ lldb/source/Plugins/StructuredData/DarwinLog/StructuredDataDarwinLog.cpp @@ -975,8 +975,6 @@ EnableOptionsSP ParseAutoEnableOptions(Status &error, Debugger &debugger) { EnableOptionsSP options_sp(new EnableOptions()); options_sp->NotifyOptionParsingStarting(&exe_ctx); - CommandReturnObject result(debugger.GetUseColor()); - // Parse the arguments. auto options_property_sp = debugger.GetPropertyValue(nullptr, @@ -1013,8 +1011,13 @@ EnableOptionsSP ParseAutoEnableOptions(Status &error, Debugger &debugger) { return EnableOptionsSP(); } - if (!options_sp->VerifyOptions(result)) + if (llvm::Error error = options_sp->VerifyOptions()) { + LLDB_LOG_ERROR( + log, std::move(error), + "Parsing plugin.structured-data.darwin-log.auto-enable-options value " + "failed: {0}"); return EnableOptionsSP(); + } // We successfully parsed and validated the options. 
return options_sp; diff --git lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp index 6602dd763ba6..05625925d7ca 100644 --- lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp +++ lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp @@ -1019,16 +1019,7 @@ TypeSP DWARFASTParserClang::ParseEnum(const SymbolContext &sc, // Declaration DIE is inserted into the type map in ParseTypeFromDWARF } - - if (TypeSystemClang::StartTagDeclarationDefinition(clang_type)) { - if (def_die.HasChildren()) { - bool is_signed = false; - enumerator_clang_type.IsIntegerType(is_signed); - ParseChildEnumerators(clang_type, is_signed, - type_sp->GetByteSize(nullptr).value_or(0), def_die); - } - TypeSystemClang::CompleteTagDeclarationDefinition(clang_type); - } else { + if (!CompleteEnumType(def_die, type_sp.get(), clang_type)) { dwarf->GetObjectFile()->GetModule()->ReportError( "DWARF DIE at {0:x16} named \"{1}\" was not able to start its " "definition.\nPlease file a bug and attach the file at the " @@ -2221,13 +2212,14 @@ bool DWARFASTParserClang::CompleteRecordType(const DWARFDIE &die, bool DWARFASTParserClang::CompleteEnumType(const DWARFDIE &die, lldb_private::Type *type, const CompilerType &clang_type) { + assert(clang_type.IsEnumerationType()); + if (TypeSystemClang::StartTagDeclarationDefinition(clang_type)) { - if (die.HasChildren()) { - bool is_signed = false; - clang_type.IsIntegerType(is_signed); - ParseChildEnumerators(clang_type, is_signed, + if (die.HasChildren()) + ParseChildEnumerators(clang_type, + clang_type.IsEnumerationIntegerTypeSigned(), type->GetByteSize(nullptr).value_or(0), die); - } + TypeSystemClang::CompleteTagDeclarationDefinition(clang_type); } return (bool)clang_type; @@ -2329,8 +2321,7 @@ size_t DWARFASTParserClang::ParseChildEnumerators( continue; const char *name = nullptr; - bool got_value = false; - int64_t enum_value = 0; + std::optional<uint64_t> enum_value; Declaration decl; for (size_t i = 0; i < attributes.Size(); ++i) { @@ -2339,7 +2330,6 @@ size_t DWARFASTParserClang::ParseChildEnumerators( if (attributes.ExtractFormValueAtIndex(i, form_value)) { switch (attr) { case DW_AT_const_value: - got_value = true; if (is_signed) enum_value = form_value.Signed(); else @@ -2368,9 +2358,9 @@ size_t DWARFASTParserClang::ParseChildEnumerators( } } - if (name && name[0] && got_value) { + if (name && name[0] && enum_value) { m_ast.AddEnumerationValueToEnumerationType( - clang_type, decl, name, enum_value, enumerator_byte_size * 8); + clang_type, decl, name, *enum_value, enumerator_byte_size * 8); ++enumerators_added; } } diff --git lldb/source/Plugins/SymbolFile/PDB/PDBASTParser.cpp lldb/source/Plugins/SymbolFile/PDB/PDBASTParser.cpp index 990bacd89bf3..c6dd72e22fb4 100644 --- lldb/source/Plugins/SymbolFile/PDB/PDBASTParser.cpp +++ lldb/source/Plugins/SymbolFile/PDB/PDBASTParser.cpp @@ -1155,7 +1155,7 @@ bool PDBASTParser::AddEnumValue(CompilerType enum_type, Variant v = enum_value.getValue(); std::string name = std::string(MSVCUndecoratedNameParser::DropScope(enum_value.getName())); - int64_t raw_value; + uint64_t raw_value; switch (v.Type) { case PDB_VariantType::Int8: raw_value = v.Value.Int8; diff --git lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp index cb246fde976c..1da8fbe0bcd6 100644 --- lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp +++ lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp @@ -8541,12 +8541,10 @@ 
clang::EnumConstantDecl *TypeSystemClang::AddEnumerationValueToEnumerationType( clang::EnumConstantDecl *TypeSystemClang::AddEnumerationValueToEnumerationType( const CompilerType &enum_type, const Declaration &decl, const char *name, - int64_t enum_value, uint32_t enum_value_bit_size) { - CompilerType underlying_type = GetEnumerationIntegerType(enum_type); - bool is_signed = false; - underlying_type.IsIntegerType(is_signed); - - llvm::APSInt value(enum_value_bit_size, !is_signed); + uint64_t enum_value, uint32_t enum_value_bit_size) { + assert(enum_type.IsEnumerationType()); + llvm::APSInt value(enum_value_bit_size, + !enum_type.IsEnumerationIntegerTypeSigned()); value = enum_value; return AddEnumerationValueToEnumerationType(enum_type, decl, name, value); diff --git lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h index 83f954270e30..e70ad4c2973a 100644 --- lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h +++ lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h @@ -1040,7 +1040,7 @@ public: // Modifying Enumeration types clang::EnumConstantDecl *AddEnumerationValueToEnumerationType( const CompilerType &enum_type, const Declaration &decl, const char *name, - int64_t enum_value, uint32_t enum_value_bit_size); + uint64_t enum_value, uint32_t enum_value_bit_size); clang::EnumConstantDecl *AddEnumerationValueToEnumerationType( const CompilerType &enum_type, const Declaration &decl, const char *name, const llvm::APSInt &value); diff --git lldb/source/Target/ThreadList.cpp lldb/source/Target/ThreadList.cpp index 6cbef330bf48..c0440d82fd1f 100644 --- lldb/source/Target/ThreadList.cpp +++ lldb/source/Target/ThreadList.cpp @@ -191,20 +191,6 @@ ThreadSP ThreadList::GetThreadSPForThreadPtr(Thread *thread_ptr) { return thread_sp; } -ThreadSP ThreadList::GetBackingThread(const ThreadSP &real_thread) { - std::lock_guard<std::recursive_mutex> guard(GetMutex()); - - ThreadSP thread_sp; - const uint32_t num_threads = m_threads.size(); - for (uint32_t idx = 0; idx < num_threads; ++idx) { - if (m_threads[idx]->GetBackingThread() == real_thread) { - thread_sp = m_threads[idx]; - break; - } - } - return thread_sp; -} - ThreadSP ThreadList::FindThreadByIndexID(uint32_t index_id, bool can_update) { std::lock_guard<std::recursive_mutex> guard(GetMutex()); diff --git lldb/test/API/commands/watchpoints/watchpoint_events/TestWatchpointEvents.py lldb/test/API/commands/watchpoints/watchpoint_events/TestWatchpointEvents.py index 726a9d93c29d..6e05cf06204a 100644 --- lldb/test/API/commands/watchpoints/watchpoint_events/TestWatchpointEvents.py +++ lldb/test/API/commands/watchpoints/watchpoint_events/TestWatchpointEvents.py @@ -82,27 +82,45 @@ class TestWatchpointEvents(TestBase): 'make sure watchpoint condition is "' + condition + '"', ) - def GetWatchpointEvent(self, event_type): - # We added a watchpoint so we should get a watchpoint added event. 
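The helper being rewritten here now drains one event per expected type, since deleting a watchpoint broadcasts eWatchpointEventTypeDisabled followed by eWatchpointEventTypeRemoved. A hedged C++ SB API sketch of the same loop the Python helper below implements (the function name and one-second timeout are illustrative):

#include <initializer_list>

#include "lldb/API/SBEvent.h"
#include "lldb/API/SBListener.h"
#include "lldb/API/SBWatchpoint.h"

// Wait for an exact sequence of watchpoint event types on the listener.
bool SawEventSequence(lldb::SBListener &listener,
                      std::initializer_list<lldb::WatchpointEventType> types) {
  for (lldb::WatchpointEventType expected : types) {
    lldb::SBEvent event;
    if (!listener.WaitForEvent(1, event) ||
        !lldb::SBWatchpoint::EventIsWatchpointEvent(event) ||
        lldb::SBWatchpoint::GetWatchpointEventTypeFromEvent(event) != expected)
      return false;
  }
  return true;
}

// e.g., after target.DeleteWatchpoint(id):
//   SawEventSequence(listener, {lldb::eWatchpointEventTypeDisabled,
//                               lldb::eWatchpointEventTypeRemoved});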
- event = lldb.SBEvent() - success = self.listener.WaitForEvent(1, event) - self.assertTrue(success, "Successfully got watchpoint event") - self.assertTrue( - lldb.SBWatchpoint.EventIsWatchpointEvent(event), - "Event is a watchpoint event.", + target.DeleteWatchpoint(local_watch.GetID()) + self.GetWatchpointEvent( + lldb.eWatchpointEventTypeDisabled, lldb.eWatchpointEventTypeRemoved ) - found_type = lldb.SBWatchpoint.GetWatchpointEventTypeFromEvent(event) - self.assertEqual( - found_type, - event_type, - "Event is not correct type, expected: %d, found: %d" - % (event_type, found_type), + + # Re-create it so that we can check DeleteAllWatchpoints + local_watch = local_var.Watch(True, False, True, error) + if not error.Success(): + self.fail( + "Failed to make watchpoint for local_var: %s" % (error.GetCString()) + ) + self.GetWatchpointEvent(lldb.eWatchpointEventTypeAdded) + target.DeleteAllWatchpoints() + self.GetWatchpointEvent( + lldb.eWatchpointEventTypeDisabled, lldb.eWatchpointEventTypeRemoved ) + + def GetWatchpointEvent(self, *event_types): + # We added a watchpoint so we should get a watchpoint added event. + event = lldb.SBEvent() + for event_type in event_types: + success = self.listener.WaitForEvent(1, event) + self.assertTrue(success, "Successfully got watchpoint event") + self.assertTrue( + lldb.SBWatchpoint.EventIsWatchpointEvent(event), + "Event is a watchpoint event.", + ) + found_type = lldb.SBWatchpoint.GetWatchpointEventTypeFromEvent(event) + self.assertEqual( + found_type, + event_type, + "Event is not correct type, expected: %d, found: %d" + % (event_type, found_type), + ) # There shouldn't be another event waiting around: found_event = self.listener.PeekAtNextEventForBroadcasterWithType( - self.target_bcast, lldb.SBTarget.eBroadcastBitBreakpointChanged, event + self.target_bcast, lldb.SBTarget.eBroadcastBitWatchpointChanged, event ) if found_event: - print("Found an event I didn't expect: ", event) + print("Found an event I didn't expect: ", event.GetType()) - self.assertTrue(not found_event, "Only one event per change.") + self.assertTrue(not found_event, f"Only expected {len(event_types)} events.") diff --git lldb/test/API/functionalities/abbreviation/TestAbbreviations.py lldb/test/API/functionalities/abbreviation/TestAbbreviations.py index a8cbffbb7ba4..cc767edaaa61 100644 --- lldb/test/API/functionalities/abbreviation/TestAbbreviations.py +++ lldb/test/API/functionalities/abbreviation/TestAbbreviations.py @@ -45,10 +45,12 @@ class AbbreviationsTestCase(TestBase): # Make sure an unabbreviated command is not mangled. command_interpreter.ResolveCommand( - "breakpoint set --name main --line 123", result + "breakpoint set --name main --ignore-count 123", result ) self.assertTrue(result.Succeeded()) - self.assertEqual("breakpoint set --name main --line 123", result.GetOutput()) + self.assertEqual( + "breakpoint set --name main --ignore-count 123", result.GetOutput() + ) # Create some aliases. self.runCmd("com a alias com al") @@ -72,10 +74,10 @@ class AbbreviationsTestCase(TestBase): "process launch -s -o /dev/tty0 -e /dev/tty0", result.GetOutput() ) - self.runCmd("alias xyzzy breakpoint set -n %1 -l %2") + self.runCmd("alias xyzzy breakpoint set -n %1 -i %2") command_interpreter.ResolveCommand("xyzzy main 123", result) self.assertTrue(result.Succeeded()) - self.assertEqual("breakpoint set -n main -l 123", result.GetOutput().strip()) + self.assertEqual("breakpoint set -n main -i 123", result.GetOutput().strip()) # And again, without enough parameters. 
command_interpreter.ResolveCommand("xyzzy main", result) @@ -92,6 +94,12 @@ class AbbreviationsTestCase(TestBase): self.assertTrue(result.Succeeded()) self.assertEqual("scripting run 1+1", result.GetOutput()) + # Name and line are incompatible options. + command_interpreter.HandleCommand( + "alias zzyx breakpoint set -n %1 -l %2", result + ) + self.assertFalse(result.Succeeded()) + # Prompt changing stuff should be tested, but this doesn't seem like the # right test to do it in. It has nothing to do with aliases or abbreviations. # self.runCmd("com sou ./change_prompt.lldb") diff --git lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/deque/TestDataFormatterLibcxxDeque.py lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/deque/TestDataFormatterLibcxxDeque.py index 3596b546be30..ece1e4f5f049 100644 --- lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/deque/TestDataFormatterLibcxxDeque.py +++ lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/deque/TestDataFormatterLibcxxDeque.py @@ -9,22 +9,27 @@ from lldbsuite.test import lldbutil class LibcxxDequeDataFormatterTestCase(TestBase): - def check_numbers(self, var_name): + def check_numbers(self, var_name, show_ptr=False): + patterns = [] + substrs = [ + "[0] = 1", + "[1] = 12", + "[2] = 123", + "[3] = 1234", + "[4] = 12345", + "[5] = 123456", + "[6] = 1234567", + "}", + ] + if show_ptr: + patterns = [var_name + " = 0x.* size=7"] + else: + substrs.insert(0, var_name + " = size=7") self.expect( "frame variable " + var_name, - substrs=[ - var_name + " = size=7", - "[0] = 1", - "[1] = 12", - "[2] = 123", - "[3] = 1234", - "[4] = 12345", - "[5] = 123456", - "[6] = 1234567", - "}", - ], + patterns=patterns, + substrs=substrs, ) - self.expect_expr( var_name, result_summary="size=7", @@ -75,7 +80,7 @@ class LibcxxDequeDataFormatterTestCase(TestBase): ) # The reference should display the same way as the value did - self.check_numbers("ref") + self.check_numbers("ref", True) # The pointer should just show the right number of elements: self.expect("frame variable ptr", substrs=["ptr =", " size=7"]) diff --git lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/span/TestDataFormatterLibcxxSpan.py lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/span/TestDataFormatterLibcxxSpan.py index d5de73ac14ca..4df4fa1acc8e 100644 --- lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/span/TestDataFormatterLibcxxSpan.py +++ lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/span/TestDataFormatterLibcxxSpan.py @@ -172,7 +172,4 @@ class LibcxxSpanDataFormatterTestCase(TestBase): # The pointer should just show the right number of elements: - ptrAddr = self.findVariable("ptr").GetValue() - self.expect_expr( - "ptr", result_type="std::span<int, 5> *", result_summary=f"{ptrAddr} size=5" - ) + self.expect("frame variable ptr", patterns=["ptr = 0x.*", " size=5"]) diff --git lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/variant/TestDataFormatterLibcxxVariant.py lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/variant/TestDataFormatterLibcxxVariant.py index 4154ad3c297f..47e07a5ce3f5 100644 --- lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/variant/TestDataFormatterLibcxxVariant.py +++ lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/variant/TestDataFormatterLibcxxVariant.py @@ -47,7 +47,7 @@ class
LibcxxVariantDataFormatterTestCase(TestBase): self.expect( "frame variable v1_ref", - substrs=["v1_ref = Active Type = int : {", "Value = 12", "}"], + patterns=["v1_ref = 0x.* Active Type = int : {", "Value = 12", "}"], ) self.expect( diff --git lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/vector/TestDataFormatterLibcxxVector.py lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/vector/TestDataFormatterLibcxxVector.py index a475c15d3da3..ca0f0e369311 100644 --- lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/vector/TestDataFormatterLibcxxVector.py +++ lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/vector/TestDataFormatterLibcxxVector.py @@ -10,22 +10,28 @@ from lldbsuite.test import lldbutil class LibcxxVectorDataFormatterTestCase(TestBase): - def check_numbers(self, var_name): + def check_numbers(self, var_name, show_ptr=False): + patterns = [] + substrs = [ + "[0] = 1", + "[1] = 12", + "[2] = 123", + "[3] = 1234", + "[4] = 12345", + "[5] = 123456", + "[6] = 1234567", + "}", + ] + if show_ptr: + patterns = [var_name + " = 0x.* size=7"] + else: + substrs.insert(0, var_name + " = size=7") + self.expect( "frame variable " + var_name, - substrs=[ - var_name + " = size=7", - "[0] = 1", - "[1] = 12", - "[2] = 123", - "[3] = 1234", - "[4] = 12345", - "[5] = 123456", - "[6] = 1234567", - "}", - ], + patterns=patterns, + substrs=substrs, ) - self.expect_expr( var_name, result_summary="size=7", @@ -174,7 +180,7 @@ class LibcxxVectorDataFormatterTestCase(TestBase): ) # The reference should display the same way as the value did - self.check_numbers("ref") + self.check_numbers("ref", True) # The pointer should just show the right number of elements: diff --git lldb/test/API/functionalities/data-formatter/data-formatter-stl/libstdcpp/variant/TestDataFormatterLibStdcxxVariant.py lldb/test/API/functionalities/data-formatter/data-formatter-stl/libstdcpp/variant/TestDataFormatterLibStdcxxVariant.py index ea4a53fcb409..394e221809f7 100644 --- lldb/test/API/functionalities/data-formatter/data-formatter-stl/libstdcpp/variant/TestDataFormatterLibStdcxxVariant.py +++ lldb/test/API/functionalities/data-formatter/data-formatter-stl/libstdcpp/variant/TestDataFormatterLibStdcxxVariant.py @@ -30,7 +30,7 @@ class LibStdcxxVariantDataFormatterTestCase(TestBase): for name in ["v1_ref", "v1_typedef_ref"]: self.expect( "frame variable " + name, - substrs=[name + " = Active Type = int : {", "Value = 12", "}"], + patterns=[name + " = 0x.* Active Type = int : {", "Value = 12", "}"], ) self.expect( diff --git lldb/test/API/functionalities/gdb_remote_client/TestReadMemory.py lldb/test/API/functionalities/gdb_remote_client/TestReadMemory.py new file mode 100644 index 000000000000..81dcb54aef5d --- /dev/null +++ lldb/test/API/functionalities/gdb_remote_client/TestReadMemory.py @@ -0,0 +1,55 @@ +import lldb +from lldbsuite.support import seven +from lldbsuite.test.lldbtest import * +from lldbsuite.test.decorators import * +from lldbsuite.test.gdbclientutils import * +from lldbsuite.test.lldbgdbclient import GDBRemoteTestBase + + +class TestReadMemory(GDBRemoteTestBase): + def test_x_with_prefix(self): + class MyResponder(MockGDBServerResponder): + def qSupported(self, client_features): + # binary-upload+ indicates we use the gdb style of x packets + return super().qSupported(client_features) + ";binary-upload+" + + def x(self, addr, length): + return "bfoobar" if addr == 0x1000 else "E01" + + self.server.responder = MyResponder()
target = self.dbg.CreateTargetWithFileAndTargetTriple("", "x86_64-pc-linux") + process = self.connect(target) + + error = lldb.SBError() + self.assertEqual(b"foobar", process.ReadMemory(0x1000, 10, error)) + + def test_x_bare(self): + class MyResponder(MockGDBServerResponder): + def x(self, addr, length): + # The OK response indicates we use the old lldb style. + if addr == 0 and length == 0: + return "OK" + return "foobar" if addr == 0x1000 else "E01" + + self.server.responder = MyResponder() + target = self.dbg.CreateTargetWithFileAndTargetTriple("", "x86_64-pc-linux") + process = self.connect(target) + + error = lldb.SBError() + self.assertEqual(b"foobar", process.ReadMemory(0x1000, 10, error)) + + def test_m_fallback(self): + class MyResponder(MockGDBServerResponder): + def x(self, addr, length): + # If `x` is unsupported, we should fall back to `m`. + return "" + + def readMemory(self, addr, length): + return seven.hexlify("foobar") if addr == 0x1000 else "E01" + + self.server.responder = MyResponder() + target = self.dbg.CreateTargetWithFileAndTargetTriple("", "x86_64-pc-linux") + process = self.connect(target) + + error = lldb.SBError() + self.assertEqual(b"foobar", process.ReadMemory(0x1000, 10, error)) diff --git lldb/test/API/tools/lldb-dap/breakpoint/Makefile lldb/test/API/tools/lldb-dap/breakpoint/Makefile index 7634f513e852..06438b3e6e31 100644 --- lldb/test/API/tools/lldb-dap/breakpoint/Makefile +++ lldb/test/API/tools/lldb-dap/breakpoint/Makefile @@ -16,4 +16,4 @@ main-copy.cpp: main.cpp # The following shared library will be used to test breakpoints under dynamic loading libother: other-copy.c "$(MAKE)" -f $(MAKEFILE_RULES) \ - DYLIB_ONLY=YES DYLIB_C_SOURCES=other-copy.c DYLIB_NAME=other + DYLIB_ONLY=YES DYLIB_C_SOURCES=other-copy.c DYLIB_NAME=other diff --git lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_breakpointLocations.py lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_breakpointLocations.py new file mode 100644 index 000000000000..1058157e2c66 --- /dev/null +++ lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_breakpointLocations.py @@ -0,0 +1,88 @@ +""" +Test lldb-dap breakpointLocations request +""" + + +import dap_server +import shutil +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil +import lldbdap_testcase +import os + + +class TestDAP_breakpointLocations(lldbdap_testcase.DAPTestCaseBase): + def setUp(self): + lldbdap_testcase.DAPTestCaseBase.setUp(self) + + self.main_basename = "main-copy.cpp" + self.main_path = os.path.realpath(self.getBuildArtifact(self.main_basename)) + + @skipIfWindows + def test_column_breakpoints(self): + """Test retrieving the available breakpoint locations.""" + program = self.getBuildArtifact("a.out") + self.build_and_launch(program, stopOnEntry=True) + loop_line = line_number(self.main_path, "// break loop") + self.dap_server.request_continue() + + # Ask for the breakpoint locations based only on the line number + response = self.dap_server.request_breakpointLocations( + self.main_path, loop_line + ) + self.assertTrue(response["success"]) + self.assertEqual( + response["body"]["breakpoints"], + [ + {"line": loop_line, "column": 9}, + {"line": loop_line, "column": 13}, + {"line": loop_line, "column": 20}, + {"line": loop_line, "column": 23}, + {"line": loop_line, "column": 25}, + {"line": loop_line, "column": 34}, + {"line": loop_line, "column": 37}, + {"line": loop_line, "column": 39}, + {"line": loop_line, "column": 51}, + ], + ) + + # Ask for the breakpoint 
locations for a column range + response = self.dap_server.request_breakpointLocations( + self.main_path, + loop_line, + column=24, + end_column=46, + ) + self.assertTrue(response["success"]) + self.assertEqual( + response["body"]["breakpoints"], + [ + {"line": loop_line, "column": 25}, + {"line": loop_line, "column": 34}, + {"line": loop_line, "column": 37}, + {"line": loop_line, "column": 39}, + ], + ) + + # Ask for the breakpoint locations for a range of line numbers + response = self.dap_server.request_breakpointLocations( + self.main_path, + line=loop_line, + end_line=loop_line + 2, + column=39, + ) + self.maxDiff = None + self.assertTrue(response["success"]) + # On some systems, there is an additional breakpoint available + # at line 41, column 3, i.e. at the end of the loop. To make this + # test more portable, only check that all expected breakpoints are + # presented, but also accept additional breakpoints. + expected_breakpoints = [ + {"column": 39, "line": 40}, + {"column": 51, "line": 40}, + {"column": 3, "line": 42}, + {"column": 18, "line": 42}, + ] + for bp in expected_breakpoints: + self.assertIn(bp, response["body"]["breakpoints"]) diff --git lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setBreakpoints.py lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setBreakpoints.py index 123fea79c5cd..c62feda64a12 100644 --- lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setBreakpoints.py +++ lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setBreakpoints.py @@ -125,20 +125,18 @@ class TestDAP_setBreakpoints(lldbdap_testcase.DAPTestCaseBase): # Set 3 breakpoints and verify that they got set correctly response = self.dap_server.request_setBreakpoints(self.main_path, lines) line_to_id = {} - if response: - breakpoints = response["body"]["breakpoints"] - self.assertEqual( - len(breakpoints), - len(lines), - "expect %u source breakpoints" % (len(lines)), - ) - for breakpoint, index in zip(breakpoints, range(len(lines))): - line = breakpoint["line"] - self.assertTrue(line, lines[index]) - # Store the "id" of the breakpoint that was set for later - line_to_id[line] = breakpoint["id"] - self.assertIn(line, lines, "line expected in lines array") - self.assertTrue(breakpoint["verified"], "expect breakpoint verified") + breakpoints = response["body"]["breakpoints"] + self.assertEqual( + len(breakpoints), + len(lines), + "expect %u source breakpoints" % (len(lines)), + ) + for index, breakpoint in enumerate(breakpoints): + line = breakpoint["line"] + self.assertEqual(line, lines[index]) + # Store the "id" of the breakpoint that was set for later + line_to_id[line] = breakpoint["id"] + self.assertTrue(breakpoint["verified"], "expect breakpoint verified") # There is no breakpoint delete packet, clients just send another # setBreakpoints packet with the same source file with fewer lines. @@ -151,75 +149,66 @@ class TestDAP_setBreakpoints(lldbdap_testcase.DAPTestCaseBase): # Set 2 breakpoints and verify that the previous breakpoints that were # set above are still set. 
response = self.dap_server.request_setBreakpoints(self.main_path, lines) - if response: - breakpoints = response["body"]["breakpoints"] + breakpoints = response["body"]["breakpoints"] + self.assertEqual( + len(breakpoints), + len(lines), + "expect %u source breakpoints" % (len(lines)), + ) + for index, breakpoint in enumerate(breakpoints): + line = breakpoint["line"] + self.assertEqual(line, lines[index]) + # Verify the same breakpoints are still set within LLDB by + # making sure the breakpoint ID didn't change self.assertEqual( - len(breakpoints), - len(lines), - "expect %u source breakpoints" % (len(lines)), + line_to_id[line], + breakpoint["id"], + "verify previous breakpoints stayed the same", ) - for breakpoint, index in zip(breakpoints, range(len(lines))): - line = breakpoint["line"] - self.assertTrue(line, lines[index]) - # Verify the same breakpoints are still set within LLDB by - # making sure the breakpoint ID didn't change - self.assertEqual( - line_to_id[line], - breakpoint["id"], - "verify previous breakpoints stayed the same", - ) - self.assertIn(line, lines, "line expected in lines array") - self.assertTrue( - breakpoint["verified"], "expect breakpoint still verified" - ) + self.assertTrue(breakpoint["verified"], "expect breakpoint still verified") # Now get the full list of breakpoints set in the target and verify # we have only 2 breakpoints set. The response above could have told # us about 2 breakpoints, but we want to make sure we don't have the # third one still set in the target response = self.dap_server.request_testGetTargetBreakpoints() - if response: - breakpoints = response["body"]["breakpoints"] + breakpoints = response["body"]["breakpoints"] + self.assertEqual( + len(breakpoints), + len(lines), + "expect %u source breakpoints" % (len(lines)), + ) + for breakpoint in breakpoints: + line = breakpoint["line"] + # Verify the same breakpoints are still set within LLDB by + # making sure the breakpoint ID didn't change self.assertEqual( - len(breakpoints), - len(lines), - "expect %u source breakpoints" % (len(lines)), + line_to_id[line], + breakpoint["id"], + "verify previous breakpoints stayed the same", ) - for breakpoint in breakpoints: - line = breakpoint["line"] - # Verify the same breakpoints are still set within LLDB by - # making sure the breakpoint ID didn't change - self.assertEqual( - line_to_id[line], - breakpoint["id"], - "verify previous breakpoints stayed the same", - ) - self.assertIn(line, lines, "line expected in lines array") - self.assertTrue( - breakpoint["verified"], "expect breakpoint still verified" - ) + self.assertIn(line, lines, "line expected in lines array") + self.assertTrue(breakpoint["verified"], "expect breakpoint still verified") # Now clear all breakpoints for the source file by passing down an # empty lines array lines = [] response = self.dap_server.request_setBreakpoints(self.main_path, lines) - if response: - breakpoints = response["body"]["breakpoints"] - self.assertEqual( - len(breakpoints), - len(lines), - "expect %u source breakpoints" % (len(lines)), - ) + breakpoints = response["body"]["breakpoints"] + self.assertEqual( + len(breakpoints), + len(lines), + "expect %u source breakpoints" % (len(lines)), + ) # Verify with the target that all breakpoints have been cleared response = self.dap_server.request_testGetTargetBreakpoints() - if response: - breakpoints = response["body"]["breakpoints"] - self.assertEqual( - len(breakpoints), - len(lines), - "expect %u source breakpoints" % (len(lines)), - ) + breakpoints = 
response["body"]["breakpoints"] + self.assertEqual( + len(breakpoints), + len(lines), + "expect %u source breakpoints" % (len(lines)), + ) # Now set a breakpoint again in the same source file and verify it # was added. @@ -281,12 +270,11 @@ class TestDAP_setBreakpoints(lldbdap_testcase.DAPTestCaseBase): self.assertEqual( len(breakpoints), len(lines), "expect %u source breakpoints" % (len(lines)) ) - for breakpoint, index in zip(breakpoints, range(len(lines))): + for index, breakpoint in enumerate(breakpoints): line = breakpoint["line"] - self.assertTrue(line, lines[index]) + self.assertEqual(line, lines[index]) # Store the "id" of the breakpoint that was set for later line_to_id[line] = breakpoint["id"] - self.assertIn(line, lines, "line expected in lines array") self.assertTrue(breakpoint["verified"], "expect breakpoint verified") # Now clear all breakpoints for the source file by not setting the @@ -356,3 +344,49 @@ class TestDAP_setBreakpoints(lldbdap_testcase.DAPTestCaseBase): self.continue_to_breakpoints(breakpoint_ids) i = int(self.dap_server.get_local_variable_value("i")) self.assertEqual(i, 7, "i != 7 showing post hitCondition hits every time") + + @skipIfWindows + def test_column_breakpoints(self): + """Test setting multiple breakpoints in the same line at different columns.""" + loop_line = line_number("main.cpp", "// break loop") + + program = self.getBuildArtifact("a.out") + self.build_and_launch(program) + + # Set two breakpoints on the loop line at different columns. + columns = [13, 39] + response = self.dap_server.request_setBreakpoints( + self.main_path, [loop_line, loop_line], list({"column": c} for c in columns) + ) + + # Verify the breakpoints were set correctly + breakpoints = response["body"]["breakpoints"] + breakpoint_ids = [] + self.assertEqual( + len(breakpoints), + len(columns), + "expect %u source breakpoints" % (len(columns)), + ) + for index, breakpoint in enumerate(breakpoints): + self.assertEqual(breakpoint["line"], loop_line) + self.assertEqual(breakpoint["column"], columns[index]) + self.assertTrue(breakpoint["verified"], "expect breakpoint verified") + breakpoint_ids.append(breakpoint["id"]) + + # Continue to the first breakpoint, + self.continue_to_breakpoints([breakpoint_ids[0]]) + + # We should have stopped right before the call to `twelve`. + # Step into and check we are inside `twelve`. + self.stepIn() + func_name = self.get_stackFrames()[0]["name"] + self.assertEqual(func_name, "twelve(int)") + + # Continue to the second breakpoint. + self.continue_to_breakpoints([breakpoint_ids[1]]) + + # We should have stopped right before the call to `fourteen`. + # Step into and check we are inside `fourteen`. + self.stepIn() + func_name = self.get_stackFrames()[0]["name"] + self.assertEqual(func_name, "a::fourteen(int)") diff --git lldb/test/Shell/SymbolFile/DWARF/x86/discontinuous-function.s lldb/test/Shell/SymbolFile/DWARF/x86/discontinuous-function.s index 93ea9f33e762..197ab9aa1491 100644 --- lldb/test/Shell/SymbolFile/DWARF/x86/discontinuous-function.s +++ lldb/test/Shell/SymbolFile/DWARF/x86/discontinuous-function.s @@ -6,17 +6,29 @@ # The function bar has been placed "in the middle" of foo, and the function # entry point is deliberately not its lowest address. 
-# RUN: llvm-mc -triple x86_64-pc-linux -filetype=obj %s -o %t -# RUN: %lldb %t -o "image lookup -v -n foo" -o "expr -- &foo" -o exit | FileCheck %s +# RUN: split-file %s %t +# RUN: llvm-mc -triple x86_64-pc-linux -filetype=obj %t/input.s -o %t/input.o +# RUN: %lldb %t/input.o -s %t/commands -o exit | FileCheck %s -# CHECK-LABEL: image lookup +#--- commands + +image lookup -v -n foo +# CHECK-LABEL: image lookup -v -n foo +# CHECK: 1 match found in {{.*}} +# CHECK: Summary: input.o`foo +# CHECK: Function: id = {{.*}}, name = "foo", ranges = [0x0000000000000000-0x000000000000000e)[0x0000000000000014-0x000000000000001c) + +image lookup -v --regex -n '^foo$' +# CHECK-LABEL: image lookup -v --regex -n '^foo$' # CHECK: 1 match found in {{.*}} -# CHECK: Summary: {{.*}}`foo +# CHECK: Summary: input.o`foo # CHECK: Function: id = {{.*}}, name = "foo", ranges = [0x0000000000000000-0x000000000000000e)[0x0000000000000014-0x000000000000001c) +expr -- &foo # CHECK-LABEL: expr -- &foo # CHECK: (void (*)()) $0 = 0x0000000000000007 +#--- input.s .text foo.__part.1: diff --git lldb/test/Shell/SymbolFile/DWARF/x86/explicit-member-function-quals.cpp lldb/test/Shell/SymbolFile/DWARF/x86/explicit-member-function-quals.cpp new file mode 100644 index 000000000000..5d1222795dd8 --- /dev/null +++ lldb/test/Shell/SymbolFile/DWARF/x86/explicit-member-function-quals.cpp @@ -0,0 +1,22 @@ +// XFAIL: * + +// Tests that we correctly deduce the CV-quals and storage +// class of explicit object member functions. +// +// RUN: %clangxx_host %s -target x86_64-pc-linux -g -std=c++23 -c -o %t +// RUN: %lldb %t -b -o "type lookup Foo" 2>&1 | FileCheck %s +// +// CHECK: (lldb) type lookup Foo +// CHECK-NEXT: struct Foo { +// CHECK-NEXT: void Method(Foo); +// CHECK-NEXT: void cMethod(const Foo &) const; +// CHECK-NEXT: void vMethod(volatile Foo &) volatile; +// CHECK-NEXT: void cvMethod(const volatile Foo &) const volatile; +// CHECK-NEXT: } + +struct Foo { + void Method(this Foo) {} + void cMethod(this Foo const &) {} + void vMethod(this Foo volatile &) {} + void cvMethod(this Foo const volatile &) {} +} f; diff --git lldb/tools/lldb-dap/.vscodeignore lldb/tools/lldb-dap/.vscodeignore new file mode 100644 index 000000000000..0491ba879fc3 --- /dev/null +++ lldb/tools/lldb-dap/.vscodeignore @@ -0,0 +1,9 @@ +// Ignore everything by default +**/* + +// Only include specific files and directories +!LICENSE.TXT +!package.json +!README.md +!out/** +!syntaxes/** diff --git lldb/tools/lldb-dap/DAP.h lldb/tools/lldb-dap/DAP.h index 846300cb945b..b23be68ea002 100644 --- lldb/tools/lldb-dap/DAP.h +++ lldb/tools/lldb-dap/DAP.h @@ -50,7 +50,8 @@ namespace lldb_dap { -typedef llvm::DenseMap<uint32_t, SourceBreakpoint> SourceBreakpointMap; +typedef llvm::DenseMap<std::pair<uint32_t, uint32_t>, SourceBreakpoint> + SourceBreakpointMap; typedef llvm::StringMap<FunctionBreakpoint> FunctionBreakpointMap; typedef llvm::DenseMap<lldb::addr_t, InstructionBreakpoint> InstructionBreakpointMap; diff --git lldb/tools/lldb-dap/IOStream.h lldb/tools/lldb-dap/IOStream.h index 74889eb2e5a8..c91b2f717893 100644 --- lldb/tools/lldb-dap/IOStream.h +++ lldb/tools/lldb-dap/IOStream.h @@ -10,13 +10,8 @@ #define LLDB_TOOLS_LLDB_DAP_IOSTREAM_H #if defined(_WIN32) -// We need to #define NOMINMAX in order to skip `min()` and `max()` macro -// definitions that conflict with other system headers. -// We also need to #undef GetObject (which is defined to GetObjectW) because -// the JSON code we use also has methods named `GetObject()` and we conflict -// against these. 
-#define NOMINMAX
-#include <windows.h>
+#include "lldb/Host/windows/windows.h"
+#include <winsock2.h>
 #else
 typedef int SOCKET;
 #endif
diff --git lldb/tools/lldb-dap/OutputRedirector.cpp lldb/tools/lldb-dap/OutputRedirector.cpp
index 8fcbcfec99c4..7935e17a653b 100644
--- lldb/tools/lldb-dap/OutputRedirector.cpp
+++ lldb/tools/lldb-dap/OutputRedirector.cpp
@@ -6,6 +6,9 @@
 //
 //===----------------------------------------------------------------------===/
 
+#include "OutputRedirector.h"
+#include "DAP.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Error.h"
 #include <system_error>
 #if defined(_WIN32)
@@ -15,45 +18,59 @@
 #include <unistd.h>
 #endif
 
-#include "DAP.h"
-#include "OutputRedirector.h"
-#include "llvm/ADT/StringRef.h"
-
 using lldb_private::Pipe;
-using lldb_private::Status;
 using llvm::createStringError;
 using llvm::Error;
 using llvm::Expected;
+using llvm::inconvertibleErrorCode;
 using llvm::StringRef;
 
 namespace lldb_dap {
 
+int OutputRedirector::kInvalidDescriptor = -1;
+
+OutputRedirector::OutputRedirector() : m_fd(kInvalidDescriptor) {}
+
 Expected<int> OutputRedirector::GetWriteFileDescriptor() {
-  if (!m_pipe.CanWrite())
+  if (m_fd == kInvalidDescriptor)
     return createStringError(std::errc::bad_file_descriptor,
                              "write handle is not open for writing");
-  return m_pipe.GetWriteFileDescriptor();
+  return m_fd;
 }
 
 Error OutputRedirector::RedirectTo(std::function<void(StringRef)> callback) {
-  Status status = m_pipe.CreateNew(/*child_process_inherit=*/false);
-  if (status.Fail())
-    return status.takeError();
+  assert(m_fd == kInvalidDescriptor && "Output redirector already started.");
+  int new_fd[2];
 
-  m_forwarder = std::thread([this, callback]() {
-    char buffer[OutputBufferSize];
-    while (m_pipe.CanRead() && !m_stopped) {
-      size_t bytes_read;
-      Status status = m_pipe.Read(&buffer, sizeof(buffer), bytes_read);
-      if (status.Fail())
-        continue;
+#if defined(_WIN32)
+  if (::_pipe(new_fd, OutputBufferSize, O_TEXT) == -1) {
+#else
+  if (::pipe(new_fd) == -1) {
+#endif
+    int error = errno;
+    return createStringError(inconvertibleErrorCode(),
+                             "Couldn't create new pipe %s", strerror(error));
+  }
 
-      // EOF detected
-      if (bytes_read == 0 || m_stopped)
+  int read_fd = new_fd[0];
+  m_fd = new_fd[1];
+  m_forwarder = std::thread([this, callback, read_fd]() {
+    char buffer[OutputBufferSize];
+    while (!m_stopped) {
+      ssize_t bytes_count = ::read(read_fd, &buffer, sizeof(buffer));
+      // EOF detected.
+      if (bytes_count == 0)
+        break;
+      if (bytes_count == -1) {
+        // Skip non-fatal errors.
+        if (errno == EAGAIN || errno == EINTR || errno == EWOULDBLOCK)
+          continue;
         break;
+      }
 
-      callback(StringRef(buffer, bytes_read));
+      callback(StringRef(buffer, bytes_count));
     }
+    ::close(read_fd);
   });
 
   return Error::success();
@@ -62,14 +79,15 @@ Error OutputRedirector::RedirectTo(std::function<void(StringRef)> callback) {
 
 void OutputRedirector::Stop() {
   m_stopped = true;
 
-  if (m_pipe.CanWrite()) {
+  if (m_fd != kInvalidDescriptor) {
+    int fd = m_fd;
+    m_fd = kInvalidDescriptor;
     // Closing the pipe may not be sufficient to wake up the thread in case the
     // write descriptor is duplicated (to stdout/err or to another process).
     // Write a null byte to ensure the read call returns.
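The wake-up idiom used in `Stop()` (write one byte into the pipe so a blocked `read()` returns, then close the write end to deliver EOF) can be sketched independently of LLDB. The Python snippet below is only an illustration of the same pattern under ordinary POSIX pipe semantics; it is not lldb-dap code.

```python
import os
import threading


def make_redirector(callback, buf_size=4096):
    """Minimal sketch of a pipe-backed output redirector."""
    read_fd, write_fd = os.pipe()

    def forward():
        while True:
            data = os.read(read_fd, buf_size)  # blocks until data or EOF
            if not data:  # zero bytes read means EOF
                break
            callback(data)
        os.close(read_fd)

    thread = threading.Thread(target=forward)
    thread.start()

    def stop():
        # Writing a byte guarantees the blocked read() wakes up even if the
        # write end has been duplicated elsewhere; closing then delivers EOF.
        os.write(write_fd, b"\0")
        os.close(write_fd)
        thread.join()

    return write_fd, stop


chunks = []
fd, stop = make_redirector(chunks.append)
os.write(fd, b"hello")
stop()
print(b"".join(chunks))  # b'hello\x00' (the trailing byte is the wake-up byte)
```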
char buf[] = "\0"; - size_t bytes_written; - m_pipe.Write(buf, sizeof(buf), bytes_written); - m_pipe.CloseWriteFileDescriptor(); + ::write(fd, buf, sizeof(buf)); + ::close(fd); m_forwarder.join(); } } diff --git lldb/tools/lldb-dap/OutputRedirector.h lldb/tools/lldb-dap/OutputRedirector.h index 41ea05c22c69..d2bd39797f3d 100644 --- lldb/tools/lldb-dap/OutputRedirector.h +++ lldb/tools/lldb-dap/OutputRedirector.h @@ -20,6 +20,8 @@ namespace lldb_dap { class OutputRedirector { public: + static int kInvalidDescriptor; + /// Creates writable file descriptor that will invoke the given callback on /// each write in a background thread. /// @@ -33,13 +35,13 @@ public: ~OutputRedirector() { Stop(); } - OutputRedirector() = default; + OutputRedirector(); OutputRedirector(const OutputRedirector &) = delete; OutputRedirector &operator=(const OutputRedirector &) = delete; private: std::atomic<bool> m_stopped = false; - lldb_private::Pipe m_pipe; + int m_fd; std::thread m_forwarder; }; diff --git lldb/tools/lldb-dap/lldb-dap.cpp lldb/tools/lldb-dap/lldb-dap.cpp index 9e0e7f21ce4f..e323990d8b6e 100644 --- lldb/tools/lldb-dap/lldb-dap.cpp +++ lldb/tools/lldb-dap/lldb-dap.cpp @@ -912,6 +912,196 @@ void request_attach(DAP &dap, const llvm::json::Object &request) { } } +// "BreakpointLocationsRequest": { +// "allOf": [ { "$ref": "#/definitions/Request" }, { +// "type": "object", +// "description": "The `breakpointLocations` request returns all possible +// locations for source breakpoints in a given range.\nClients should only +// call this request if the corresponding capability +// `supportsBreakpointLocationsRequest` is true.", +// "properties": { +// "command": { +// "type": "string", +// "enum": [ "breakpointLocations" ] +// }, +// "arguments": { +// "$ref": "#/definitions/BreakpointLocationsArguments" +// } +// }, +// "required": [ "command" ] +// }] +// }, +// "BreakpointLocationsArguments": { +// "type": "object", +// "description": "Arguments for `breakpointLocations` request.", +// "properties": { +// "source": { +// "$ref": "#/definitions/Source", +// "description": "The source location of the breakpoints; either +// `source.path` or `source.sourceReference` must be specified." +// }, +// "line": { +// "type": "integer", +// "description": "Start line of range to search possible breakpoint +// locations in. If only the line is specified, the request returns all +// possible locations in that line." +// }, +// "column": { +// "type": "integer", +// "description": "Start position within `line` to search possible +// breakpoint locations in. It is measured in UTF-16 code units and the +// client capability `columnsStartAt1` determines whether it is 0- or +// 1-based. If no column is given, the first position in the start line is +// assumed." +// }, +// "endLine": { +// "type": "integer", +// "description": "End line of range to search possible breakpoint +// locations in. If no end line is given, then the end line is assumed to +// be the start line." +// }, +// "endColumn": { +// "type": "integer", +// "description": "End position within `endLine` to search possible +// breakpoint locations in. It is measured in UTF-16 code units and the +// client capability `columnsStartAt1` determines whether it is 0- or +// 1-based. If no end column is given, the last position in the end line +// is assumed." 
+// } +// }, +// "required": [ "source", "line" ] +// }, +// "BreakpointLocationsResponse": { +// "allOf": [ { "$ref": "#/definitions/Response" }, { +// "type": "object", +// "description": "Response to `breakpointLocations` request.\nContains +// possible locations for source breakpoints.", +// "properties": { +// "body": { +// "type": "object", +// "properties": { +// "breakpoints": { +// "type": "array", +// "items": { +// "$ref": "#/definitions/BreakpointLocation" +// }, +// "description": "Sorted set of possible breakpoint locations." +// } +// }, +// "required": [ "breakpoints" ] +// } +// }, +// "required": [ "body" ] +// }] +// }, +// "BreakpointLocation": { +// "type": "object", +// "description": "Properties of a breakpoint location returned from the +// `breakpointLocations` request.", +// "properties": { +// "line": { +// "type": "integer", +// "description": "Start line of breakpoint location." +// }, +// "column": { +// "type": "integer", +// "description": "The start position of a breakpoint location. Position +// is measured in UTF-16 code units and the client capability +// `columnsStartAt1` determines whether it is 0- or 1-based." +// }, +// "endLine": { +// "type": "integer", +// "description": "The end line of breakpoint location if the location +// covers a range." +// }, +// "endColumn": { +// "type": "integer", +// "description": "The end position of a breakpoint location (if the +// location covers a range). Position is measured in UTF-16 code units and +// the client capability `columnsStartAt1` determines whether it is 0- or +// 1-based." +// } +// }, +// "required": [ "line" ] +// }, +void request_breakpointLocations(DAP &dap, const llvm::json::Object &request) { + llvm::json::Object response; + FillResponse(request, response); + auto *arguments = request.getObject("arguments"); + auto *source = arguments->getObject("source"); + std::string path = GetString(source, "path").str(); + uint64_t start_line = GetUnsigned(arguments, "line", 0); + uint64_t start_column = GetUnsigned(arguments, "column", 0); + uint64_t end_line = GetUnsigned(arguments, "endLine", start_line); + uint64_t end_column = + GetUnsigned(arguments, "endColumn", std::numeric_limits<uint64_t>::max()); + + lldb::SBFileSpec file_spec(path.c_str(), true); + lldb::SBSymbolContextList compile_units = + dap.target.FindCompileUnits(file_spec); + + // Find all relevant lines & columns + llvm::SmallVector<std::pair<uint32_t, uint32_t>, 8> locations; + for (uint32_t c_idx = 0, c_limit = compile_units.GetSize(); c_idx < c_limit; + ++c_idx) { + const lldb::SBCompileUnit &compile_unit = + compile_units.GetContextAtIndex(c_idx).GetCompileUnit(); + if (!compile_unit.IsValid()) + continue; + lldb::SBFileSpec primary_file_spec = compile_unit.GetFileSpec(); + + // Go through the line table and find all matching lines / columns + for (uint32_t l_idx = 0, l_limit = compile_unit.GetNumLineEntries(); + l_idx < l_limit; ++l_idx) { + lldb::SBLineEntry line_entry = compile_unit.GetLineEntryAtIndex(l_idx); + + // Filter by line / column + uint32_t line = line_entry.GetLine(); + if (line < start_line || line > end_line) + continue; + uint32_t column = line_entry.GetColumn(); + if (column == LLDB_INVALID_COLUMN_NUMBER) + continue; + if (line == start_line && column < start_column) + continue; + if (line == end_line && column > end_column) + continue; + + // Make sure we are in the right file. + // We might have a match on line & column range and still + // be in the wrong file, e.g. for included files. 
+      // Given that the involved pointers point into LLDB's string pool,
+      // we can directly compare the `const char*` pointers.
+      if (line_entry.GetFileSpec().GetFilename() !=
+              primary_file_spec.GetFilename() ||
+          line_entry.GetFileSpec().GetDirectory() !=
+              primary_file_spec.GetDirectory())
+        continue;
+
+      locations.emplace_back(line, column);
+    }
+  }
+
+  // The line entries are sorted by addresses, but we must return the list
+  // ordered by line / column position.
+  std::sort(locations.begin(), locations.end());
+  locations.erase(std::unique(locations.begin(), locations.end()),
+                  locations.end());
+
+  llvm::json::Array locations_json;
+  for (auto &l : locations) {
+    llvm::json::Object location;
+    location.try_emplace("line", l.first);
+    location.try_emplace("column", l.second);
+    locations_json.emplace_back(std::move(location));
+  }
+
+  llvm::json::Object body;
+  body.try_emplace("breakpoints", std::move(locations_json));
+  response.try_emplace("body", std::move(body));
+  dap.SendJSON(llvm::json::Value(std::move(response)));
+}
+
 // "ContinueRequest": {
 //   "allOf": [ { "$ref": "#/definitions/Request" }, {
 //     "type": "object",
@@ -1969,6 +2159,8 @@ void request_initialize(DAP &dap, const llvm::json::Object &request) {
   body.try_emplace("supportsCompletionsRequest", true);
   // The debug adapter supports the disassembly request.
   body.try_emplace("supportsDisassembleRequest", true);
+  // The debug adapter supports the `breakpointLocations` request.
+  body.try_emplace("supportsBreakpointLocationsRequest", true);
   // The debug adapter supports stepping granularities (argument `granularity`)
   // for the stepping requests.
   body.try_emplace("supportsSteppingGranularity", true);
@@ -2733,9 +2925,10 @@ void request_setBreakpoints(DAP &dap, const llvm::json::Object &request) {
       const auto *bp_obj = bp.getAsObject();
       if (bp_obj) {
         SourceBreakpoint src_bp(dap, *bp_obj);
-        request_bps.try_emplace(src_bp.line, src_bp);
+        std::pair<uint32_t, uint32_t> bp_pos(src_bp.line, src_bp.column);
+        request_bps.try_emplace(bp_pos, src_bp);
         const auto [iv, inserted] =
-            dap.source_breakpoints[path].try_emplace(src_bp.line, src_bp);
+            dap.source_breakpoints[path].try_emplace(bp_pos, src_bp);
         // We check if this breakpoint already exists to update it
         if (inserted)
           iv->getSecond().SetBreakpoint(path.data());
@@ -4806,6 +4999,8 @@ void request_setInstructionBreakpoints(DAP &dap,
 
 void RegisterRequestCallbacks(DAP &dap) {
   dap.RegisterRequestCallback("attach", request_attach);
+  dap.RegisterRequestCallback("breakpointLocations",
+                              request_breakpointLocations);
   dap.RegisterRequestCallback("completions", request_completions);
   dap.RegisterRequestCallback("continue", request_continue);
   dap.RegisterRequestCallback("configurationDone", request_configurationDone);
diff --git lldb/tools/lldb-dap/package.json lldb/tools/lldb-dap/package.json
index bbe65e1f73fd..445e1961dbe8 100644
--- lldb/tools/lldb-dap/package.json
+++ lldb/tools/lldb-dap/package.json
@@ -1,7 +1,7 @@
 {
   "name": "lldb-dap",
   "displayName": "LLDB DAP",
-  "version": "0.2.8",
+  "version": "0.2.9",
   "publisher": "llvm-vs-code-extensions",
   "homepage": "https://lldb.llvm.org",
   "description": "LLDB debugging from VSCode",
diff --git lldb/unittests/OperatingSystem/CMakeLists.txt lldb/unittests/OperatingSystem/CMakeLists.txt
new file mode 100644
index 000000000000..ae810f37b057
--- /dev/null
+++ lldb/unittests/OperatingSystem/CMakeLists.txt
@@ -0,0 +1,16 @@
+add_lldb_unittest(OperatingSystemTests
+  TestThreadSpecificBreakpoints.cpp
+
+  LINK_LIBS
+    LLVMTestingSupport
+    lldbCore
+    lldbHost
+    lldbInterpreter
+    lldbPluginProcessUtility
+    lldbSymbol
+    lldbTarget
+    lldbValueObject
+
+  LINK_COMPONENTS
+    Support
+  )
diff --git lldb/unittests/OperatingSystem/OperatingSystemPlugin.h lldb/unittests/OperatingSystem/OperatingSystemPlugin.h
new file mode 100644
index 000000000000..c58362dec03a
--- /dev/null
+++ lldb/unittests/OperatingSystem/OperatingSystemPlugin.h
@@ -0,0 +1,59 @@
+//===-- OperatingSystemPlugin.h ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "lldb/Core/PluginManager.h"
+#include "lldb/Target/OperatingSystem.h"
+#include "lldb/Target/Thread.h"
+#include "lldb/Target/ThreadList.h"
+
+/// An operating system plugin that does nothing: simply keeps the thread lists
+/// as they are.
+class OperatingSystemIdentityMap : public lldb_private::OperatingSystem {
+public:
+  OperatingSystemIdentityMap(lldb_private::Process *process)
+      : OperatingSystem(process) {}
+
+  static OperatingSystem *CreateInstance(lldb_private::Process *process,
+                                         bool force) {
+    return new OperatingSystemIdentityMap(process);
+  }
+  static llvm::StringRef GetPluginNameStatic() { return "identity map"; }
+  static llvm::StringRef GetPluginDescriptionStatic() { return ""; }
+
+  static void Initialize() {
+    lldb_private::PluginManager::RegisterPlugin(GetPluginNameStatic(),
+                                                GetPluginDescriptionStatic(),
+                                                CreateInstance, nullptr);
+  }
+  static void Terminate() {
+    lldb_private::PluginManager::UnregisterPlugin(CreateInstance);
+  }
+  llvm::StringRef GetPluginName() override { return GetPluginNameStatic(); }
+
+  // Simply adds the threads from real_thread_list into new_thread_list.
+  bool UpdateThreadList(lldb_private::ThreadList &old_thread_list,
+                        lldb_private::ThreadList &real_thread_list,
+                        lldb_private::ThreadList &new_thread_list) override {
+    for (const auto &real_thread : real_thread_list.Threads())
+      new_thread_list.AddThread(real_thread);
+    return true;
+  }
+
+  void ThreadWasSelected(lldb_private::Thread *thread) override {}
+
+  lldb::RegisterContextSP
+  CreateRegisterContextForThread(lldb_private::Thread *thread,
+                                 lldb::addr_t reg_data_addr) override {
+    return thread->GetRegisterContext();
+  }
+
+  lldb::StopInfoSP
+  CreateThreadStopReason(lldb_private::Thread *thread) override {
+    return thread->GetStopInfo();
+  }
+};
diff --git lldb/unittests/OperatingSystem/TestThreadSpecificBreakpoints.cpp lldb/unittests/OperatingSystem/TestThreadSpecificBreakpoints.cpp
new file mode 100644
index 000000000000..369f5b3ae467
--- /dev/null
+++ lldb/unittests/OperatingSystem/TestThreadSpecificBreakpoints.cpp
@@ -0,0 +1,10 @@
+//===-- TestThreadSpecificBreakpoints.cpp -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "OperatingSystemPlugin.h"
+LLDB_PLUGIN_DEFINE(OperatingSystemIdentityMap)
diff --git llvm/CMakeLists.txt llvm/CMakeLists.txt
index c9ff3696e22d..f5293e866324 100644
--- llvm/CMakeLists.txt
+++ llvm/CMakeLists.txt
@@ -829,6 +829,7 @@ option (LLVM_ENABLE_DOXYGEN "Use doxygen to generate llvm API documentation." OF
 option (LLVM_ENABLE_SPHINX "Use Sphinx to generate llvm documentation." OFF)
 option (LLVM_ENABLE_OCAMLDOC "Build OCaml bindings documentation." ON)
 option (LLVM_ENABLE_BINDINGS "Build bindings." ON)
+option (LLVM_BUILD_TELEMETRY "Build the telemetry library. This does not enable telemetry." ON)
 
 set(LLVM_INSTALL_DOXYGEN_HTML_DIR "${CMAKE_INSTALL_DOCDIR}/llvm/doxygen-html"
     CACHE STRING "Doxygen-generated HTML documentation install directory")
diff --git llvm/docs/ReleaseNotes.md llvm/docs/ReleaseNotes.md
index 0b4afc77ab9d..dc8439b28895 100644
--- llvm/docs/ReleaseNotes.md
+++ llvm/docs/ReleaseNotes.md
@@ -135,6 +135,14 @@ Changes to LLDB
 * When building LLDB with Python support, the minimum version of Python is
   now 3.8.
 
+* LLDB now supports hardware watchpoints for AArch64 Windows targets. Windows
+  does not provide an API to query the number of supported hardware
+  watchpoints, so the current implementation allows only one watchpoint, as
+  tested with Windows 11 on the Microsoft SQ2 and Snapdragon X Elite platforms.
+
+### Changes to lldb-dap
+
+* Breakpoints can now be set for specific columns within a line.
 
 Changes to BOLT
 ---------------------------------
diff --git llvm/docs/SPIRVUsage.rst llvm/docs/SPIRVUsage.rst
index 23c5fe37a9b8..b7601b26beb8 100644
--- llvm/docs/SPIRVUsage.rst
+++ llvm/docs/SPIRVUsage.rst
@@ -204,7 +204,7 @@ list of supported SPIR-V extensions, sorted alphabetically by their extension na
    * - ``SPV_KHR_non_semantic_info``
      - Adds the ability to declare extended instruction sets that have no semantic impact and can be safely removed from a module.
 
-To enable multiple extensions, list them separated by spaces. For example, to enable support for atomic operations on floating-point numbers and arbitrary precision integers, use:
+To enable multiple extensions, list them separated by commas. For example, to enable support for atomic operations on floating-point numbers and arbitrary precision integers, use:
 
 ``-spirv-ext=+SPV_EXT_shader_atomic_float_add,+SPV_INTEL_arbitrary_precision_integers``
diff --git llvm/docs/TableGen/ProgRef.rst llvm/docs/TableGen/ProgRef.rst
index cfe61382658e..f26056475162 100644
--- llvm/docs/TableGen/ProgRef.rst
+++ llvm/docs/TableGen/ProgRef.rst
@@ -268,7 +268,7 @@ high-level types (e.g., ``dag``). This flexibility allows you to describe a
 wide range of records conveniently and compactly.
 
 .. productionlist::
-   Type: "bit" | "int" | "string" | "dag"
+   Type: "bit" | "int" | "string" | "dag" | "code"
        :| "bits" "<" `TokInteger` ">"
        :| "list" "<" `Type` ">"
        :| `ClassID`
@@ -285,6 +285,10 @@ wide range of records conveniently and compactly.
   The ``string`` type represents an ordered sequence of characters of arbitrary
   length.
 
+``code``
+  The keyword ``code`` is an alias for ``string`` which may be used to
+  indicate string values that are code.
+
 ``bits<``\ *n*\ ``>``
   The ``bits`` type is a fixed-sized integer of arbitrary length *n* that is
   treated as separate bits. These bits can be accessed individually.
@@ -498,6 +502,8 @@ arguments, producing a value for that bang operator.
The ``!cond`` operator takes a list of pairs of arguments separated by colons. See `Appendix A: Bang Operators`_ for a description of each bang operator. +The `Type` is only accepted for certain bang operators, and must not be +``code``. Suffixed values --------------- @@ -670,7 +676,7 @@ arguments. .. productionlist:: Body: ";" | "{" `BodyItem`* "}" - BodyItem: (`Type` | "code") `TokIdentifier` ["=" `Value`] ";" + BodyItem: `Type` `TokIdentifier` ["=" `Value`] ";" :| "let" `TokIdentifier` ["{" `RangeList` "}"] "=" `Value` ";" :| "defvar" `TokIdentifier` "=" `Value` ";" :| `Assert` @@ -678,8 +684,7 @@ arguments. A field definition in the body specifies a field to be included in the class or record. If no initial value is specified, then the field's value is uninitialized. The type must be specified; TableGen will not infer it from -the value. The keyword ``code`` may be used to emphasize that the field -has a string value that is code. +the value. The ``let`` form is used to reset a field to a new value. This can be done for fields defined directly in the body or fields inherited from parent diff --git llvm/docs/Vectorizers.rst llvm/docs/Vectorizers.rst index f134a6df94a6..d19d03050436 100644 --- llvm/docs/Vectorizers.rst +++ llvm/docs/Vectorizers.rst @@ -405,9 +405,11 @@ Early Exit Vectorization When vectorizing a loop with a single early exit, the loop blocks following the early exit are predicated and the vector loop will always exit via the latch. If the early exit has been taken, the vector loop's successor block -(``middle.split`` below) branches to the early exit block. Otherwise -``middle.block`` selects between the exit block from the latch or the scalar -remainder loop. +(``middle.split`` below) branches to the early exit block via an intermediate +block (``vector.early.exit`` below). This intermediate block is responsible for +calculating any exit values of loop-defined variables that are used in the +early exit block. Otherwise, ``middle.block`` selects between the exit block +from the latch or the scalar remainder loop. .. image:: vplan-early-exit.png diff --git llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl09.rst llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl09.rst index c75d8814918a..a88b6ed48e8d 100644 --- llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl09.rst +++ llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl09.rst @@ -299,7 +299,7 @@ or parser so we'll need to add it. } In this set of code we've added some functionality on how to keep track of the -line and column of the "source file". As we lex every token we set our current +line and column of the "source file". As we lex every token we set our current "lexical location" to the assorted line and column for the beginning of the token. 
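The bookkeeping being described here is small enough to sketch. The Python snippet below is an illustrative stand-in, not the tutorial's C++ code: every character is consumed through a single `advance()` helper, which is what keeps the current line and column in sync with the token being lexed.

```python
import io


class LocationTrackingLexer:
    """Illustrative sketch: route all character reads through advance()."""

    def __init__(self, source):
        self.stream = io.StringIO(source)
        self.line = 1
        self.col = 0

    def advance(self):
        # The single point where characters are consumed, so the current
        # "lexical location" always matches the character just read.
        ch = self.stream.read(1)
        if not ch:
            return ch  # EOF: leave the location untouched
        if ch == "\n":
            self.line += 1
            self.col = 0
        else:
            self.col += 1
        return ch


lexer = LocationTrackingLexer("def f()\n  x")
while lexer.advance():
    pass
print(lexer.line, lexer.col)  # 2 3
```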
We do this by overriding all of the previous calls to ``getchar()`` with our new ``advance()`` that keeps track of the information diff --git llvm/docs/vplan-early-exit.dot llvm/docs/vplan-early-exit.dot index 63490b0cdb2e..980fc6939b6d 100644 --- llvm/docs/vplan-early-exit.dot +++ llvm/docs/vplan-early-exit.dot @@ -19,23 +19,27 @@ compound=true "middle.split" ] N4 -> N5 [ label=""] - N4 -> N6 [ label=""] + N4 -> N7 [ label=""] N5 [label = - "early.exit" + "vector.early.exit" ] + N5 -> N6 [ label=""] N6 [label = - "middle.block" + "early.exit" ] - N6 -> N9 [ label=""] - N6 -> N7 [ label=""] N7 [label = - "scalar.ph" + "middle.block" ] + N7 -> N10 [ label=""] N7 -> N8 [ label=""] N8 [label = - "loop.header" + "scalar.ph" ] + N8 -> N9 [ label=""] N9 [label = + "loop.header" + ] + N10 [label = "latch.exit" ] } diff --git llvm/docs/vplan-early-exit.png llvm/docs/vplan-early-exit.png index 3cd293bcdbcc..249b90c69e2b 100644 Binary files llvm/docs/vplan-early-exit.png and llvm/docs/vplan-early-exit.png differ diff --git llvm/include/llvm-c/Core.h llvm/include/llvm-c/Core.h index dc8ecf4fb2ad..43bb775e5781 100644 --- llvm/include/llvm-c/Core.h +++ llvm/include/llvm-c/Core.h @@ -321,11 +321,6 @@ typedef enum { LLVMRealPredicateTrue /**< Always true (always folded) */ } LLVMRealPredicate; -typedef enum { - LLVMLandingPadCatch, /**< A catch clause */ - LLVMLandingPadFilter /**< A filter clause */ -} LLVMLandingPadClauseTy; - typedef enum { LLVMNotThreadLocal = 0, LLVMGeneralDynamicTLSModel, diff --git llvm/include/llvm/ADT/StringSwitch.h llvm/include/llvm/ADT/StringSwitch.h index 7093da07663a..86e591c71c92 100644 --- llvm/include/llvm/ADT/StringSwitch.h +++ llvm/include/llvm/ADT/StringSwitch.h @@ -14,7 +14,6 @@ #define LLVM_ADT_STRINGSWITCH_H #include "llvm/ADT/StringRef.h" -#include "llvm/Support/Compiler.h" #include <cassert> #include <cstring> #include <optional> @@ -67,9 +66,7 @@ public: // Case-sensitive case matchers StringSwitch &Case(StringLiteral S, T Value) { - if (!Result && Str == S) { - Result = std::move(Value); - } + CaseImpl(Value, S); return *this; } @@ -88,61 +85,59 @@ public: } StringSwitch &Cases(StringLiteral S0, StringLiteral S1, T Value) { - return Case(S0, Value).Case(S1, Value); + return CasesImpl(Value, S0, S1); } StringSwitch &Cases(StringLiteral S0, StringLiteral S1, StringLiteral S2, T Value) { - return Case(S0, Value).Cases(S1, S2, Value); + return CasesImpl(Value, S0, S1, S2); } StringSwitch &Cases(StringLiteral S0, StringLiteral S1, StringLiteral S2, StringLiteral S3, T Value) { - return Case(S0, Value).Cases(S1, S2, S3, Value); + return CasesImpl(Value, S0, S1, S2, S3); } StringSwitch &Cases(StringLiteral S0, StringLiteral S1, StringLiteral S2, StringLiteral S3, StringLiteral S4, T Value) { - return Case(S0, Value).Cases(S1, S2, S3, S4, Value); + return CasesImpl(Value, S0, S1, S2, S3, S4); } StringSwitch &Cases(StringLiteral S0, StringLiteral S1, StringLiteral S2, StringLiteral S3, StringLiteral S4, StringLiteral S5, T Value) { - return Case(S0, Value).Cases(S1, S2, S3, S4, S5, Value); + return CasesImpl(Value, S0, S1, S2, S3, S4, S5); } StringSwitch &Cases(StringLiteral S0, StringLiteral S1, StringLiteral S2, StringLiteral S3, StringLiteral S4, StringLiteral S5, StringLiteral S6, T Value) { - return Case(S0, Value).Cases(S1, S2, S3, S4, S5, S6, Value); + return CasesImpl(Value, S0, S1, S2, S3, S4, S5, S6); } StringSwitch &Cases(StringLiteral S0, StringLiteral S1, StringLiteral S2, StringLiteral S3, StringLiteral S4, StringLiteral S5, StringLiteral S6, StringLiteral 
S7, T Value) { - return Case(S0, Value).Cases(S1, S2, S3, S4, S5, S6, S7, Value); + return CasesImpl(Value, S0, S1, S2, S3, S4, S5, S6, S7); } StringSwitch &Cases(StringLiteral S0, StringLiteral S1, StringLiteral S2, StringLiteral S3, StringLiteral S4, StringLiteral S5, StringLiteral S6, StringLiteral S7, StringLiteral S8, T Value) { - return Case(S0, Value).Cases(S1, S2, S3, S4, S5, S6, S7, S8, Value); + return CasesImpl(Value, S0, S1, S2, S3, S4, S5, S6, S7, S8); } StringSwitch &Cases(StringLiteral S0, StringLiteral S1, StringLiteral S2, StringLiteral S3, StringLiteral S4, StringLiteral S5, StringLiteral S6, StringLiteral S7, StringLiteral S8, StringLiteral S9, T Value) { - return Case(S0, Value).Cases(S1, S2, S3, S4, S5, S6, S7, S8, S9, Value); + return CasesImpl(Value, S0, S1, S2, S3, S4, S5, S6, S7, S8, S9); } // Case-insensitive case matchers. StringSwitch &CaseLower(StringLiteral S, T Value) { - if (!Result && Str.equals_insensitive(S)) - Result = std::move(Value); - + CaseLowerImpl(Value, S); return *this; } @@ -161,22 +156,22 @@ public: } StringSwitch &CasesLower(StringLiteral S0, StringLiteral S1, T Value) { - return CaseLower(S0, Value).CaseLower(S1, Value); + return CasesLowerImpl(Value, S0, S1); } StringSwitch &CasesLower(StringLiteral S0, StringLiteral S1, StringLiteral S2, T Value) { - return CaseLower(S0, Value).CasesLower(S1, S2, Value); + return CasesLowerImpl(Value, S0, S1, S2); } StringSwitch &CasesLower(StringLiteral S0, StringLiteral S1, StringLiteral S2, StringLiteral S3, T Value) { - return CaseLower(S0, Value).CasesLower(S1, S2, S3, Value); + return CasesLowerImpl(Value, S0, S1, S2, S3); } StringSwitch &CasesLower(StringLiteral S0, StringLiteral S1, StringLiteral S2, StringLiteral S3, StringLiteral S4, T Value) { - return CaseLower(S0, Value).CasesLower(S1, S2, S3, S4, Value); + return CasesLowerImpl(Value, S0, S1, S2, S3, S4); } [[nodiscard]] R Default(T Value) { @@ -189,6 +184,39 @@ public: assert(Result && "Fell off the end of a string-switch"); return std::move(*Result); } + +private: + // Returns true when `Str` matches the `S` argument, and stores the result. + bool CaseImpl(T &Value, StringLiteral S) { + if (!Result && Str == S) { + Result = std::move(Value); + return true; + } + return false; + } + + // Returns true when `Str` matches the `S` argument (case-insensitive), and + // stores the result. + bool CaseLowerImpl(T &Value, StringLiteral S) { + if (!Result && Str.equals_insensitive(S)) { + Result = std::move(Value); + return true; + } + return false; + } + + template <typename... Args> StringSwitch &CasesImpl(T &Value, Args... Cases) { + // Stop matching after the string is found. + (... || CaseImpl(Value, Cases)); + return *this; + } + + template <typename... Args> + StringSwitch &CasesLowerImpl(T &Value, Args... Cases) { + // Stop matching after the string is found. + (... 
|| CaseLowerImpl(Value, Cases)); + return *this; + } }; } // end namespace llvm diff --git llvm/include/llvm/Analysis/ScalarEvolution.h llvm/include/llvm/Analysis/ScalarEvolution.h index a011f16b4d95..f729b07076d2 100644 --- llvm/include/llvm/Analysis/ScalarEvolution.h +++ llvm/include/llvm/Analysis/ScalarEvolution.h @@ -1188,21 +1188,19 @@ public: ICmpInst::Predicate Pred); struct LoopInvariantPredicate { - ICmpInst::Predicate Pred; + CmpPredicate Pred; const SCEV *LHS; const SCEV *RHS; - LoopInvariantPredicate(ICmpInst::Predicate Pred, const SCEV *LHS, - const SCEV *RHS) + LoopInvariantPredicate(CmpPredicate Pred, const SCEV *LHS, const SCEV *RHS) : Pred(Pred), LHS(LHS), RHS(RHS) {} }; /// If the result of the predicate LHS `Pred` RHS is loop invariant with /// respect to L, return a LoopInvariantPredicate with LHS and RHS being /// invariants, available at L's entry. Otherwise, return std::nullopt. std::optional<LoopInvariantPredicate> - getLoopInvariantPredicate(ICmpInst::Predicate Pred, const SCEV *LHS, - const SCEV *RHS, const Loop *L, - const Instruction *CtxI = nullptr); + getLoopInvariantPredicate(CmpPredicate Pred, const SCEV *LHS, const SCEV *RHS, + const Loop *L, const Instruction *CtxI = nullptr); /// If the result of the predicate LHS `Pred` RHS is loop invariant with /// respect to L at given Context during at least first MaxIter iterations, diff --git llvm/include/llvm/Analysis/ScalarEvolutionExpressions.h llvm/include/llvm/Analysis/ScalarEvolutionExpressions.h index 6eb1aca1cf76..872746b7df5c 100644 --- llvm/include/llvm/Analysis/ScalarEvolutionExpressions.h +++ llvm/include/llvm/Analysis/ScalarEvolutionExpressions.h @@ -944,10 +944,11 @@ public: Operands.push_back(visit(Op)); const Loop *L = Expr->getLoop(); - if (0 == Map.count(L)) + auto It = Map.find(L); + if (It == Map.end()) return SE.getAddRecExpr(Operands, L, Expr->getNoWrapFlags()); - return SCEVAddRecExpr::evaluateAtIteration(Operands, Map[L], SE); + return SCEVAddRecExpr::evaluateAtIteration(Operands, It->second, SE); } private: diff --git llvm/include/llvm/Analysis/ValueTracking.h llvm/include/llvm/Analysis/ValueTracking.h index b4918c2d1e8a..dba54be4c92f 100644 --- llvm/include/llvm/Analysis/ValueTracking.h +++ llvm/include/llvm/Analysis/ValueTracking.h @@ -164,11 +164,8 @@ bool isKnownNegative(const Value *V, const SimplifyQuery &SQ, /// Return true if the given values are known to be non-equal when defined. /// Supports scalar integer types only. -bool isKnownNonEqual(const Value *V1, const Value *V2, const DataLayout &DL, - AssumptionCache *AC = nullptr, - const Instruction *CxtI = nullptr, - const DominatorTree *DT = nullptr, - bool UseInstrInfo = true); +bool isKnownNonEqual(const Value *V1, const Value *V2, const SimplifyQuery &SQ, + unsigned Depth = 0); /// Return true if 'V & Mask' is known to be zero. We use this predicate to /// simplify operations downstream. 
Mask is known to be zero for bits that V diff --git llvm/include/llvm/BinaryFormat/ELF.h llvm/include/llvm/BinaryFormat/ELF.h index 48ae0db80f43..8853c4a88b0b 100644 --- llvm/include/llvm/BinaryFormat/ELF.h +++ llvm/include/llvm/BinaryFormat/ELF.h @@ -619,6 +619,7 @@ enum { EF_HEXAGON_MACH_V5 = 0x00000004, // Hexagon V5 EF_HEXAGON_MACH_V55 = 0x00000005, // Hexagon V55 EF_HEXAGON_MACH_V60 = 0x00000060, // Hexagon V60 + EF_HEXAGON_MACH_V61 = 0x00000061, // Hexagon V61 EF_HEXAGON_MACH_V62 = 0x00000062, // Hexagon V62 EF_HEXAGON_MACH_V65 = 0x00000065, // Hexagon V65 EF_HEXAGON_MACH_V66 = 0x00000066, // Hexagon V66 @@ -630,7 +631,11 @@ enum { EF_HEXAGON_MACH_V71T = 0x00008071, // Hexagon V71T EF_HEXAGON_MACH_V73 = 0x00000073, // Hexagon V73 EF_HEXAGON_MACH_V75 = 0x00000075, // Hexagon V75 + EF_HEXAGON_MACH_V77 = 0x00000077, // Hexagon V77 EF_HEXAGON_MACH_V79 = 0x00000079, // Hexagon V79 + EF_HEXAGON_MACH_V81 = 0x00000081, // Hexagon V81 + EF_HEXAGON_MACH_V83 = 0x00000083, // Hexagon V83 + EF_HEXAGON_MACH_V85 = 0x00000085, // Hexagon V85 EF_HEXAGON_MACH = 0x000003ff, // Hexagon V.. // Highest ISA version flags @@ -642,6 +647,7 @@ enum { EF_HEXAGON_ISA_V5 = 0x00000040, // Hexagon V5 ISA EF_HEXAGON_ISA_V55 = 0x00000050, // Hexagon V55 ISA EF_HEXAGON_ISA_V60 = 0x00000060, // Hexagon V60 ISA + EF_HEXAGON_ISA_V61 = 0x00000061, // Hexagon V61 ISA EF_HEXAGON_ISA_V62 = 0x00000062, // Hexagon V62 ISA EF_HEXAGON_ISA_V65 = 0x00000065, // Hexagon V65 ISA EF_HEXAGON_ISA_V66 = 0x00000066, // Hexagon V66 ISA @@ -651,7 +657,11 @@ enum { EF_HEXAGON_ISA_V71 = 0x00000071, // Hexagon V71 ISA EF_HEXAGON_ISA_V73 = 0x00000073, // Hexagon V73 ISA EF_HEXAGON_ISA_V75 = 0x00000075, // Hexagon V75 ISA + EF_HEXAGON_ISA_V77 = 0x00000077, // Hexagon V77 ISA EF_HEXAGON_ISA_V79 = 0x00000079, // Hexagon V79 ISA + EF_HEXAGON_ISA_V81 = 0x00000081, // Hexagon V81 ISA + EF_HEXAGON_ISA_V83 = 0x00000083, // Hexagon V83 ISA + EF_HEXAGON_ISA_V85 = 0x00000085, // Hexagon V85 ISA EF_HEXAGON_ISA = 0x000003ff, // Hexagon V.. 
ISA }; diff --git llvm/include/llvm/CodeGen/BasicTTIImpl.h llvm/include/llvm/CodeGen/BasicTTIImpl.h index 596db3923921..9571bd9330de 100644 --- llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -2204,6 +2204,20 @@ public: return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0, CostKind); } + case Intrinsic::experimental_vp_strided_store: { + auto *Ty = cast<VectorType>(ICA.getArgTypes()[0]); + Align Alignment = thisT()->DL.getABITypeAlign(Ty->getElementType()); + return thisT()->getStridedMemoryOpCost( + Instruction::Store, Ty, /*Ptr=*/nullptr, /*VariableMask=*/true, + Alignment, CostKind, ICA.getInst()); + } + case Intrinsic::experimental_vp_strided_load: { + auto *Ty = cast<VectorType>(ICA.getReturnType()); + Align Alignment = thisT()->DL.getABITypeAlign(Ty->getElementType()); + return thisT()->getStridedMemoryOpCost( + Instruction::Load, Ty, /*Ptr=*/nullptr, /*VariableMask=*/true, + Alignment, CostKind, ICA.getInst()); + } case Intrinsic::vector_reduce_add: case Intrinsic::vector_reduce_mul: case Intrinsic::vector_reduce_and: diff --git llvm/include/llvm/CodeGen/LiveIntervals.h llvm/include/llvm/CodeGen/LiveIntervals.h index 540651ea1144..708917be497e 100644 --- llvm/include/llvm/CodeGen/LiveIntervals.h +++ llvm/include/llvm/CodeGen/LiveIntervals.h @@ -149,8 +149,9 @@ public: LiveInterval &createEmptyInterval(Register Reg) { assert(!hasInterval(Reg) && "Interval already exists!"); VirtRegIntervals.grow(Reg.id()); - VirtRegIntervals[Reg.id()] = createInterval(Reg); - return *VirtRegIntervals[Reg.id()]; + auto &Interval = VirtRegIntervals[Reg.id()]; + Interval = createInterval(Reg); + return *Interval; } LiveInterval &createAndComputeVirtRegInterval(Register Reg) { @@ -168,8 +169,9 @@ public: /// Interval removal. void removeInterval(Register Reg) { - delete VirtRegIntervals[Reg]; - VirtRegIntervals[Reg] = nullptr; + auto &Interval = VirtRegIntervals[Reg]; + delete Interval; + Interval = nullptr; } /// Given a register and an instruction, adds a live segment from that diff --git llvm/include/llvm/CodeGen/RegisterCoalescerPass.h llvm/include/llvm/CodeGen/RegisterCoalescerPass.h new file mode 100644 index 000000000000..91f66dbf33b5 --- /dev/null +++ llvm/include/llvm/CodeGen/RegisterCoalescerPass.h @@ -0,0 +1,28 @@ +//===- llvm/CodeGen/RegisterCoalescerPass.h ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_REGISTER_COALESCERPASS_H +#define LLVM_CODEGEN_REGISTER_COALESCERPASS_H + +#include "llvm/CodeGen/MachinePassManager.h" + +namespace llvm { +class RegisterCoalescerPass : public PassInfoMixin<RegisterCoalescerPass> { +public: + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); + + MachineFunctionProperties getClearedProperties() const { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::IsSSA); + } +}; + +} // namespace llvm + +#endif // LLVM_CODEGEN_REGISTER_COALESCERPASS_H diff --git llvm/include/llvm/CodeGen/SelectionDAGNodes.h llvm/include/llvm/CodeGen/SelectionDAGNodes.h index 49467ce0a54c..8c1e2fa6f57a 100644 --- llvm/include/llvm/CodeGen/SelectionDAGNodes.h +++ llvm/include/llvm/CodeGen/SelectionDAGNodes.h @@ -3261,13 +3261,16 @@ namespace ISD { template <typename ConstNodeType> bool matchUnaryPredicateImpl(SDValue Op, std::function<bool(ConstNodeType *)> Match, - bool AllowUndefs = false); + bool AllowUndefs = false, + bool AllowTruncation = false); /// Hook for matching ConstantSDNode predicate inline bool matchUnaryPredicate(SDValue Op, std::function<bool(ConstantSDNode *)> Match, - bool AllowUndefs = false) { - return matchUnaryPredicateImpl<ConstantSDNode>(Op, Match, AllowUndefs); + bool AllowUndefs = false, + bool AllowTruncation = false) { + return matchUnaryPredicateImpl<ConstantSDNode>(Op, Match, AllowUndefs, + AllowTruncation); } /// Hook for matching ConstantFPSDNode predicate diff --git llvm/include/llvm/ExecutionEngine/Orc/Core.h llvm/include/llvm/ExecutionEngine/Orc/Core.h index 3eddaf4c9c59..db853362f657 100644 --- llvm/include/llvm/ExecutionEngine/Orc/Core.h +++ llvm/include/llvm/ExecutionEngine/Orc/Core.h @@ -1204,13 +1204,8 @@ private: JITDylib(ExecutionSession &ES, std::string Name); - struct RemoveTrackerResult { - AsynchronousSymbolQuerySet QueriesToFail; - std::shared_ptr<SymbolDependenceMap> FailedSymbols; - std::vector<std::unique_ptr<MaterializationUnit>> DefunctMUs; - }; - - RemoveTrackerResult IL_removeTracker(ResourceTracker &RT); + std::pair<AsynchronousSymbolQuerySet, std::shared_ptr<SymbolDependenceMap>> + IL_removeTracker(ResourceTracker &RT); void transferTracker(ResourceTracker &DstRT, ResourceTracker &SrcRT); diff --git llvm/include/llvm/ExecutionEngine/Orc/ExecutorProcessControl.h llvm/include/llvm/ExecutionEngine/Orc/ExecutorProcessControl.h index 86e98e74b705..dcf5592f1717 100644 --- llvm/include/llvm/ExecutionEngine/Orc/ExecutorProcessControl.h +++ llvm/include/llvm/ExecutionEngine/Orc/ExecutorProcessControl.h @@ -20,7 +20,6 @@ #include "llvm/ExecutionEngine/Orc/Shared/TargetProcessControlTypes.h" #include "llvm/ExecutionEngine/Orc/Shared/WrapperFunctionUtils.h" #include "llvm/ExecutionEngine/Orc/SymbolStringPool.h" -#include "llvm/ExecutionEngine/Orc/TargetProcess/UnwindInfoManager.h" #include "llvm/ExecutionEngine/Orc/TaskDispatch.h" #include "llvm/Support/DynamicLibrary.h" #include "llvm/Support/MSVCErrorWorkarounds.h" @@ -508,9 +507,6 @@ private: SymbolLookupCompleteFn F) override; std::unique_ptr<jitlink::JITLinkMemoryManager> OwnedMemMgr; -#ifdef __APPLE__ - std::unique_ptr<UnwindInfoManager> UnwindInfoMgr; -#endif // __APPLE__ char GlobalManglingPrefix = 0; }; diff --git llvm/include/llvm/ExecutionEngine/Orc/Shared/MachOObjectFormat.h llvm/include/llvm/ExecutionEngine/Orc/Shared/MachOObjectFormat.h index 
b927dfbce992..31d0ecca2080 100644 --- llvm/include/llvm/ExecutionEngine/Orc/Shared/MachOObjectFormat.h +++ llvm/include/llvm/ExecutionEngine/Orc/Shared/MachOObjectFormat.h @@ -25,7 +25,7 @@ namespace orc { extern StringRef MachODataCommonSectionName; extern StringRef MachODataDataSectionName; extern StringRef MachOEHFrameSectionName; -extern StringRef MachOCompactUnwindInfoSectionName; +extern StringRef MachOCompactUnwindSectionName; extern StringRef MachOCStringSectionName; extern StringRef MachOModInitFuncSectionName; extern StringRef MachOObjCCatListSectionName; diff --git llvm/include/llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h llvm/include/llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h index db5ff135a716..aed43f6308cb 100644 --- llvm/include/llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h +++ llvm/include/llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h @@ -88,15 +88,6 @@ using SPSRunAsMainSignature = int64_t(shared::SPSExecutorAddr, using SPSRunAsVoidFunctionSignature = int32_t(shared::SPSExecutorAddr); using SPSRunAsIntFunctionSignature = int32_t(shared::SPSExecutorAddr, int32_t); } // end namespace rt - -namespace rt_alt { -extern const char *UnwindInfoManagerInstanceName; -extern const char *UnwindInfoManagerFindSectionsHelperName; -extern const char *UnwindInfoManagerEnableWrapperName; -extern const char *UnwindInfoManagerDisableWrapperName; -extern const char *UnwindInfoManagerRegisterActionName; -extern const char *UnwindInfoManagerDeregisterActionName; -} // end namespace rt_alt } // end namespace orc } // end namespace llvm diff --git llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/UnwindInfoManager.h llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/UnwindInfoManager.h deleted file mode 100644 index fc7719f28212..000000000000 --- llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/UnwindInfoManager.h +++ /dev/null @@ -1,78 +0,0 @@ -//===--- UnwindInfoManager.h -- Register unwind info sections ---*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Utilities for managing eh-frame and compact-unwind registration and lookup -// through libunwind's find_dynamic_unwind_sections mechanism. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_UNWINDINFOMANAGER_H -#define LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_UNWINDINFOMANAGER_H - -#include "llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h" -#include "llvm/ExecutionEngine/Orc/TargetProcess/ExecutorBootstrapService.h" -#include "llvm/Support/Error.h" -#include <map> -#include <mutex> - -namespace llvm::orc { - -class UnwindInfoManager : public ExecutorBootstrapService { -public: - // This struct's layout should match the unw_dynamic_unwind_sections struct - // from libunwind/src/libunwid_ext.h. - struct UnwindSections { - uintptr_t dso_base; - uintptr_t dwarf_section; - size_t dwarf_section_length; - uintptr_t compact_unwind_section; - size_t compact_unwind_section_length; - }; - - /// If the libunwind find-dynamic-unwind-info callback registration APIs are - /// available then this method will return an UnwindInfoManager instance, - /// otherwise it will return nullptr. 
- static std::unique_ptr<UnwindInfoManager> TryCreate(); - - Error shutdown() override; - void addBootstrapSymbols(StringMap<ExecutorAddr> &M) override; - - Error enable(void *FindDynamicUnwindSections); - Error disable(void); - - Error registerSections(ArrayRef<orc::ExecutorAddrRange> CodeRanges, - orc::ExecutorAddr DSOBase, - orc::ExecutorAddrRange DWARFEHFrame, - orc::ExecutorAddrRange CompactUnwind); - - Error deregisterSections(ArrayRef<orc::ExecutorAddrRange> CodeRanges); - - int findSections(uintptr_t Addr, UnwindSections *Info); - -private: - UnwindInfoManager(int (*AddFindDynamicUnwindSections)(void *), - int (*RemoveFindDynamicUnwindSections)(void *)) - : AddFindDynamicUnwindSections(AddFindDynamicUnwindSections), - RemoveFindDynamicUnwindSections(RemoveFindDynamicUnwindSections) {} - - static int findSectionsHelper(UnwindInfoManager *Instance, uintptr_t Addr, - UnwindSections *Info); - - std::mutex M; - std::map<uintptr_t, UnwindSections> UWSecs; - - int (*AddFindDynamicUnwindSections)(void *) = nullptr; - int (*RemoveFindDynamicUnwindSections)(void *) = nullptr; - void *FindDynamicUnwindSections = nullptr; - - static const char *AddFnName, *RemoveFnName; -}; - -} // namespace llvm::orc - -#endif // LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_UNWINDINFOMANAGER_H diff --git llvm/include/llvm/ExecutionEngine/Orc/UnwindInfoRegistrationPlugin.h llvm/include/llvm/ExecutionEngine/Orc/UnwindInfoRegistrationPlugin.h deleted file mode 100644 index eb883a79a93d..000000000000 --- llvm/include/llvm/ExecutionEngine/Orc/UnwindInfoRegistrationPlugin.h +++ /dev/null @@ -1,70 +0,0 @@ -//===- UnwindInfoRegistrationPlugin.h -- libunwind registration -*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Register eh-frame and compact-unwind sections with libunwind -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_EXECUTIONENGINE_ORC_UNWINDINFOREGISTRATIONPLUGIN_H -#define LLVM_EXECUTIONENGINE_ORC_UNWINDINFOREGISTRATIONPLUGIN_H - -#include "llvm/ExecutionEngine/Orc/LinkGraphLinkingLayer.h" - -namespace llvm::orc { - -class UnwindInfoRegistrationPlugin : public LinkGraphLinkingLayer::Plugin { -public: - static Expected<std::shared_ptr<UnwindInfoRegistrationPlugin>> - Create(IRLayer &IRL, JITDylib &PlatformJD, ExecutorAddr Instance, - ExecutorAddr FindHelper, ExecutorAddr Enable, ExecutorAddr Disable, - ExecutorAddr Register, ExecutorAddr Deregister); - - static Expected<std::shared_ptr<UnwindInfoRegistrationPlugin>> - Create(IRLayer &IRL, JITDylib &PlatformJD); - - ~UnwindInfoRegistrationPlugin(); - - void modifyPassConfig(MaterializationResponsibility &MR, - jitlink::LinkGraph &G, - jitlink::PassConfiguration &PassConfig) override; - - Error notifyEmitted(MaterializationResponsibility &MR) override { - return Error::success(); - } - - Error notifyFailed(MaterializationResponsibility &MR) override { - return Error::success(); - } - - Error notifyRemovingResources(JITDylib &JD, ResourceKey K) override { - return Error::success(); - } - - void notifyTransferringResources(JITDylib &JD, ResourceKey DstKey, - ResourceKey SrcKey) override {} - -private: - UnwindInfoRegistrationPlugin(ExecutionSession &ES, ExecutorAddr Instance, - ExecutorAddr Disable, ExecutorAddr Register, - ExecutorAddr Deregister) - : ES(ES), Instance(Instance), Disable(Disable), Register(Register), - Deregister(Deregister) { - DSOBaseName = ES.intern("__jitlink$libunwind_dso_base"); - } - - static Expected<ThreadSafeModule> makeBouncerModule(ExecutionSession &ES); - Error addUnwindInfoRegistrationActions(jitlink::LinkGraph &G); - - ExecutionSession &ES; - SymbolStringPtr DSOBaseName; - ExecutorAddr Instance, Disable, Register, Deregister; -}; - -} // namespace llvm::orc - -#endif // LLVM_EXECUTIONENGINE_ORC_UNWINDINFOREGISTRATIONPLUGIN_H diff --git llvm/include/llvm/IR/IntrinsicsAMDGPU.td llvm/include/llvm/IR/IntrinsicsAMDGPU.td index f721d5267cd2..eb7bde699949 100644 --- llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -237,7 +237,7 @@ def int_amdgcn_reloc_constant : DefaultAttrsIntrinsic< // the second one is copied to m0 def int_amdgcn_s_sendmsg : ClangBuiltin<"__builtin_amdgcn_s_sendmsg">, Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], - [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>; + [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrWillReturn]>; def int_amdgcn_s_sendmsghalt : ClangBuiltin<"__builtin_amdgcn_s_sendmsghalt">, Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>; @@ -246,7 +246,7 @@ def int_amdgcn_s_sendmsghalt : ClangBuiltin<"__builtin_amdgcn_s_sendmsghalt">, // gfx11 intrinsic // The first parameter is s_sendmsg immediate (i16). Return type is i32 or i64. 
def int_amdgcn_s_sendmsg_rtn : Intrinsic <[llvm_anyint_ty], [llvm_i32_ty], - [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>; + [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrWillReturn]>; // Vanilla workgroup sync-barrier def int_amdgcn_s_barrier : ClangBuiltin<"__builtin_amdgcn_s_barrier">, diff --git llvm/include/llvm/IR/Use.h llvm/include/llvm/IR/Use.h index 64b86f3a4396..a86b9c46c1f6 100644 --- llvm/include/llvm/IR/Use.h +++ llvm/include/llvm/IR/Use.h @@ -11,14 +11,6 @@ /// instruction or some other User instance which refers to a Value. The Use /// class keeps the "use list" of the referenced value up to date. /// -/// Pointer tagging is used to efficiently find the User corresponding to a Use -/// without having to store a User pointer in every Use. A User is preceded in -/// memory by all the Uses corresponding to its operands, and the low bits of -/// one of the fields (Prev) of the Use class are used to encode offsets to be -/// able to find that User given a pointer to any Use. For details, see: -/// -/// http://www.llvm.org/docs/ProgrammersManual.html#UserLayout -/// //===----------------------------------------------------------------------===// #ifndef LLVM_IR_USE_H diff --git llvm/include/llvm/IR/Value.h llvm/include/llvm/IR/Value.h index 011aedece94a..cfed12e2f5f8 100644 --- llvm/include/llvm/IR/Value.h +++ llvm/include/llvm/IR/Value.h @@ -710,6 +710,10 @@ public: /// For example, for a value \p ExternalAnalysis might try to calculate a /// lower bound. If \p ExternalAnalysis is successful, it should return true. /// + /// If \p LookThroughIntToPtr is true then this method also looks through + /// IntToPtr and PtrToInt constant expressions. The returned pointer may not + /// have the same provenance as this value. + /// /// If this is called on a non-pointer value, it returns 'this' and the /// \p Offset is not modified. 
/// @@ -722,17 +726,19 @@ public: const DataLayout &DL, APInt &Offset, bool AllowNonInbounds, bool AllowInvariantGroup = false, function_ref<bool(Value &Value, APInt &Offset)> ExternalAnalysis = - nullptr) const; + nullptr, + bool LookThroughIntToPtr = false) const; Value *stripAndAccumulateConstantOffsets( const DataLayout &DL, APInt &Offset, bool AllowNonInbounds, bool AllowInvariantGroup = false, function_ref<bool(Value &Value, APInt &Offset)> ExternalAnalysis = - nullptr) { + nullptr, + bool LookThroughIntToPtr = false) { return const_cast<Value *>( static_cast<const Value *>(this)->stripAndAccumulateConstantOffsets( - DL, Offset, AllowNonInbounds, AllowInvariantGroup, - ExternalAnalysis)); + DL, Offset, AllowNonInbounds, AllowInvariantGroup, ExternalAnalysis, + LookThroughIntToPtr)); } /// This is a wrapper around stripAndAccumulateConstantOffsets with the diff --git llvm/include/llvm/InitializePasses.h llvm/include/llvm/InitializePasses.h index 8111afcc1fb2..46fcd17347f4 100644 --- llvm/include/llvm/InitializePasses.h +++ llvm/include/llvm/InitializePasses.h @@ -266,7 +266,7 @@ void initializeRegionOnlyPrinterPass(PassRegistry &); void initializeRegionOnlyViewerPass(PassRegistry &); void initializeRegionPrinterPass(PassRegistry &); void initializeRegionViewerPass(PassRegistry &); -void initializeRegisterCoalescerPass(PassRegistry &); +void initializeRegisterCoalescerLegacyPass(PassRegistry &); void initializeRemoveLoadsIntoFakeUsesPass(PassRegistry &); void initializeRemoveRedundantDebugValuesPass(PassRegistry &); void initializeRenameIndependentSubregsPass(PassRegistry &); diff --git llvm/include/llvm/MC/MCStreamer.h llvm/include/llvm/MC/MCStreamer.h index 558b14cebfd3..bf1bbd888591 100644 --- llvm/include/llvm/MC/MCStreamer.h +++ llvm/include/llvm/MC/MCStreamer.h @@ -252,6 +252,12 @@ class MCStreamer { bool AllowAutoPadding = false; protected: + // True if we are processing SEH directives in an epilogue. + bool InEpilogCFI = false; + + // Symbol of the current epilog for which we are processing SEH directives. + MCSymbol *CurrentEpilog = nullptr; + MCFragment *CurFrag = nullptr; MCStreamer(MCContext &Ctx); @@ -333,6 +339,10 @@ public: return WinFrameInfos; } + MCSymbol *getCurrentEpilog() const { return CurrentEpilog; } + + bool isInEpilogCFI() const { return InEpilogCFI; } + void generateCompactUnwindEncodings(MCAsmBackend *MAB); /// \name Assembly File Formatting. 
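The two MCStreamer hunks above add epilogue-tracking state (InEpilogCFI, CurrentEpilog) and const accessors for it; a later hunk in this file adds the emitWinCFIBeginEpilogue/emitWinCFIEndEpilogue callbacks that open and close that state. A minimal sketch of how a consumer might branch on the new accessors follows; the helper function, its name, and the idea of a per-epilogue table are assumptions for illustration, not part of this patch.

#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"

// Hypothetical helper: route an SEH unwind record either to the prologue
// path or to the epilogue that is currently open on the streamer.
static void routeUnwindRecord(llvm::MCStreamer &Streamer) {
  if (Streamer.isInEpilogCFI()) {
    // Between the begin/end epilogue callbacks, getCurrentEpilog() names
    // the epilogue being described; it could key a per-epilogue table of
    // unwind operations (the table itself is hypothetical).
    const llvm::MCSymbol *Epilog = Streamer.getCurrentEpilog();
    (void)Epilog;
  } else {
    // Prologue or function-body directives take the pre-existing path.
  }
}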
@@ -1056,6 +1066,8 @@ public: SMLoc Loc = SMLoc()); virtual void emitWinCFIPushFrame(bool Code, SMLoc Loc = SMLoc()); virtual void emitWinCFIEndProlog(SMLoc Loc = SMLoc()); + virtual void emitWinCFIBeginEpilogue(SMLoc Loc = SMLoc()); + virtual void emitWinCFIEndEpilogue(SMLoc Loc = SMLoc()); virtual void emitWinEHHandler(const MCSymbol *Sym, bool Unwind, bool Except, SMLoc Loc = SMLoc()); virtual void emitWinEHHandlerData(SMLoc Loc = SMLoc()); diff --git llvm/include/llvm/Passes/CodeGenPassBuilder.h llvm/include/llvm/Passes/CodeGenPassBuilder.h index a84164bed46c..9681368249a0 100644 --- llvm/include/llvm/Passes/CodeGenPassBuilder.h +++ llvm/include/llvm/Passes/CodeGenPassBuilder.h @@ -57,6 +57,7 @@ #include "llvm/CodeGen/RegAllocFast.h" #include "llvm/CodeGen/RegUsageInfoCollector.h" #include "llvm/CodeGen/RegUsageInfoPropagate.h" +#include "llvm/CodeGen/RegisterCoalescerPass.h" #include "llvm/CodeGen/RegisterUsageInfo.h" #include "llvm/CodeGen/ReplaceWithVeclib.h" #include "llvm/CodeGen/SafeStack.h" diff --git llvm/include/llvm/Passes/MachinePassRegistry.def llvm/include/llvm/Passes/MachinePassRegistry.def index dfe3514360c3..1d978f2ea312 100644 --- llvm/include/llvm/Passes/MachinePassRegistry.def +++ llvm/include/llvm/Passes/MachinePassRegistry.def @@ -164,6 +164,7 @@ MACHINE_FUNCTION_PASS("print<slot-indexes>", SlotIndexesPrinterPass(errs())) MACHINE_FUNCTION_PASS("print<virtregmap>", VirtRegMapPrinterPass(errs())) MACHINE_FUNCTION_PASS("reg-usage-collector", RegUsageInfoCollectorPass()) MACHINE_FUNCTION_PASS("reg-usage-propagation", RegUsageInfoPropagationPass()) +MACHINE_FUNCTION_PASS("register-coalescer", RegisterCoalescerPass()) MACHINE_FUNCTION_PASS("require-all-machine-function-properties", RequireAllMachineFunctionPropertiesPass()) MACHINE_FUNCTION_PASS("stack-coloring", StackColoringPass()) @@ -265,7 +266,6 @@ DUMMY_MACHINE_FUNCTION_PASS("removeredundantdebugvalues", RemoveRedundantDebugVa DUMMY_MACHINE_FUNCTION_PASS("rename-independent-subregs", RenameIndependentSubregsPass) DUMMY_MACHINE_FUNCTION_PASS("reset-machine-function", ResetMachineFunctionPass) DUMMY_MACHINE_FUNCTION_PASS("shrink-wrap", ShrinkWrapPass) -DUMMY_MACHINE_FUNCTION_PASS("simple-register-coalescing", RegisterCoalescerPass) DUMMY_MACHINE_FUNCTION_PASS("stack-frame-layout", StackFrameLayoutAnalysisPass) DUMMY_MACHINE_FUNCTION_PASS("stack-slot-coloring", StackSlotColoringPass) DUMMY_MACHINE_FUNCTION_PASS("stackmap-liveness", StackMapLivenessPass) diff --git llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h index 5a20a9ef6328..4fc013344319 100644 --- llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h +++ llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h @@ -541,9 +541,9 @@ public: std::string getConditionHeaderString(unsigned Condition) { std::ostringstream OS; - OS << "Condition C" << Condition + 1 << " --> ("; - OS << CondLoc[Condition].first << ":" << CondLoc[Condition].second; - OS << ")\n"; + const auto &[Line, Col] = CondLoc[Condition]; + OS << "Condition C" << Condition + 1 << " --> (" << Line << ":" << Col + << ")\n"; return OS.str(); } diff --git llvm/include/llvm/TableGen/Record.h llvm/include/llvm/TableGen/Record.h index d9930a48e808..e04ed3482314 100644 --- llvm/include/llvm/TableGen/Record.h +++ llvm/include/llvm/TableGen/Record.h @@ -1523,7 +1523,7 @@ private: bool IsUsed = false; /// Reference locations to this record value. 
- SmallVector<SMRange> ReferenceLocs; + SmallVector<SMRange, 0> ReferenceLocs; public: RecordVal(const Init *N, const RecTy *T, FieldKind K); diff --git llvm/include/llvm/TargetParser/Triple.h llvm/include/llvm/TargetParser/Triple.h index 7d67966d1725..09c0d223d9b4 100644 --- llvm/include/llvm/TargetParser/Triple.h +++ llvm/include/llvm/TargetParser/Triple.h @@ -1123,9 +1123,10 @@ public: isWindowsCygwinEnvironment() || isOHOSFamily(); } - /// True if the target supports both general-dynamic and TLSDESC, and TLSDESC - /// is enabled by default. - bool hasDefaultTLSDESC() const { return isAndroid() && isRISCV64(); } + /// True if the target uses TLSDESC by default. + bool hasDefaultTLSDESC() const { + return isAArch64() || (isAndroid() && isRISCV64()) || isOSFuchsia(); + } /// Tests whether the target uses -data-sections as default. bool hasDefaultDataSections() const { diff --git llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h index 60255d573640..8b7daf616b11 100644 --- llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h +++ llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h @@ -91,7 +91,6 @@ private: uint64_t getAllocaSizeInBytes(const AllocaInst &AI); void alignAndPadAlloca(memtag::AllocaInfo &Info, llvm::Align Align); -bool isLifetimeIntrinsic(Value *V); Value *readRegister(IRBuilder<> &IRB, StringRef Name); Value *getFP(IRBuilder<> &IRB); diff --git llvm/include/llvm/Transforms/Utils/VNCoercion.h llvm/include/llvm/Transforms/Utils/VNCoercion.h index f1ea94bf60fc..ed4dbad50ee8 100644 --- llvm/include/llvm/Transforms/Utils/VNCoercion.h +++ llvm/include/llvm/Transforms/Utils/VNCoercion.h @@ -23,6 +23,7 @@ namespace llvm { class Constant; +class Function; class StoreInst; class LoadInst; class MemIntrinsic; @@ -35,7 +36,7 @@ namespace VNCoercion { /// Return true if CoerceAvailableValueToLoadType would succeed if it was /// called. bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy, - const DataLayout &DL); + Function *F); /// If we saw a store of a value to memory, and then a load from a must-aliased /// pointer of a different type, try to coerce the stored value to the loaded @@ -44,7 +45,7 @@ bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy, /// /// If we can't do it, return null. Value *coerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy, - IRBuilderBase &IRB, const DataLayout &DL); + IRBuilderBase &IRB, Function *F); /// This function determines whether a value for the pointer LoadPtr can be /// extracted from the store at DepSI. @@ -75,7 +76,7 @@ int analyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr, /// It inserts instructions to do so at InsertPt, and returns the extracted /// value. Value *getValueForLoad(Value *SrcVal, unsigned Offset, Type *LoadTy, - Instruction *InsertPt, const DataLayout &DL); + Instruction *InsertPt, Function *F); // This is the same as getValueForLoad, except it performs no insertion. // It only allows constant inputs. 
Constant *getConstantValueForLoad(Constant *SrcVal, unsigned Offset, diff --git llvm/lib/Analysis/BasicAliasAnalysis.cpp llvm/lib/Analysis/BasicAliasAnalysis.cpp index b2a3f3390e00..85b8253408eb 100644 --- llvm/lib/Analysis/BasicAliasAnalysis.cpp +++ llvm/lib/Analysis/BasicAliasAnalysis.cpp @@ -1349,8 +1349,10 @@ AliasResult BasicAAResult::aliasGEP( const VariableGEPIndex &Var1 = DecompGEP1.VarIndices[1]; if (Var0.hasNegatedScaleOf(Var1) && Var0.Val.TruncBits == 0 && Var0.Val.hasSameCastsAs(Var1.Val) && !AAQI.MayBeCrossIteration && - isKnownNonEqual(Var0.Val.V, Var1.Val.V, DL, &AC, /* CxtI */ nullptr, - DT)) + isKnownNonEqual(Var0.Val.V, Var1.Val.V, + SimplifyQuery(DL, DT, &AC, /*CxtI=*/Var0.CxtI + ? Var0.CxtI + : Var1.CxtI))) MinAbsVarIndex = Var0.Scale.abs(); } diff --git llvm/lib/Analysis/CaptureTracking.cpp llvm/lib/Analysis/CaptureTracking.cpp index a353842eb809..49baf2eb84bb 100644 --- llvm/lib/Analysis/CaptureTracking.cpp +++ llvm/lib/Analysis/CaptureTracking.cpp @@ -318,8 +318,8 @@ UseCaptureKind llvm::DetermineUseCaptureKind( return UseCaptureKind::NO_CAPTURE; // Not captured if only passed via 'nocapture' arguments. - if (Call->isDataOperand(&U) && - !Call->doesNotCapture(Call->getDataOperandNo(&U))) { + assert(Call->isDataOperand(&U) && "Non-callee must be data operand"); + if (!Call->doesNotCapture(Call->getDataOperandNo(&U))) { // The parameter is not marked 'nocapture' - captured. return UseCaptureKind::MAY_CAPTURE; } diff --git llvm/lib/Analysis/CmpInstAnalysis.cpp llvm/lib/Analysis/CmpInstAnalysis.cpp index 3599428c5ff4..5c0d1dd1c74b 100644 --- llvm/lib/Analysis/CmpInstAnalysis.cpp +++ llvm/lib/Analysis/CmpInstAnalysis.cpp @@ -168,6 +168,7 @@ llvm::decomposeBitTestICmp(Value *LHS, Value *RHS, CmpInst::Predicate Pred, std::optional<DecomposedBitTest> llvm::decomposeBitTest(Value *Cond, bool LookThruTrunc, bool AllowNonZeroC) { + using namespace PatternMatch; if (auto *ICmp = dyn_cast<ICmpInst>(Cond)) { // Don't allow pointers. Splat vectors are fine. if (!ICmp->getOperand(0)->getType()->isIntOrIntVectorTy()) @@ -176,6 +177,19 @@ llvm::decomposeBitTest(Value *Cond, bool LookThruTrunc, bool AllowNonZeroC) { ICmp->getPredicate(), LookThruTrunc, AllowNonZeroC); } + Value *X; + if (Cond->getType()->isIntOrIntVectorTy(1) && + (match(Cond, m_Trunc(m_Value(X))) || + match(Cond, m_Not(m_Trunc(m_Value(X)))))) { + DecomposedBitTest Result; + Result.X = X; + unsigned BitWidth = X->getType()->getScalarSizeInBits(); + Result.Mask = APInt(BitWidth, 1); + Result.C = APInt::getZero(BitWidth); + Result.Pred = isa<TruncInst>(Cond) ? 
ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ; + + return Result; + } return std::nullopt; } diff --git llvm/lib/Analysis/ConstantFolding.cpp llvm/lib/Analysis/ConstantFolding.cpp index 80c1277e6316..d645bf8f7b62 100644 --- llvm/lib/Analysis/ConstantFolding.cpp +++ llvm/lib/Analysis/ConstantFolding.cpp @@ -1258,11 +1258,16 @@ Constant *llvm::ConstantFoldCompareInstOperands( if (Ops0->getType()->isPointerTy() && !ICmpInst::isSigned(Predicate)) { unsigned IndexWidth = DL.getIndexTypeSizeInBits(Ops0->getType()); APInt Offset0(IndexWidth, 0); - Value *Stripped0 = - Ops0->stripAndAccumulateInBoundsConstantOffsets(DL, Offset0); + bool IsEqPred = ICmpInst::isEquality(Predicate); + Value *Stripped0 = Ops0->stripAndAccumulateConstantOffsets( + DL, Offset0, /*AllowNonInbounds=*/IsEqPred, + /*AllowInvariantGroup=*/false, /*ExternalAnalysis=*/nullptr, + /*LookThroughIntToPtr=*/IsEqPred); APInt Offset1(IndexWidth, 0); - Value *Stripped1 = - Ops1->stripAndAccumulateInBoundsConstantOffsets(DL, Offset1); + Value *Stripped1 = Ops1->stripAndAccumulateConstantOffsets( + DL, Offset1, /*AllowNonInbounds=*/IsEqPred, + /*AllowInvariantGroup=*/false, /*ExternalAnalysis=*/nullptr, + /*LookThroughIntToPtr=*/IsEqPred); if (Stripped0 == Stripped1) return ConstantInt::getBool( Ops0->getContext(), diff --git llvm/lib/Analysis/InstructionSimplify.cpp llvm/lib/Analysis/InstructionSimplify.cpp index d69747e30f88..3cbc4107433e 100644 --- llvm/lib/Analysis/InstructionSimplify.cpp +++ llvm/lib/Analysis/InstructionSimplify.cpp @@ -27,6 +27,7 @@ #include "llvm/Analysis/CmpInstAnalysis.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstSimplifyFolder.h" +#include "llvm/Analysis/Loads.h" #include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/OverflowInstAnalysis.h" @@ -3994,7 +3995,7 @@ static Value *simplifyICmpInst(CmpPredicate Pred, Value *LHS, Value *RHS, // This is potentially expensive, and we have already computedKnownBits for // compares with 0 above here, so only try this for a non-zero compare. if (ICmpInst::isEquality(Pred) && !match(RHS, m_Zero()) && - isKnownNonEqual(LHS, RHS, Q.DL, Q.AC, Q.CxtI, Q.DT, Q.IIQ.UseInstrInfo)) { + isKnownNonEqual(LHS, RHS, Q)) { return Pred == ICmpInst::ICMP_NE ? getTrue(ITy) : getFalse(ITy); } @@ -4612,12 +4613,11 @@ static Value *simplifyCmpSelOfMaxMin(Value *CmpLHS, Value *CmpRHS, return nullptr; } -/// An alternative way to test if a bit is set or not uses sgt/slt instead of -/// eq/ne. -static Value *simplifySelectWithFakeICmpEq(Value *CmpLHS, Value *CmpRHS, - CmpPredicate Pred, Value *TrueVal, - Value *FalseVal) { - if (auto Res = decomposeBitTestICmp(CmpLHS, CmpRHS, Pred)) +/// An alternative way to test if a bit is set or not, +/// using e.g. sgt/slt or trunc instead of eq/ne. +static Value *simplifySelectWithBitTest(Value *CondVal, Value *TrueVal, + Value *FalseVal) { + if (auto Res = decomposeBitTest(CondVal)) return simplifySelectBitTest(TrueVal, FalseVal, Res->X, &Res->Mask, Res->Pred == ICmpInst::ICMP_EQ); @@ -4728,21 +4728,20 @@ static Value *simplifySelectWithICmpCond(Value *CondVal, Value *TrueVal, return FalseVal; } - // Check for other compares that behave like bit test. - if (Value *V = - simplifySelectWithFakeICmpEq(CmpLHS, CmpRHS, Pred, TrueVal, FalseVal)) - return V; - // If we have a scalar equality comparison, then we know the value in one of // the arms of the select. See if substituting this value into the arm and // simplifying the result yields the same value as the other arm.
if (Pred == ICmpInst::ICMP_EQ) { - if (Value *V = simplifySelectWithEquivalence({{CmpLHS, CmpRHS}}, TrueVal, - FalseVal, Q, MaxRecurse)) - return V; - if (Value *V = simplifySelectWithEquivalence({{CmpRHS, CmpLHS}}, TrueVal, - FalseVal, Q, MaxRecurse)) - return V; + if (CmpLHS->getType()->isIntOrIntVectorTy() || + canReplacePointersIfEqual(CmpLHS, CmpRHS, Q.DL)) + if (Value *V = simplifySelectWithEquivalence({{CmpLHS, CmpRHS}}, TrueVal, + FalseVal, Q, MaxRecurse)) + return V; + if (CmpLHS->getType()->isIntOrIntVectorTy() || + canReplacePointersIfEqual(CmpRHS, CmpLHS, Q.DL)) + if (Value *V = simplifySelectWithEquivalence({{CmpRHS, CmpLHS}}, TrueVal, + FalseVal, Q, MaxRecurse)) + return V; Value *X; Value *Y; @@ -4984,6 +4983,9 @@ static Value *simplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal, simplifySelectWithICmpCond(Cond, TrueVal, FalseVal, Q, MaxRecurse)) return V; + if (Value *V = simplifySelectWithBitTest(Cond, TrueVal, FalseVal)) + return V; + if (Value *V = simplifySelectWithFCmp(Cond, TrueVal, FalseVal, Q, MaxRecurse)) return V; diff --git llvm/lib/Analysis/LazyValueInfo.cpp llvm/lib/Analysis/LazyValueInfo.cpp index 20f69a0955f5..5cd179df436d 100644 --- llvm/lib/Analysis/LazyValueInfo.cpp +++ llvm/lib/Analysis/LazyValueInfo.cpp @@ -398,6 +398,8 @@ class LazyValueInfoImpl { std::optional<ValueLatticeElement> getValueFromICmpCondition(Value *Val, ICmpInst *ICI, bool isTrueDest, bool UseBlockValue); + ValueLatticeElement getValueFromTrunc(Value *Val, TruncInst *Trunc, + bool IsTrueDest); std::optional<ValueLatticeElement> getValueFromCondition(Value *Val, Value *Cond, bool IsTrueDest, @@ -622,10 +624,12 @@ LazyValueInfoImpl::solveBlockValueImpl(Value *Val, BasicBlock *BB) { return getFromRangeMetadata(BBI); } -static void AddNonNullPointer(Value *Ptr, NonNullPointerSet &PtrSet) { +static void AddNonNullPointer(Value *Ptr, NonNullPointerSet &PtrSet, + bool IsDereferenced = true) { // TODO: Use NullPointerIsDefined instead. if (Ptr->getType()->getPointerAddressSpace() == 0) - PtrSet.insert(getUnderlyingObject(Ptr)); + PtrSet.insert(IsDereferenced ? getUnderlyingObject(Ptr) + : Ptr->stripInBoundsOffsets()); } static void AddNonNullPointersByInstruction( @@ -644,6 +648,13 @@ static void AddNonNullPointersByInstruction( AddNonNullPointer(MI->getRawDest(), PtrSet); if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) AddNonNullPointer(MTI->getRawSource(), PtrSet); + } else if (auto *CB = dyn_cast<CallBase>(I)) { + for (auto &U : CB->args()) { + if (U->getType()->isPointerTy() && + CB->paramHasNonNullAttr(CB->getArgOperandNo(&U), + /*AllowUndefOrPoison=*/false)) + AddNonNullPointer(U.get(), PtrSet, /*IsDereferenced=*/false); + } } } @@ -1283,6 +1294,27 @@ std::optional<ValueLatticeElement> LazyValueInfoImpl::getValueFromICmpCondition( return ValueLatticeElement::getOverdefined(); } +ValueLatticeElement LazyValueInfoImpl::getValueFromTrunc(Value *Val, + TruncInst *Trunc, + bool IsTrueDest) { + assert(Trunc->getType()->isIntOrIntVectorTy(1)); + + if (Trunc->getOperand(0) != Val) + return ValueLatticeElement::getOverdefined(); + + Type *Ty = Val->getType(); + + if (Trunc->hasNoUnsignedWrap()) { + if (IsTrueDest) + return ValueLatticeElement::get(ConstantInt::get(Ty, 1)); + return ValueLatticeElement::get(Constant::getNullValue(Ty)); + } + + if (IsTrueDest) + return ValueLatticeElement::getNot(Constant::getNullValue(Ty)); + return ValueLatticeElement::getNot(Constant::getAllOnesValue(Ty)); +} + // Handle conditions of the form // extractvalue(op.with.overflow(%x, C), 1). 
static ValueLatticeElement getValueFromOverflowCondition( @@ -1312,6 +1344,9 @@ LazyValueInfoImpl::getValueFromCondition(Value *Val, Value *Cond, if (ICmpInst *ICI = dyn_cast<ICmpInst>(Cond)) return getValueFromICmpCondition(Val, ICI, IsTrueDest, UseBlockValue); + if (auto *Trunc = dyn_cast<TruncInst>(Cond)) + return getValueFromTrunc(Val, Trunc, IsTrueDest); + if (auto *EVI = dyn_cast<ExtractValueInst>(Cond)) if (auto *WO = dyn_cast<WithOverflowInst>(EVI->getAggregateOperand())) if (EVI->getNumIndices() == 1 && *EVI->idx_begin() == 1) diff --git llvm/lib/Analysis/LoopAccessAnalysis.cpp llvm/lib/Analysis/LoopAccessAnalysis.cpp index 697b40403902..3202ba81be78 100644 --- llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -2859,7 +2859,7 @@ static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, Loop *Lp) { /// strides "a[i*stride]". Returns the symbolic stride, or null otherwise. static const SCEV *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp) { auto *PtrTy = dyn_cast<PointerType>(Ptr->getType()); - if (!PtrTy || PtrTy->isAggregateType()) + if (!PtrTy) return nullptr; // Try to remove a gep instruction to make the pointer (actually index at this @@ -2867,18 +2867,15 @@ static const SCEV *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *L // pointer, otherwise, we are analyzing the index. Value *OrigPtr = Ptr; - // The size of the pointer access. - int64_t PtrAccessSize = 1; - Ptr = stripGetElementPtr(Ptr, SE, Lp); const SCEV *V = SE->getSCEV(Ptr); if (Ptr != OrigPtr) // Strip off casts. - while (const SCEVIntegralCastExpr *C = dyn_cast<SCEVIntegralCastExpr>(V)) + while (auto *C = dyn_cast<SCEVIntegralCastExpr>(V)) V = C->getOperand(); - const SCEVAddRecExpr *S = dyn_cast<SCEVAddRecExpr>(V); + auto *S = dyn_cast<SCEVAddRecExpr>(V); if (!S) return nullptr; @@ -2888,25 +2885,20 @@ static const SCEV *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *L return nullptr; V = S->getStepRecurrence(*SE); - if (!V) - return nullptr; // Strip off the size of access multiplication if we are still analyzing the // pointer. if (OrigPtr == Ptr) { - if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(V)) { - if (M->getOperand(0)->getSCEVType() != scConstant) + if (auto *M = dyn_cast<SCEVMulExpr>(V)) { + auto *StepConst = dyn_cast<SCEVConstant>(M->getOperand(0)); + if (!StepConst) return nullptr; - const APInt &APStepVal = cast<SCEVConstant>(M->getOperand(0))->getAPInt(); - - // Huge step value - give up. - if (APStepVal.getBitWidth() > 64) + auto StepVal = StepConst->getAPInt().trySExtValue(); + // Bail out on a non-unit pointer access size. + if (!StepVal || StepVal != 1) return nullptr; - int64_t StepVal = APStepVal.getSExtValue(); - if (PtrAccessSize != StepVal) - return nullptr; V = M->getOperand(1); } } @@ -2920,7 +2912,7 @@ static const SCEV *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *L if (isa<SCEVUnknown>(V)) return V; - if (const auto *C = dyn_cast<SCEVIntegralCastExpr>(V)) + if (auto *C = dyn_cast<SCEVIntegralCastExpr>(V)) if (isa<SCEVUnknown>(C->getOperand())) return V; @@ -3086,7 +3078,6 @@ const LoopAccessInfo &LoopAccessInfoManager::getInfo(Loop &L) { return *It->second; } void LoopAccessInfoManager::clear() { - SmallVector<Loop *> ToRemove; // Collect LoopAccessInfo entries that may keep references to IR outside the // analyzed loop or SCEVs that may have been modified or invalidated. 
At the // moment, that is loops requiring memory or SCEV runtime checks, as those cache @@ -3095,11 +3086,8 @@ void LoopAccessInfoManager::clear() { if (LAI->getRuntimePointerChecking()->getChecks().empty() && LAI->getPSE().getPredicate().isAlwaysTrue()) continue; - ToRemove.push_back(L); - } - - for (Loop *L : ToRemove) LoopAccessInfoMap.erase(L); + } } bool LoopAccessInfoManager::invalidate( diff --git llvm/lib/Analysis/ScalarEvolution.cpp llvm/lib/Analysis/ScalarEvolution.cpp index 2ce40877b523..0d7bbe3f9964 100644 --- llvm/lib/Analysis/ScalarEvolution.cpp +++ llvm/lib/Analysis/ScalarEvolution.cpp @@ -11165,9 +11165,8 @@ ScalarEvolution::getMonotonicPredicateTypeImpl(const SCEVAddRecExpr *LHS, } std::optional<ScalarEvolution::LoopInvariantPredicate> -ScalarEvolution::getLoopInvariantPredicate(ICmpInst::Predicate Pred, - const SCEV *LHS, const SCEV *RHS, - const Loop *L, +ScalarEvolution::getLoopInvariantPredicate(CmpPredicate Pred, const SCEV *LHS, + const SCEV *RHS, const Loop *L, const Instruction *CtxI) { // If there is a loop-invariant, force it into the RHS, otherwise bail out. if (!isLoopInvariant(RHS, L)) { @@ -11175,7 +11174,7 @@ ScalarEvolution::getLoopInvariantPredicate(ICmpInst::Predicate Pred, return std::nullopt; std::swap(LHS, RHS); - Pred = ICmpInst::getSwappedPredicate(Pred); + Pred = ICmpInst::getSwappedCmpPredicate(Pred); } const SCEVAddRecExpr *ArLHS = dyn_cast<SCEVAddRecExpr>(LHS); @@ -11203,7 +11202,7 @@ ScalarEvolution::getLoopInvariantPredicate(ICmpInst::Predicate Pred, // A similar reasoning applies for a monotonically decreasing predicate, by // replacing true with false and false with true in the above two bullets. bool Increasing = *MonotonicType == ScalarEvolution::MonotonicallyIncreasing; - auto P = Increasing ? Pred : ICmpInst::getInversePredicate(Pred); + auto P = Increasing ? Pred : ICmpInst::getInverseCmpPredicate(Pred); if (isLoopBackedgeGuardedByCond(L, P, LHS, RHS)) return ScalarEvolution::LoopInvariantPredicate(Pred, ArLHS->getStart(), diff --git llvm/lib/Analysis/ValueTracking.cpp llvm/lib/Analysis/ValueTracking.cpp index 45c3b85ea39f..6b61a3546e8b 100644 --- llvm/lib/Analysis/ValueTracking.cpp +++ llvm/lib/Analysis/ValueTracking.cpp @@ -113,24 +113,6 @@ static const Instruction *safeCxtI(const Value *V, const Instruction *CxtI) { return nullptr; } -static const Instruction *safeCxtI(const Value *V1, const Value *V2, const Instruction *CxtI) { - // If we've been provided with a context instruction, then use that (provided - // it has been inserted). - if (CxtI && CxtI->getParent()) - return CxtI; - - // If the value is really an already-inserted instruction, then use that. - CxtI = dyn_cast<Instruction>(V1); - if (CxtI && CxtI->getParent()) - return CxtI; - - CxtI = dyn_cast<Instruction>(V2); - if (CxtI && CxtI->getParent()) - return CxtI; - - return nullptr; -} - static bool getShuffleDemandedElts(const ShuffleVectorInst *Shuf, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS) { @@ -316,18 +298,14 @@ static bool isKnownNonEqual(const Value *V1, const Value *V2, const SimplifyQuery &Q); bool llvm::isKnownNonEqual(const Value *V1, const Value *V2, - const DataLayout &DL, AssumptionCache *AC, - const Instruction *CxtI, const DominatorTree *DT, - bool UseInstrInfo) { + const SimplifyQuery &Q, unsigned Depth) { // We don't support looking through casts. if (V1 == V2 || V1->getType() != V2->getType()) return false; auto *FVTy = dyn_cast<FixedVectorType>(V1->getType()); APInt DemandedElts = FVTy ? 
APInt::getAllOnes(FVTy->getNumElements()) : APInt(1, 1); - return ::isKnownNonEqual( - V1, V2, DemandedElts, 0, - SimplifyQuery(DL, DT, AC, safeCxtI(V2, V1, CxtI), UseInstrInfo)); + return ::isKnownNonEqual(V1, V2, DemandedElts, Depth, Q); } bool llvm::MaskedValueIsZero(const Value *V, const APInt &Mask, @@ -798,7 +776,10 @@ static void computeKnownBitsFromICmpCond(const Value *V, ICmpInst *Cmp, if (match(LHS, m_Trunc(m_Specific(V)))) { KnownBits DstKnown(LHS->getType()->getScalarSizeInBits()); computeKnownBitsFromCmp(LHS, Pred, LHS, RHS, DstKnown, SQ); - Known = Known.unionWith(DstKnown.anyext(Known.getBitWidth())); + if (cast<TruncInst>(LHS)->hasNoUnsignedWrap()) + Known = Known.unionWith(DstKnown.zext(Known.getBitWidth())); + else + Known = Known.unionWith(DstKnown.anyext(Known.getBitWidth())); return; } diff --git llvm/lib/CMakeLists.txt llvm/lib/CMakeLists.txt index f6465612d30c..d0a2bc929438 100644 --- llvm/lib/CMakeLists.txt +++ llvm/lib/CMakeLists.txt @@ -41,7 +41,9 @@ add_subdirectory(ProfileData) add_subdirectory(Passes) add_subdirectory(TargetParser) add_subdirectory(TextAPI) -add_subdirectory(Telemetry) +if (LLVM_BUILD_TELEMETRY) + add_subdirectory(Telemetry) +endif() add_subdirectory(ToolDrivers) add_subdirectory(XRay) if (LLVM_INCLUDE_TESTS) diff --git llvm/lib/CodeGen/CFIFixup.cpp llvm/lib/CodeGen/CFIFixup.cpp index 02152a136a22..7986f7d21345 100644 --- llvm/lib/CodeGen/CFIFixup.cpp +++ llvm/lib/CodeGen/CFIFixup.cpp @@ -70,6 +70,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/iterator_range.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" @@ -122,6 +123,62 @@ findPrologueEnd(MachineFunction &MF, MachineBasicBlock::iterator &PrologueEnd) { return nullptr; } +// Represents a basic block's relationship to the call frame. This metadata +// reflects what the state *should* be, which may differ from the actual state +// after final machine basic block layout. +struct BlockFlags { + bool Reachable : 1; + bool StrongNoFrameOnEntry : 1; + bool HasFrameOnEntry : 1; + bool HasFrameOnExit : 1; + BlockFlags() + : Reachable(false), StrongNoFrameOnEntry(false), HasFrameOnEntry(false), + HasFrameOnExit(false) {} +}; + +// Most functions will have <= 32 basic blocks. +using BlockFlagsVector = SmallVector<BlockFlags, 32>; + +// Computes the frame information for each block in the function. Frame info +// for a block is inferred from its predecessors. +static BlockFlagsVector +computeBlockInfo(const MachineFunction &MF, + const MachineBasicBlock *PrologueBlock) { + BlockFlagsVector BlockInfo(MF.getNumBlockIDs()); + BlockInfo[0].Reachable = true; + BlockInfo[0].StrongNoFrameOnEntry = true; + + // Compute the presence/absence of frame at each basic block. + ReversePostOrderTraversal<const MachineBasicBlock *> RPOT(&*MF.begin()); + for (const MachineBasicBlock *MBB : RPOT) { + BlockFlags &Info = BlockInfo[MBB->getNumber()]; + + // Set to true if the current block contains the prologue or the epilogue, + // respectively. + bool HasPrologue = MBB == PrologueBlock; + bool HasEpilogue = false; + + if (Info.HasFrameOnEntry || HasPrologue) + HasEpilogue = containsEpilogue(*MBB); + + // If the function has a call frame at the entry of the current block or the + // current block contains the prologue, then the function has a call frame + // at the exit of the block, unless the block contains the epilogue. 
+ Info.HasFrameOnExit = (Info.HasFrameOnEntry || HasPrologue) && !HasEpilogue; + + // Set the successors' state on entry. + for (MachineBasicBlock *Succ : MBB->successors()) { + BlockFlags &SuccInfo = BlockInfo[Succ->getNumber()]; + SuccInfo.Reachable = true; + SuccInfo.StrongNoFrameOnEntry |= + Info.StrongNoFrameOnEntry && !HasPrologue; + SuccInfo.HasFrameOnEntry = Info.HasFrameOnExit; + } + } + + return BlockInfo; +} + // Represents the point within a basic block where we can insert an instruction. // Note that we need the MachineBasicBlock* as well as the iterator since the // iterator can point to the end of the block. Instructions are inserted @@ -181,13 +238,69 @@ static InsertionPoint cloneCfiPrologue(const InsertionPoint &PrologueEnd, return DstInsertPt; } -bool CFIFixup::runOnMachineFunction(MachineFunction &MF) { +// Fixes up the CFI instructions in a basic block to be consistent with the +// intended frame state, adding or removing CFI instructions as necessary. +// Returns true if a change was made and false otherwise. +static bool +fixupBlock(MachineBasicBlock &CurrBB, const BlockFlagsVector &BlockInfo, + SmallDenseMap<MBBSectionID, InsertionPoint> &InsertionPts, + const InsertionPoint &Prologue) { + const MachineFunction &MF = *CurrBB.getParent(); const TargetFrameLowering &TFL = *MF.getSubtarget().getFrameLowering(); - if (!TFL.enableCFIFixup(MF)) + const BlockFlags &Info = BlockInfo[CurrBB.getNumber()]; + + if (!Info.Reachable) return false; - const unsigned NumBlocks = MF.getNumBlockIDs(); - if (NumBlocks < 2) + // If the previous block and the current block are in the same section, + // the frame info will propagate from the previous block to the current one. + const BlockFlags &PrevInfo = + BlockInfo[std::prev(CurrBB.getIterator())->getNumber()]; + bool HasFrame = PrevInfo.HasFrameOnExit && !CurrBB.isBeginSection(); + bool NeedsFrame = Info.HasFrameOnEntry && !Info.StrongNoFrameOnEntry; + +#ifndef NDEBUG + if (!Info.StrongNoFrameOnEntry) { + for (auto *Pred : CurrBB.predecessors()) { + const BlockFlags &PredInfo = BlockInfo[Pred->getNumber()]; + assert((!PredInfo.Reachable || + Info.HasFrameOnEntry == PredInfo.HasFrameOnExit) && + "Inconsistent call frame state"); + } + } +#endif + + if (HasFrame == NeedsFrame) + return false; + + if (!NeedsFrame) { + // Reset to the state upon function entry. + TFL.resetCFIToInitialState(CurrBB); + return true; + } + + // Reset to the "after prologue" state. + InsertionPoint &InsertPt = InsertionPts[CurrBB.getSectionID()]; + if (InsertPt.MBB == nullptr) { + // CurrBB is the first block in its section, so there is no "after + // prologue" state. Clone the CFI instructions from the prologue block + // to create it. + InsertPt = cloneCfiPrologue(Prologue, {&CurrBB, CurrBB.begin()}); + } else { + // There's an earlier block known to have a stack frame. Insert a + // `.cfi_remember_state` instruction into that block and a + // `.cfi_restore_state` instruction at the beginning of the current + // block.
+ InsertPt = insertRememberRestorePair(InsertPt, {&CurrBB, CurrBB.begin()}); + } + return true; +} + +bool CFIFixup::runOnMachineFunction(MachineFunction &MF) { + if (!MF.getSubtarget().getFrameLowering()->enableCFIFixup(MF)) + return false; + + if (MF.getNumBlockIDs() < 2) return false; // Find the prologue and the point where we can issue the first @@ -197,44 +310,7 @@ bool CFIFixup::runOnMachineFunction(MachineFunction &MF) { if (PrologueBlock == nullptr) return false; - struct BlockFlags { - bool Reachable : 1; - bool StrongNoFrameOnEntry : 1; - bool HasFrameOnEntry : 1; - bool HasFrameOnExit : 1; - }; - SmallVector<BlockFlags, 32> BlockInfo(NumBlocks, - {false, false, false, false}); - BlockInfo[0].Reachable = true; - BlockInfo[0].StrongNoFrameOnEntry = true; - - // Compute the presence/absence of frame at each basic block. - ReversePostOrderTraversal<MachineBasicBlock *> RPOT(&*MF.begin()); - for (MachineBasicBlock *MBB : RPOT) { - BlockFlags &Info = BlockInfo[MBB->getNumber()]; - - // Set to true if the current block contains the prologue or the epilogue, - // respectively. - bool HasPrologue = MBB == PrologueBlock; - bool HasEpilogue = false; - - if (Info.HasFrameOnEntry || HasPrologue) - HasEpilogue = containsEpilogue(*MBB); - - // If the function has a call frame at the entry of the current block or the - // current block contains the prologue, then the function has a call frame - // at the exit of the block, unless the block contains the epilogue. - Info.HasFrameOnExit = (Info.HasFrameOnEntry || HasPrologue) && !HasEpilogue; - - // Set the successors' state on entry. - for (MachineBasicBlock *Succ : MBB->successors()) { - BlockFlags &SuccInfo = BlockInfo[Succ->getNumber()]; - SuccInfo.Reachable = true; - SuccInfo.StrongNoFrameOnEntry |= - Info.StrongNoFrameOnEntry && !HasPrologue; - SuccInfo.HasFrameOnEntry = Info.HasFrameOnExit; - } - } + BlockFlagsVector BlockInfo = computeBlockInfo(MF, PrologueBlock); // Walk the blocks of the function in "physical" order. // Every block inherits the frame state (as recorded in the unwind tables) @@ -253,57 +329,10 @@ bool CFIFixup::runOnMachineFunction(MachineFunction &MF) { // No point starting before the prologue block. // TODO: the unwind tables will still be incorrect if an epilogue physically // preceeds the prologue. - MachineFunction::iterator CurrBB = std::next(PrologueBlock->getIterator()); - bool HasFrame = BlockInfo[PrologueBlock->getNumber()].HasFrameOnExit; - while (CurrBB != MF.end()) { - const BlockFlags &Info = BlockInfo[CurrBB->getNumber()]; - if (!Info.Reachable) { - ++CurrBB; - continue; - } - -#ifndef NDEBUG - if (!Info.StrongNoFrameOnEntry) { - for (auto *Pred : CurrBB->predecessors()) { - BlockFlags &PredInfo = BlockInfo[Pred->getNumber()]; - assert((!PredInfo.Reachable || - Info.HasFrameOnEntry == PredInfo.HasFrameOnExit) && - "Inconsistent call frame state"); - } - } -#endif - - // If the block is the first block in its section, then it doesn't have a - // frame on entry. - HasFrame &= !CurrBB->isBeginSection(); - if (!Info.StrongNoFrameOnEntry && Info.HasFrameOnEntry && !HasFrame) { - // Reset to the "after prologue" state. - - InsertionPoint &InsertPt = InsertionPts[CurrBB->getSectionID()]; - if (InsertPt.MBB == nullptr) { - // CurBB is the first block in its section, so there is no "after - // prologue" state. Clone the CFI instructions from the prologue block - // to create it. 
- InsertPt = cloneCfiPrologue({PrologueBlock, PrologueEnd}, - {&*CurrBB, CurrBB->begin()}); - } else { - // There's an earlier block known to have a stack frame. Insert a - // `.cfi_remember_state` instruction into that block and a - // `.cfi_restore_state` instruction at the beginning of the current - // block. - InsertPt = - insertRememberRestorePair(InsertPt, {&*CurrBB, CurrBB->begin()}); - } - Change = true; - } else if ((Info.StrongNoFrameOnEntry || !Info.HasFrameOnEntry) && - HasFrame) { - // Reset to the state upon function entry. - TFL.resetCFIToInitialState(*CurrBB); - Change = true; - } - - HasFrame = Info.HasFrameOnExit; - ++CurrBB; + for (MachineBasicBlock &MBB : + make_range(std::next(PrologueBlock->getIterator()), MF.end())) { + Change |= + fixupBlock(MBB, BlockInfo, InsertionPts, {PrologueBlock, PrologueEnd}); } return Change; diff --git llvm/lib/CodeGen/CodeGen.cpp llvm/lib/CodeGen/CodeGen.cpp index ed871519e33b..5f0c7ec9c8d0 100644 --- llvm/lib/CodeGen/CodeGen.cpp +++ llvm/lib/CodeGen/CodeGen.cpp @@ -116,7 +116,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeRegAllocFastPass(Registry); initializeRegUsageInfoCollectorLegacyPass(Registry); initializeRegUsageInfoPropagationLegacyPass(Registry); - initializeRegisterCoalescerPass(Registry); + initializeRegisterCoalescerLegacyPass(Registry); initializeRemoveLoadsIntoFakeUsesPass(Registry); initializeRemoveRedundantDebugValuesPass(Registry); initializeRenameIndependentSubregsPass(Registry); diff --git llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index b193d8bb0aa1..4648414cc46a 100644 --- llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -6595,8 +6595,9 @@ bool CombinerHelper::matchRedundantBinOpInEquality(MachineInstr &MI, static std::optional<unsigned> getMinUselessShift(KnownBits ValueKB, unsigned Opcode, std::optional<int64_t> &Result) { - assert(Opcode == TargetOpcode::G_SHL || Opcode == TargetOpcode::G_LSHR || - Opcode == TargetOpcode::G_ASHR && "Expect G_SHL, G_LSHR or G_ASHR."); + assert((Opcode == TargetOpcode::G_SHL || Opcode == TargetOpcode::G_LSHR || + Opcode == TargetOpcode::G_ASHR) && + "Expect G_SHL, G_LSHR or G_ASHR."); auto SignificantBits = 0; switch (Opcode) { case TargetOpcode::G_SHL: diff --git llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index d0a62340a5f3..d4cb224c35d7 100644 --- llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -411,6 +411,10 @@ static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) { } while (0) switch (Opcode) { + case TargetOpcode::G_LROUND: + RTLIBCASE(LROUND_F); + case TargetOpcode::G_LLROUND: + RTLIBCASE(LLROUND_F); case TargetOpcode::G_MUL: RTLIBCASE_INT(MUL_I); case TargetOpcode::G_SDIV: @@ -1267,6 +1271,8 @@ LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) { return Status; break; } + case TargetOpcode::G_LROUND: + case TargetOpcode::G_LLROUND: case TargetOpcode::G_INTRINSIC_LRINT: case TargetOpcode::G_INTRINSIC_LLRINT: { LLT LLTy = MRI.getType(MI.getOperand(1).getReg()); @@ -3342,6 +3348,15 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { Observer.changedInstr(MI); return Legalized; } + case TargetOpcode::G_VECREDUCE_ADD: { + if (TypeIdx != 1) + return UnableToLegalize; + Observer.changingInstr(MI); + widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT); + widenScalarDst(MI, 
WideTy.getScalarType(), 0, TargetOpcode::G_TRUNC); + Observer.changedInstr(MI); + return Legalized; + } case TargetOpcode::G_VECREDUCE_FADD: case TargetOpcode::G_VECREDUCE_FMUL: case TargetOpcode::G_VECREDUCE_FMIN: @@ -7255,7 +7270,7 @@ LegalizerHelper::lowerFunnelShiftAsShifts(MachineInstr &MI) { } } - MIRBuilder.buildOr(Dst, ShX, ShY); + MIRBuilder.buildOr(Dst, ShX, ShY, MachineInstr::Disjoint); MI.eraseFromParent(); return Legalized; } diff --git llvm/lib/CodeGen/InlineSpiller.cpp llvm/lib/CodeGen/InlineSpiller.cpp index 33915d0f7f82..302dd37ff3d6 100644 --- llvm/lib/CodeGen/InlineSpiller.cpp +++ llvm/lib/CodeGen/InlineSpiller.cpp @@ -1320,13 +1320,16 @@ void HoistSpillHelper::addToMergeableSpills(MachineInstr &Spill, int StackSlot, LiveInterval &OrigLI = LIS.getInterval(Original); // save a copy of LiveInterval in StackSlotToOrigLI because the original // LiveInterval may be cleared after all its references are spilled. - if (!StackSlotToOrigLI.contains(StackSlot)) { + + auto [Place, Inserted] = StackSlotToOrigLI.try_emplace(StackSlot); + if (Inserted) { auto LI = std::make_unique<LiveInterval>(OrigLI.reg(), OrigLI.weight()); LI->assign(OrigLI, Allocator); - StackSlotToOrigLI[StackSlot] = std::move(LI); + Place->second = std::move(LI); } + SlotIndex Idx = LIS.getInstructionIndex(Spill); - VNInfo *OrigVNI = StackSlotToOrigLI[StackSlot]->getVNInfoAt(Idx.getRegSlot()); + VNInfo *OrigVNI = Place->second->getVNInfoAt(Idx.getRegSlot()); std::pair<int, VNInfo *> MIdx = std::make_pair(StackSlot, OrigVNI); MergeableSpills[MIdx].insert(&Spill); } @@ -1529,10 +1532,12 @@ void HoistSpillHelper::runHoistSpills( MachineBasicBlock *Block = (*RIt)->getBlock(); // If Block contains an original spill, simply continue. - if (SpillsToKeep.contains(*RIt) && !SpillsToKeep[*RIt]) { - SpillsInSubTreeMap[*RIt].first.insert(*RIt); - // SpillsInSubTreeMap[*RIt].second contains the cost of spill. - SpillsInSubTreeMap[*RIt].second = MBFI.getBlockFreq(Block); + if (auto It = SpillsToKeep.find(*RIt); + It != SpillsToKeep.end() && !It->second) { + auto &SIt = SpillsInSubTreeMap[*RIt]; + SIt.first.insert(*RIt); + // SIt.second contains the cost of the spill.
+ SIt.second = MBFI.getBlockFreq(Block); continue; } diff --git llvm/lib/CodeGen/MIRParser/MIParser.cpp llvm/lib/CodeGen/MIRParser/MIParser.cpp index 19c73374c370..b44e1e10fef9 100644 --- llvm/lib/CodeGen/MIRParser/MIParser.cpp +++ llvm/lib/CodeGen/MIRParser/MIParser.cpp @@ -485,7 +485,7 @@ public: bool parseDILocation(MDNode *&Expr); bool parseMetadataOperand(MachineOperand &Dest); bool parseCFIOffset(int &Offset); - bool parseCFIRegister(Register &Reg); + bool parseCFIRegister(unsigned &Reg); bool parseCFIAddressSpace(unsigned &AddressSpace); bool parseCFIEscapeValues(std::string& Values); bool parseCFIOperand(MachineOperand &Dest); @@ -2446,7 +2446,7 @@ bool MIParser::parseCFIOffset(int &Offset) { return false; } -bool MIParser::parseCFIRegister(Register &Reg) { +bool MIParser::parseCFIRegister(unsigned &Reg) { if (Token.isNot(MIToken::NamedRegister)) return error("expected a cfi register"); Register LLVMReg; @@ -2491,7 +2491,7 @@ bool MIParser::parseCFIOperand(MachineOperand &Dest) { auto Kind = Token.kind(); lex(); int Offset; - Register Reg; + unsigned Reg; unsigned AddressSpace; unsigned CFIIndex; switch (Kind) { @@ -2564,7 +2564,7 @@ bool MIParser::parseCFIOperand(MachineOperand &Dest) { CFIIndex = MF.addFrameInst(MCCFIInstruction::createUndefined(nullptr, Reg)); break; case MIToken::kw_cfi_register: { - Register Reg2; + unsigned Reg2; if (parseCFIRegister(Reg) || expectAndConsume(MIToken::comma) || parseCFIRegister(Reg2)) return true; diff --git llvm/lib/CodeGen/PeepholeOptimizer.cpp llvm/lib/CodeGen/PeepholeOptimizer.cpp index 48c25d5039bf..e0053fb24336 100644 --- llvm/lib/CodeGen/PeepholeOptimizer.cpp +++ llvm/lib/CodeGen/PeepholeOptimizer.cpp @@ -153,7 +153,7 @@ class RecurrenceInstr; class Rewriter { protected: MachineInstr &CopyLike; - unsigned CurrentSrcIdx = 0; ///< The index of the source being rewritten. + int CurrentSrcIdx = 0; ///< The index of the source being rewritten. public: Rewriter(MachineInstr &CopyLike) : CopyLike(CopyLike) {} virtual ~Rewriter() = default; @@ -201,14 +201,11 @@ public: bool getNextRewritableSource(RegSubRegPair &Src, RegSubRegPair &Dst) override { - // CurrentSrcIdx > 0 means this function has already been called. - if (CurrentSrcIdx > 0) + if (++CurrentSrcIdx > 1) return false; - // This is the first call to getNextRewritableSource. - // Move the CurrentSrcIdx to remember that we made that call. - CurrentSrcIdx = 1; + // The rewritable source is the argument. - const MachineOperand &MOSrc = CopyLike.getOperand(1); + const MachineOperand &MOSrc = CopyLike.getOperand(CurrentSrcIdx); Src = RegSubRegPair(MOSrc.getReg(), MOSrc.getSubReg()); // What we track are the alternative sources of the definition. const MachineOperand &MODef = CopyLike.getOperand(0); @@ -217,8 +214,6 @@ public: } bool RewriteCurrentSource(Register NewReg, unsigned NewSubReg) override { - if (CurrentSrcIdx != 1) - return false; MachineOperand &MOSrc = CopyLike.getOperand(CurrentSrcIdx); MOSrc.setReg(NewReg); MOSrc.setSubReg(NewSubReg); @@ -229,7 +224,7 @@ public: /// Helper class to rewrite uncoalescable copy like instructions /// into new COPY (coalescable friendly) instructions. class UncoalescableRewriter : public Rewriter { - unsigned NumDefs; ///< Number of defs in the bitcast. + int NumDefs; ///< Number of defs in the bitcast. 
public: UncoalescableRewriter(MachineInstr &MI) : Rewriter(MI) { @@ -383,6 +378,7 @@ class RegSequenceRewriter : public Rewriter { public: RegSequenceRewriter(MachineInstr &MI) : Rewriter(MI) { assert(MI.isRegSequence() && "Invalid instruction"); + CurrentSrcIdx = -1; } /// \see Rewriter::getNextRewritableSource() @@ -404,16 +400,10 @@ public: bool getNextRewritableSource(RegSubRegPair &Src, RegSubRegPair &Dst) override { // We are looking at v0 = REG_SEQUENCE v1, sub1, v2, sub2, etc. + CurrentSrcIdx += 2; + if (static_cast<unsigned>(CurrentSrcIdx) >= CopyLike.getNumOperands()) + return false; - // If this is the first call, move to the first argument. - if (CurrentSrcIdx == 0) { - CurrentSrcIdx = 1; - } else { - // Otherwise, move to the next argument and check that it is valid. - CurrentSrcIdx += 2; - if (CurrentSrcIdx >= CopyLike.getNumOperands()) - return false; - } const MachineOperand &MOInsertedReg = CopyLike.getOperand(CurrentSrcIdx); Src.Reg = MOInsertedReg.getReg(); // If we have to compose sub-register indices, bail out. @@ -431,9 +421,10 @@ public: } bool RewriteCurrentSource(Register NewReg, unsigned NewSubReg) override { - // We cannot rewrite out of bound operands. - // Moreover, rewritable sources are at odd positions. - if ((CurrentSrcIdx & 1) != 1 || CurrentSrcIdx > CopyLike.getNumOperands()) + // Do not introduce new subregister uses in a reg_sequence. Until composing + // subregister indices is supported while folding, we're just blocking + // folding of subregister copies later in the function. + if (NewSubReg) return false; MachineOperand &MO = CopyLike.getOperand(CurrentSrcIdx); @@ -513,7 +504,7 @@ private: /// Check whether \p MI is understood by the register coalescer /// but may require some rewriting. - bool isCoalescableCopy(const MachineInstr &MI) { + static bool isCoalescableCopy(const MachineInstr &MI) { // SubregToRegs are not interesting, because they are already register // coalescer friendly. return MI.isCopy() || @@ -523,7 +514,7 @@ private: /// Check whether \p MI is a copy like instruction that is /// not recognized by the register coalescer. - bool isUncoalescableCopy(const MachineInstr &MI) { + static bool isUncoalescableCopy(const MachineInstr &MI) { return MI.isBitcast() || (!DisableAdvCopyOpt && (MI.isRegSequenceLike() || MI.isInsertSubregLike() || MI.isExtractSubregLike())); @@ -1044,8 +1035,11 @@ bool PeepholeOptimizer::findNextSource(RegSubRegPair RegSubReg, return false; // Insert the Def -> Use entry for the recently found source. - ValueTrackerResult CurSrcRes = RewriteMap.lookup(CurSrcPair); - if (CurSrcRes.isValid()) { + auto [InsertPt, WasInserted] = RewriteMap.try_emplace(CurSrcPair, Res); + + if (!WasInserted) { + const ValueTrackerResult &CurSrcRes = InsertPt->second; + assert(CurSrcRes == Res && "ValueTrackerResult found must match"); // An existent entry with multiple sources is a PHI cycle we must avoid. // Otherwise it's an entry with a valid next source we already found. @@ -1056,7 +1050,6 @@ bool PeepholeOptimizer::findNextSource(RegSubRegPair RegSubReg, } break; } - RewriteMap.insert(std::make_pair(CurSrcPair, Res)); // ValueTrackerResult usually have one source unless it's the result from // a PHI instruction. Add the found PHI edges to be looked up further. @@ -1965,22 +1958,7 @@ ValueTrackerResult ValueTracker::getNextSourceFromRegSequence() { assert((Def->isRegSequence() || Def->isRegSequenceLike()) && "Invalid definition"); - if (Def->getOperand(DefIdx).getSubReg()) - // If we are composing subregs, bail out. 
- // The case we are checking is Def.<subreg> = REG_SEQUENCE. - // This should almost never happen as the SSA property is tracked at - // the register level (as opposed to the subreg level). - // I.e., - // Def.sub0 = - // Def.sub1 = - // is a valid SSA representation for Def.sub0 and Def.sub1, but not for - // Def. Thus, it must not be generated. - // However, some code could theoretically generates a single - // Def.sub0 (i.e, not defining the other subregs) and we would - // have this case. - // If we can ascertain (or force) that this never happens, we could - // turn that into an assertion. - return ValueTrackerResult(); + assert(!Def->getOperand(DefIdx).getSubReg() && "illegal subregister def"); SmallVector<RegSubRegPairAndIdx, 8> RegSeqInputRegs; if (!TII->getRegSequenceInputs(*Def, DefIdx, RegSeqInputRegs)) diff --git llvm/lib/CodeGen/ReachingDefAnalysis.cpp llvm/lib/CodeGen/ReachingDefAnalysis.cpp index 08121f411640..fa60881b2085 100644 --- llvm/lib/CodeGen/ReachingDefAnalysis.cpp +++ llvm/lib/CodeGen/ReachingDefAnalysis.cpp @@ -266,9 +266,13 @@ void ReachingDefAnalysis::printAllReachingDefs(MachineFunction &MF) { Defs.clear(); getGlobalReachingDefs(&MI, Reg, Defs); MO.print(dbgs(), TRI); - dbgs() << ":{ "; + SmallVector<int, 0> Nums; for (MachineInstr *Def : Defs) - dbgs() << InstToNumMap[Def] << " "; + Nums.push_back(InstToNumMap[Def]); + llvm::sort(Nums); + dbgs() << ":{ "; + for (int Num : Nums) + dbgs() << Num << " "; dbgs() << "}\n"; } dbgs() << Num << ": " << MI << "\n"; diff --git llvm/lib/CodeGen/RegAllocBasic.cpp llvm/lib/CodeGen/RegAllocBasic.cpp index f3f34f890be1..e1f05406297d 100644 --- llvm/lib/CodeGen/RegAllocBasic.cpp +++ llvm/lib/CodeGen/RegAllocBasic.cpp @@ -134,7 +134,7 @@ INITIALIZE_PASS_BEGIN(RABasic, "regallocbasic", "Basic Register Allocator", INITIALIZE_PASS_DEPENDENCY(LiveDebugVariablesWrapperLegacy) INITIALIZE_PASS_DEPENDENCY(SlotIndexesWrapperPass) INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass) -INITIALIZE_PASS_DEPENDENCY(RegisterCoalescer) +INITIALIZE_PASS_DEPENDENCY(RegisterCoalescerLegacy) INITIALIZE_PASS_DEPENDENCY(MachineScheduler) INITIALIZE_PASS_DEPENDENCY(LiveStacksWrapperLegacy) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) diff --git llvm/lib/CodeGen/RegAllocGreedy.cpp llvm/lib/CodeGen/RegAllocGreedy.cpp index 6077cfd514de..465c4e8feffb 100644 --- llvm/lib/CodeGen/RegAllocGreedy.cpp +++ llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -154,7 +154,7 @@ INITIALIZE_PASS_BEGIN(RAGreedy, "greedy", INITIALIZE_PASS_DEPENDENCY(LiveDebugVariablesWrapperLegacy) INITIALIZE_PASS_DEPENDENCY(SlotIndexesWrapperPass) INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass) -INITIALIZE_PASS_DEPENDENCY(RegisterCoalescer) +INITIALIZE_PASS_DEPENDENCY(RegisterCoalescerLegacy) INITIALIZE_PASS_DEPENDENCY(MachineScheduler) INITIALIZE_PASS_DEPENDENCY(LiveStacksWrapperLegacy) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) diff --git llvm/lib/CodeGen/RegisterCoalescer.cpp llvm/lib/CodeGen/RegisterCoalescer.cpp index 62c15949e46d..f0b597e21f6f 100644 --- llvm/lib/CodeGen/RegisterCoalescer.cpp +++ llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -24,15 +24,18 @@ #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/LiveRangeEdit.h" #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include 
"llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachinePassManager.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/RegisterClassInfo.h" +#include "llvm/CodeGen/RegisterCoalescerPass.h" #include "llvm/CodeGen/SlotIndexes.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetOpcodes.h" @@ -121,13 +124,13 @@ namespace { class JoinVals; -class RegisterCoalescer : public MachineFunctionPass, - private LiveRangeEdit::Delegate { +class RegisterCoalescer : private LiveRangeEdit::Delegate { MachineFunction *MF = nullptr; MachineRegisterInfo *MRI = nullptr; const TargetRegisterInfo *TRI = nullptr; const TargetInstrInfo *TII = nullptr; LiveIntervals *LIS = nullptr; + SlotIndexes *SI = nullptr; const MachineLoopInfo *Loops = nullptr; RegisterClassInfo RegClassInfo; @@ -372,11 +375,24 @@ class RegisterCoalescer : public MachineFunctionPass, void checkMergingChangesDbgValuesImpl(Register Reg, LiveRange &OtherRange, LiveRange &RegRange, JoinVals &Vals2); +public: + // For legacy pass only. + RegisterCoalescer() {} + RegisterCoalescer &operator=(RegisterCoalescer &&Other) = default; + + RegisterCoalescer(LiveIntervals *LIS, SlotIndexes *SI, + const MachineLoopInfo *Loops) + : LIS(LIS), SI(SI), Loops(Loops) {} + + bool run(MachineFunction &MF); +}; + +class RegisterCoalescerLegacy : public MachineFunctionPass { public: static char ID; ///< Class identification, replacement for typeinfo - RegisterCoalescer() : MachineFunctionPass(ID) { - initializeRegisterCoalescerPass(*PassRegistry::getPassRegistry()); + RegisterCoalescerLegacy() : MachineFunctionPass(ID) { + initializeRegisterCoalescerLegacyPass(*PassRegistry::getPassRegistry()); } void getAnalysisUsage(AnalysisUsage &AU) const override; @@ -386,27 +402,22 @@ public: MachineFunctionProperties::Property::IsSSA); } - void releaseMemory() override; - /// This is the pass entry point. bool runOnMachineFunction(MachineFunction &) override; - - /// Implement the dump method. 
- void print(raw_ostream &O, const Module * = nullptr) const override; }; } // end anonymous namespace -char RegisterCoalescer::ID = 0; +char RegisterCoalescerLegacy::ID = 0; -char &llvm::RegisterCoalescerID = RegisterCoalescer::ID; +char &llvm::RegisterCoalescerID = RegisterCoalescerLegacy::ID; -INITIALIZE_PASS_BEGIN(RegisterCoalescer, "register-coalescer", +INITIALIZE_PASS_BEGIN(RegisterCoalescerLegacy, "register-coalescer", "Register Coalescer", false, false) INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass) INITIALIZE_PASS_DEPENDENCY(SlotIndexesWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass) -INITIALIZE_PASS_END(RegisterCoalescer, "register-coalescer", +INITIALIZE_PASS_END(RegisterCoalescerLegacy, "register-coalescer", "Register Coalescer", false, false) [[nodiscard]] static bool isMoveInstr(const TargetRegisterInfo &tri, @@ -583,8 +594,9 @@ bool CoalescerPair::isCoalescable(const MachineInstr *MI) const { } } -void RegisterCoalescer::getAnalysisUsage(AnalysisUsage &AU) const { +void RegisterCoalescerLegacy::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); + AU.addUsedIfAvailable<SlotIndexesWrapperPass>(); AU.addRequired<LiveIntervalsWrapperPass>(); AU.addPreserved<LiveIntervalsWrapperPass>(); AU.addPreserved<SlotIndexesWrapperPass>(); @@ -4229,15 +4241,35 @@ void RegisterCoalescer::joinAllIntervals() { lateLiveIntervalUpdate(); } -void RegisterCoalescer::releaseMemory() { - ErasedInstrs.clear(); - WorkList.clear(); - DeadDefs.clear(); - InflateRegs.clear(); - LargeLIVisitCounter.clear(); +PreservedAnalyses +RegisterCoalescerPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + MFPropsModifier _(*this, MF); + auto &LIS = MFAM.getResult<LiveIntervalsAnalysis>(MF); + auto &Loops = MFAM.getResult<MachineLoopAnalysis>(MF); + auto *SI = MFAM.getCachedResult<SlotIndexesAnalysis>(MF); + RegisterCoalescer Impl(&LIS, SI, &Loops); + if (!Impl.run(MF)) + return PreservedAnalyses::all(); + auto PA = getMachineFunctionPassPreservedAnalyses(); + PA.preserveSet<CFGAnalyses>(); + PA.preserve<LiveIntervalsAnalysis>(); + PA.preserve<SlotIndexesAnalysis>(); + PA.preserve<MachineLoopAnalysis>(); + PA.preserve<MachineDominatorTreeAnalysis>(); + return PA; +} + +bool RegisterCoalescerLegacy::runOnMachineFunction(MachineFunction &MF) { + auto *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS(); + auto *Loops = &getAnalysis<MachineLoopInfoWrapperPass>().getLI(); + auto *SIWrapper = getAnalysisIfAvailable<SlotIndexesWrapperPass>(); + SlotIndexes *SI = SIWrapper ? 
&SIWrapper->getSI() : nullptr;
+  RegisterCoalescer Impl(LIS, SI, Loops);
+  return Impl.run(MF);
 }
 
-bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) {
+bool RegisterCoalescer::run(MachineFunction &fn) {
   LLVM_DEBUG(dbgs() << "********** REGISTER COALESCER **********\n"
                     << "********** Function: " << fn.getName() << '\n');
@@ -4260,8 +4292,6 @@ bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) {
   const TargetSubtargetInfo &STI = fn.getSubtarget();
   TRI = STI.getRegisterInfo();
   TII = STI.getInstrInfo();
-  LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
-  Loops = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
   if (EnableGlobalCopies == cl::BOU_UNSET)
     JoinGlobalCopies = STI.enableJoinGlobalCopies();
   else
@@ -4286,7 +4316,7 @@ bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) {
   JoinSplitEdges = EnableJoinSplits;
 
   if (VerifyCoalescing)
-    MF->verify(this, "Before register coalescing", &errs());
+    MF->verify(LIS, SI, "Before register coalescing", &errs());
 
   DbgVRegToValues.clear();
   buildVRegToDbgValueMap(fn);
@@ -4344,12 +4374,9 @@ bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) {
   PHIValToPos.clear();
   RegToPHIIdx.clear();
 
-  LLVM_DEBUG(dump());
+  LLVM_DEBUG(LIS->dump());
+
   if (VerifyCoalescing)
-    MF->verify(this, "After register coalescing", &errs());
+    MF->verify(LIS, SI, "After register coalescing", &errs());
   return true;
 }
-
-void RegisterCoalescer::print(raw_ostream &O, const Module *m) const {
-  LIS->print(O);
-}
diff --git llvm/lib/CodeGen/RegisterCoalescer.h llvm/lib/CodeGen/RegisterCoalescer.h
index 6926e9b5d188..ec1940805ea2 100644
--- llvm/lib/CodeGen/RegisterCoalescer.h
+++ llvm/lib/CodeGen/RegisterCoalescer.h
@@ -14,6 +14,7 @@
 #ifndef LLVM_LIB_CODEGEN_REGISTERCOALESCER_H
 #define LLVM_LIB_CODEGEN_REGISTERCOALESCER_H
 
+#include "llvm/CodeGen/MachinePassManager.h"
 #include "llvm/CodeGen/Register.h"
 
 namespace llvm {
diff --git llvm/lib/CodeGen/ScheduleDAGInstrs.cpp llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
index cc98c52e90ea..a26804707dd1 100644
--- llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -209,13 +209,25 @@ void ScheduleDAGInstrs::addSchedBarrierDeps() {
   ExitSU.setInstr(ExitMI);
   // Add dependencies on the defs and uses of the instruction.
   if (ExitMI) {
+    const MCInstrDesc &MIDesc = ExitMI->getDesc();
     for (const MachineOperand &MO : ExitMI->all_uses()) {
+      unsigned OpIdx = MO.getOperandNo();
       Register Reg = MO.getReg();
       if (Reg.isPhysical()) {
+        // addPhysRegDataDeps uses the provided operand index to retrieve
+        // the operand use cycle from the scheduling model. If the operand
+        // is "fake" (e.g., an operand of a call instruction used to pass
+        // an argument to the called function), the scheduling model may not
+        // have an entry for it. If this is the case, pass -1 as operand index,
+        // which will cause addPhysRegDataDeps to add an artificial dependency.
+        // FIXME: Using hasImplicitUseOfPhysReg here is inaccurate as it misses
+        // aliases. When fixing, make sure to update addPhysRegDataDeps, too.
+        bool IsRealUse = OpIdx < MIDesc.getNumOperands() ||
+                         MIDesc.hasImplicitUseOfPhysReg(Reg);
         for (MCRegUnit Unit : TRI->regunits(Reg))
-          Uses.insert(PhysRegSUOper(&ExitSU, -1, Unit));
+          Uses.insert(PhysRegSUOper(&ExitSU, IsRealUse ?
OpIdx : -1, Unit)); } else if (Reg.isVirtual() && MO.readsReg()) { - addVRegUseDeps(&ExitSU, MO.getOperandNo()); + addVRegUseDeps(&ExitSU, OpIdx); } } } diff --git llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index f4caaf426de6..882d60150814 100644 --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -11005,8 +11005,8 @@ SDValue DAGCombiner::visitFunnelShift(SDNode *N) { auto *RHS = dyn_cast<LoadSDNode>(N1); if (LHS && RHS && LHS->isSimple() && RHS->isSimple() && LHS->getAddressSpace() == RHS->getAddressSpace() && - (LHS->hasOneUse() || RHS->hasOneUse()) && ISD::isNON_EXTLoad(RHS) && - ISD::isNON_EXTLoad(LHS)) { + (LHS->hasNUsesOfValue(1, 0) || RHS->hasNUsesOfValue(1, 0)) && + ISD::isNON_EXTLoad(RHS) && ISD::isNON_EXTLoad(LHS)) { if (DAG.areNonVolatileConsecutiveLoads(LHS, RHS, BitWidth / 8, 1)) { SDLoc DL(RHS); uint64_t PtrOff = @@ -11024,9 +11024,8 @@ SDValue DAGCombiner::visitFunnelShift(SDNode *N) { VT, DL, RHS->getChain(), NewPtr, RHS->getPointerInfo().getWithOffset(PtrOff), NewAlign, RHS->getMemOperand()->getFlags(), RHS->getAAInfo()); - // Replace the old load's chain with the new load's chain. - WorklistRemover DeadNodes(*this); - DAG.ReplaceAllUsesOfValueWith(N1.getValue(1), Load.getValue(1)); + DAG.makeEquivalentMemoryOrdering(LHS, Load.getValue(1)); + DAG.makeEquivalentMemoryOrdering(RHS, Load.getValue(1)); return Load; } } diff --git llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index c6475f021990..6c9c96ceaa4b 100644 --- llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -3970,8 +3970,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { else Index = DAG.getNode(ISD::MUL, dl, Index.getValueType(), Index, DAG.getConstant(EntrySize, dl, Index.getValueType())); - SDValue Addr = DAG.getNode(ISD::ADD, dl, Index.getValueType(), - Index, Table); + SDValue Addr = DAG.getMemBasePlusOffset(Table, Index, dl); EVT MemVT = EVT::getIntegerVT(*DAG.getContext(), EntrySize * 8); SDValue LD = DAG.getExtLoad( @@ -3980,10 +3979,10 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { Addr = LD; if (TLI.isJumpTableRelative()) { // For PIC, the sequence is: - // BRIND(load(Jumptable + index) + RelocBase) + // BRIND(RelocBase + load(Jumptable + index)) // RelocBase can be JumpTable, GOT or some sort of global base. - Addr = DAG.getNode(ISD::ADD, dl, PTy, Addr, - TLI.getPICJumpTableRelocBase(Table, DAG)); + Addr = DAG.getMemBasePlusOffset(TLI.getPICJumpTableRelocBase(Table, DAG), + Addr, dl); } Tmp1 = TLI.expandIndirectJTBranch(dl, LD.getValue(1), Addr, JTI, DAG); diff --git llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index b416c0efbbc4..16c3b295426c 100644 --- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -363,7 +363,7 @@ bool ISD::isFreezeUndef(const SDNode *N) { template <typename ConstNodeType> bool ISD::matchUnaryPredicateImpl(SDValue Op, std::function<bool(ConstNodeType *)> Match, - bool AllowUndefs) { + bool AllowUndefs, bool AllowTruncation) { // FIXME: Add support for scalar UNDEF cases? 
if (auto *C = dyn_cast<ConstNodeType>(Op)) return Match(C); @@ -382,16 +382,17 @@ bool ISD::matchUnaryPredicateImpl(SDValue Op, } auto *Cst = dyn_cast<ConstNodeType>(Op.getOperand(i)); - if (!Cst || Cst->getValueType(0) != SVT || !Match(Cst)) + if (!Cst || (!AllowTruncation && Cst->getValueType(0) != SVT) || + !Match(Cst)) return false; } return true; } // Build used template types. template bool ISD::matchUnaryPredicateImpl<ConstantSDNode>( - SDValue, std::function<bool(ConstantSDNode *)>, bool); + SDValue, std::function<bool(ConstantSDNode *)>, bool, bool); template bool ISD::matchUnaryPredicateImpl<ConstantFPSDNode>( - SDValue, std::function<bool(ConstantFPSDNode *)>, bool); + SDValue, std::function<bool(ConstantFPSDNode *)>, bool, bool); bool ISD::matchBinaryPredicate( SDValue LHS, SDValue RHS, @@ -7296,15 +7297,15 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, // it's worth handling here. if (N2CV && N2CV->isZero()) return N1; - if ((Opcode == ISD::ADD || Opcode == ISD::SUB) && VT.isVector() && - VT.getVectorElementType() == MVT::i1) + if ((Opcode == ISD::ADD || Opcode == ISD::SUB) && + VT.getScalarType() == MVT::i1) return getNode(ISD::XOR, DL, VT, N1, N2); break; case ISD::MUL: assert(VT.isInteger() && "This operator does not apply to FP types!"); assert(N1.getValueType() == N2.getValueType() && N1.getValueType() == VT && "Binary operator types must match!"); - if (VT.isVector() && VT.getVectorElementType() == MVT::i1) + if (VT.getScalarType() == MVT::i1) return getNode(ISD::AND, DL, VT, N1, N2); if (N2C && (N1.getOpcode() == ISD::VSCALE) && Flags.hasNoSignedWrap()) { const APInt &MulImm = N1->getConstantOperandAPInt(0); @@ -7325,7 +7326,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, assert(VT.isInteger() && "This operator does not apply to FP types!"); assert(N1.getValueType() == N2.getValueType() && N1.getValueType() == VT && "Binary operator types must match!"); - if (VT.isVector() && VT.getVectorElementType() == MVT::i1) { + if (VT.getScalarType() == MVT::i1) { // fold (add_sat x, y) -> (or x, y) for bool types. 
if (Opcode == ISD::SADDSAT || Opcode == ISD::UADDSAT) return getNode(ISD::OR, DL, VT, N1, N2); @@ -7358,7 +7359,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, assert(VT.isInteger() && "This operator does not apply to FP types!"); assert(N1.getValueType() == N2.getValueType() && N1.getValueType() == VT && "Binary operator types must match!"); - if (VT.isVector() && VT.getVectorElementType() == MVT::i1) + if (VT.getScalarType() == MVT::i1) return getNode(ISD::XOR, DL, VT, N1, N2); break; case ISD::SMIN: @@ -7366,7 +7367,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, assert(VT.isInteger() && "This operator does not apply to FP types!"); assert(N1.getValueType() == N2.getValueType() && N1.getValueType() == VT && "Binary operator types must match!"); - if (VT.isVector() && VT.getVectorElementType() == MVT::i1) + if (VT.getScalarType() == MVT::i1) return getNode(ISD::OR, DL, VT, N1, N2); break; case ISD::SMAX: @@ -7374,7 +7375,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, assert(VT.isInteger() && "This operator does not apply to FP types!"); assert(N1.getValueType() == N2.getValueType() && N1.getValueType() == VT && "Binary operator types must match!"); - if (VT.isVector() && VT.getVectorElementType() == MVT::i1) + if (VT.getScalarType() == MVT::i1) return getNode(ISD::AND, DL, VT, N1, N2); break; case ISD::FADD: @@ -10398,12 +10399,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, case ISD::VP_ADD: case ISD::VP_SUB: // If it is VP_ADD/VP_SUB mask operation then turn it to VP_XOR - if (VT.isVector() && VT.getVectorElementType() == MVT::i1) + if (VT.getScalarType() == MVT::i1) Opcode = ISD::VP_XOR; break; case ISD::VP_MUL: // If it is VP_MUL mask operation then turn it to VP_AND - if (VT.isVector() && VT.getVectorElementType() == MVT::i1) + if (VT.getScalarType() == MVT::i1) Opcode = ISD::VP_AND; break; case ISD::VP_REDUCE_MUL: @@ -10508,9 +10509,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, return getNode(ISD::MERGE_VALUES, DL, VTList, {N1, ZeroOverFlow}, Flags); } - if (VTList.VTs[0].isVector() && - VTList.VTs[0].getVectorElementType() == MVT::i1 && - VTList.VTs[1].getVectorElementType() == MVT::i1) { + if (VTList.VTs[0].getScalarType() == MVT::i1 && + VTList.VTs[1].getScalarType() == MVT::i1) { SDValue F1 = getFreeze(N1); SDValue F2 = getFreeze(N2); // {vXi1,vXi1} (u/s)addo(vXi1 x, vXi1y) -> {xor(x,y),and(x,y)} diff --git llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 49ec47f4e8a7..98206b7484dc 100644 --- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -7971,7 +7971,7 @@ static bool isNonZeroModBitWidthOrUndef(SDValue Z, unsigned BW) { return ISD::matchUnaryPredicate( Z, [=](ConstantSDNode *C) { return !C || C->getAPIntValue().urem(BW) != 0; }, - true); + /*AllowUndef=*/true, /*AllowTruncation=*/true); } static SDValue expandVPFunnelShift(SDNode *Node, SelectionDAG &DAG) { diff --git llvm/lib/CodeGen/StackColoring.cpp llvm/lib/CodeGen/StackColoring.cpp index b77b8dbdd6e5..27c65d234a61 100644 --- llvm/lib/CodeGen/StackColoring.cpp +++ llvm/lib/CodeGen/StackColoring.cpp @@ -914,10 +914,10 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) { if (!VI.Var || !VI.inStackSlot()) continue; int Slot = VI.getStackSlot(); - if (SlotRemap.count(Slot)) { + if (auto It = SlotRemap.find(Slot); It != SlotRemap.end()) { 
LLVM_DEBUG(dbgs() << "Remapping debug info for [" << cast<DILocalVariable>(VI.Var)->getName() << "].\n"); - VI.updateStackSlot(SlotRemap[Slot]); + VI.updateStackSlot(It->second); FixedDbg++; } } @@ -1004,10 +1004,11 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) { if (!AI) continue; - if (!Allocas.count(AI)) + auto It = Allocas.find(AI); + if (It == Allocas.end()) continue; - MMO->setValue(Allocas[AI]); + MMO->setValue(It->second); FixedMemOp++; } @@ -1173,8 +1174,8 @@ void StackColoring::expungeSlotMap(DenseMap<int, int> &SlotRemap, // Expunge slot remap map. for (unsigned i=0; i < NumSlots; ++i) { // If we are remapping i - if (SlotRemap.count(i)) { - int Target = SlotRemap[i]; + if (auto It = SlotRemap.find(i); It != SlotRemap.end()) { + int Target = It->second; // As long as our target is mapped to something else, follow it. while (SlotRemap.count(Target)) { Target = SlotRemap[Target]; diff --git llvm/lib/CodeGen/TwoAddressInstructionPass.cpp llvm/lib/CodeGen/TwoAddressInstructionPass.cpp index fb6274b09919..6236268f77ab 100644 --- llvm/lib/CodeGen/TwoAddressInstructionPass.cpp +++ llvm/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -489,6 +489,9 @@ MachineInstr *TwoAddressInstructionImpl::findOnlyInterestingUse( bool &IsDstPhys) const { MachineOperand *UseOp = nullptr; for (MachineOperand &MO : MRI->use_nodbg_operands(Reg)) { + if (MO.isUndef()) + continue; + MachineInstr *MI = MO.getParent(); if (MI->getParent() != MBB) return nullptr; diff --git llvm/lib/CodeGen/WindowScheduler.cpp llvm/lib/CodeGen/WindowScheduler.cpp index f1658e36ae1e..e7fc0d9a3d25 100644 --- llvm/lib/CodeGen/WindowScheduler.cpp +++ llvm/lib/CodeGen/WindowScheduler.cpp @@ -356,8 +356,8 @@ void WindowScheduler::generateTripleMBB() { // ================================== // < Terminators > // ================================== - if (DefPairs.count(NewUse)) - NewUse = DefPairs[NewUse]; + if (auto It = DefPairs.find(NewUse); It != DefPairs.end()) + NewUse = It->second; NewMI->substituteRegister(DefRegPair.first, NewUse, 0, *TRI); } // DefPairs is updated at last. @@ -581,9 +581,10 @@ DenseMap<MachineInstr *, int> WindowScheduler::getIssueOrder(unsigned Offset, DenseMap<MachineInstr *, int> IssueOrder; int Id = 0; for (int Cycle = 0; Cycle < (int)II; ++Cycle) { - if (!CycleToMIs.count(Cycle)) + auto It = CycleToMIs.find(Cycle); + if (It == CycleToMIs.end()) continue; - for (auto *MI : CycleToMIs[Cycle]) + for (auto *MI : It->second) IssueOrder[MI] = Id++; } return IssueOrder; diff --git llvm/lib/ExecutionEngine/IntelJITEvents/CMakeLists.txt llvm/lib/ExecutionEngine/IntelJITEvents/CMakeLists.txt index b4fd04d65e26..56c529c08937 100644 --- llvm/lib/ExecutionEngine/IntelJITEvents/CMakeLists.txt +++ llvm/lib/ExecutionEngine/IntelJITEvents/CMakeLists.txt @@ -1,5 +1,8 @@ include_directories( ${CMAKE_CURRENT_SOURCE_DIR}/.. 
) -include_directories( ${PROJECT_BINARY_DIR}/ittapi/include/ ) +if(NOT DEFINED ITTAPI_SOURCE_DIR) + set(ITTAPI_SOURCE_DIR ${PROJECT_BINARY_DIR}) +endif() +include_directories( ${ITTAPI_SOURCE_DIR}/ittapi/include/ ) add_llvm_component_library(LLVMIntelJITEvents IntelJITEventListener.cpp diff --git llvm/lib/ExecutionEngine/JITLink/CMakeLists.txt llvm/lib/ExecutionEngine/JITLink/CMakeLists.txt index 65dd0c7468ae..e5f5a99c39bc 100644 --- llvm/lib/ExecutionEngine/JITLink/CMakeLists.txt +++ llvm/lib/ExecutionEngine/JITLink/CMakeLists.txt @@ -3,7 +3,6 @@ tablegen(LLVM COFFOptions.inc -gen-opt-parser-defs) add_public_tablegen_target(JITLinkTableGen) add_llvm_component_library(LLVMJITLink - CompactUnwindSupport.cpp DWARFRecordSectionSplitter.cpp EHFrameSupport.cpp JITLink.cpp diff --git llvm/lib/ExecutionEngine/JITLink/CompactUnwindSupport.cpp llvm/lib/ExecutionEngine/JITLink/CompactUnwindSupport.cpp deleted file mode 100644 index 51e3d26479ff..000000000000 --- llvm/lib/ExecutionEngine/JITLink/CompactUnwindSupport.cpp +++ /dev/null @@ -1,103 +0,0 @@ -//=------- CompactUnwindSupport.cpp - Compact Unwind format support -------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Compact Unwind support. -// -//===----------------------------------------------------------------------===// - -#include "CompactUnwindSupport.h" - -#include "llvm/ADT/Sequence.h" - -#define DEBUG_TYPE "jitlink" - -namespace llvm { -namespace jitlink { - -Error splitCompactUnwindBlocks(LinkGraph &G, Section &CompactUnwindSection, - size_t RecordSize) { - - std::vector<Block *> OriginalBlocks(CompactUnwindSection.blocks().begin(), - CompactUnwindSection.blocks().end()); - LLVM_DEBUG({ - dbgs() << "In " << G.getName() << " splitting compact unwind section " - << CompactUnwindSection.getName() << " containing " - << OriginalBlocks.size() << " initial blocks...\n"; - }); - - while (!OriginalBlocks.empty()) { - auto *B = OriginalBlocks.back(); - OriginalBlocks.pop_back(); - - if (B->getSize() == 0) { - LLVM_DEBUG({ - dbgs() << " Skipping empty block at " - << formatv("{0:x16}", B->getAddress()) << "\n"; - }); - continue; - } - - unsigned NumBlocks = B->getSize() / RecordSize; - - LLVM_DEBUG({ - dbgs() << " Splitting block at " << formatv("{0:x16}", B->getAddress()) - << " into " << NumBlocks << " compact unwind record(s)\n"; - }); - - if (B->getSize() % RecordSize) - return make_error<JITLinkError>( - "Error splitting compact unwind record in " + G.getName() + - ": block at " + formatv("{0:x}", B->getAddress()) + " has size " + - formatv("{0:x}", B->getSize()) + - " (not a multiple of CU record size of " + - formatv("{0:x}", RecordSize) + ")"); - - auto Blocks = - G.splitBlock(*B, map_range(seq(1U, NumBlocks), [=](Edge::OffsetT Idx) { - return Idx * RecordSize; - })); - - for (auto *CURec : Blocks) { - bool AddedKeepAlive = false; - - for (auto &E : CURec->edges()) { - if (E.getOffset() == 0) { - LLVM_DEBUG({ - dbgs() << " Updating compact unwind record at " - << CURec->getAddress() << " to point to " - << (E.getTarget().hasName() ? 
*E.getTarget().getName() - : StringRef()) - << " (at " << E.getTarget().getAddress() << ")\n"; - }); - - if (E.getTarget().isExternal()) - return make_error<JITLinkError>( - "Error adding keep-alive edge for compact unwind record at " + - formatv("{0:x}", CURec->getAddress()) + ": target " + - *E.getTarget().getName() + " is an external symbol"); - auto &TgtBlock = E.getTarget().getBlock(); - auto &CURecSym = - G.addAnonymousSymbol(*CURec, 0, RecordSize, false, false); - TgtBlock.addEdge(Edge::KeepAlive, 0, CURecSym, 0); - AddedKeepAlive = true; - } - } - - if (!AddedKeepAlive) - return make_error<JITLinkError>( - "Error adding keep-alive edge for compact unwind record at " + - formatv("{0:x}", CURec->getAddress()) + - ": no outgoing target edge at offset 0"); - } - } - - return Error::success(); -} - -} // end namespace jitlink -} // end namespace llvm diff --git llvm/lib/ExecutionEngine/JITLink/CompactUnwindSupport.h llvm/lib/ExecutionEngine/JITLink/CompactUnwindSupport.h deleted file mode 100644 index dc3ed942aa8a..000000000000 --- llvm/lib/ExecutionEngine/JITLink/CompactUnwindSupport.h +++ /dev/null @@ -1,653 +0,0 @@ -//===- CompactUnwindSupportImpl.h - Compact Unwind format impl --*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Compact Unwind format support implementation details. -// -//===----------------------------------------------------------------------===// - -#ifndef LIB_EXECUTIONENGINE_JITLINK_COMPACTUNWINDSUPPORTIMPL_H -#define LIB_EXECUTIONENGINE_JITLINK_COMPACTUNWINDSUPPORTIMPL_H - -#include "llvm/ADT/STLExtras.h" -#include "llvm/ExecutionEngine/JITLink/MachO.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/Endian.h" - -#define DEBUG_TYPE "jitlink_cu" - -namespace llvm { -namespace jitlink { - -/// Split blocks in an __LD,__compact_unwind section on record boundaries. -/// When this function returns edges within each record are guaranteed to be -/// sorted by offset. -Error splitCompactUnwindBlocks(LinkGraph &G, Section &CompactUnwindSection, - size_t RecordSize); - -/// CRTP base for compact unwind traits classes. Automatically provides derived -/// constants. -/// -/// FIXME: Passing PtrSize as a template parameter is a hack to work around a -/// bug in older MSVC compilers (until at least MSVC 15) where constexpr -/// fields in the CRTP impl class were not visible to the base class. -/// Once we no longer need to support these compilers the PtrSize -/// template argument should be removed and PointerSize should be -/// defined as a member in the CRTP Impl classes. 
-template <typename CRTPImpl, size_t PtrSize> struct CompactUnwindTraits { - static constexpr size_t PointerSize = PtrSize; - static constexpr size_t Size = 3 * PointerSize + 2 * 4; - static constexpr size_t FnFieldOffset = 0; - static constexpr size_t SizeFieldOffset = FnFieldOffset + PointerSize; - static constexpr size_t EncodingFieldOffset = SizeFieldOffset + 4; - static constexpr size_t PersonalityFieldOffset = EncodingFieldOffset + 4; - static constexpr size_t LSDAFieldOffset = - PersonalityFieldOffset + PointerSize; - - static uint32_t readPCRangeSize(ArrayRef<char> RecordContent) { - assert(SizeFieldOffset + 4 <= RecordContent.size() && - "Truncated CU record?"); - return support::endian::read32<CRTPImpl::Endianness>(RecordContent.data() + - SizeFieldOffset); - } - - static uint32_t readEncoding(ArrayRef<char> RecordContent) { - assert(EncodingFieldOffset + 4 <= RecordContent.size() && - "Truncated CU record?"); - return support::endian::read32<CRTPImpl::Endianness>(RecordContent.data() + - EncodingFieldOffset); - } -}; - -/// Architecture specific implementation of CompactUnwindManager. -template <typename CURecTraits> class CompactUnwindManager { -public: - CompactUnwindManager(StringRef CompactUnwindSectionName, - StringRef UnwindInfoSectionName, - StringRef EHFrameSectionName) - : CompactUnwindSectionName(CompactUnwindSectionName), - UnwindInfoSectionName(UnwindInfoSectionName), - EHFrameSectionName(EHFrameSectionName) {} - - // Split compact unwind records, add keep-alive edges from functions to - // compact unwind records, and from compact unwind records to FDEs where - // needed. - // - // This method must be called *after* __eh_frame has been processed: it - // assumes that eh-frame records have been split up and keep-alive edges have - // been inserted. - Error prepareForPrune(LinkGraph &G) { - Section *CUSec = G.findSectionByName(CompactUnwindSectionName); - if (!CUSec || CUSec->empty()) { - LLVM_DEBUG({ - dbgs() << "Compact unwind: No compact unwind info for " << G.getName() - << "\n"; - }); - return Error::success(); - } - - LLVM_DEBUG({ - dbgs() << "Compact unwind: preparing " << G.getName() << " for prune\n"; - }); - - Section *EHFrameSec = G.findSectionByName(EHFrameSectionName); - - if (auto Err = splitCompactUnwindBlocks(G, *CUSec, CURecTraits::Size)) - return Err; - - LLVM_DEBUG({ - dbgs() << " Preparing " << CUSec->blocks_size() << " blocks in " - << CompactUnwindSectionName << "\n"; - }); - - for (auto *B : CUSec->blocks()) { - - // Find target function edge. 
- Edge *PCBeginEdge = nullptr; - for (auto &E : B->edges_at(CURecTraits::FnFieldOffset)) { - PCBeginEdge = &E; - break; - } - - if (!PCBeginEdge) - return make_error<JITLinkError>( - "In " + G.getName() + ", compact unwind record at " + - formatv("{0:x}", B->getAddress()) + " has no pc-begin edge"); - - if (!PCBeginEdge->getTarget().isDefined()) - return make_error<JITLinkError>( - "In " + G.getName() + ", compact unwind record at " + - formatv("{0:x}", B->getAddress()) + " points at external symbol " + - *PCBeginEdge->getTarget().getName()); - - auto &Fn = PCBeginEdge->getTarget(); - - if (!Fn.isDefined()) { - LLVM_DEBUG({ - dbgs() << "In " << CompactUnwindSectionName << " for " << G.getName() - << " encountered unexpected pc-edge to undefined symbol " - << Fn.getName() << "\n"; - }); - continue; - } else { - LLVM_DEBUG({ - dbgs() << " Found record for function "; - if (Fn.hasName()) - dbgs() << Fn.getName(); - else - dbgs() << "<anon @ " << Fn.getAddress() << '>'; - dbgs() << '\n'; - }); - } - - bool NeedsDWARF = CURecTraits::encodingSpecifiesDWARF( - CURecTraits::readEncoding(B->getContent())); - - auto &CURecSym = - G.addAnonymousSymbol(*B, 0, CURecTraits::Size, false, false); - - bool KeepAliveAlreadyPresent = false; - if (EHFrameSec) { - Edge *KeepAliveEdge = nullptr; - for (auto &E : Fn.getBlock().edges_at(0)) { - if (E.getKind() == Edge::KeepAlive && E.getTarget().isDefined() && - &E.getTarget().getBlock().getSection() == EHFrameSec) { - KeepAliveEdge = &E; - break; - } - } - - if (KeepAliveEdge) { - // Found a keep-alive edge to an FDE in the eh-frame. Switch the keep - // alive edge to point to the CU and if the CU needs DWARF then add - // an extra keep-alive edge from the CU to the FDE. - auto &FDE = KeepAliveEdge->getTarget(); - KeepAliveEdge->setTarget(CURecSym); - KeepAliveAlreadyPresent = true; - if (NeedsDWARF) { - LLVM_DEBUG({ - dbgs() << " Needs DWARF: adding keep-alive edge to FDE at " - << FDE.getAddress() << "\n"; - }); - B->addEdge(Edge::KeepAlive, 0, FDE, 0); - } - } else { - if (NeedsDWARF) - return make_error<JITLinkError>( - "In " + G.getName() + ", compact unwind recard ot " + - formatv("{0:x}", B->getAddress()) + - " needs DWARF, but no FDE was found"); - } - } else { - if (NeedsDWARF) - return make_error<JITLinkError>( - "In " + G.getName() + ", compact unwind recard ot " + - formatv("{0:x}", B->getAddress()) + " needs DWARF, but no " + - EHFrameSectionName + " section exists"); - } - - if (!KeepAliveAlreadyPresent) { - // No FDE edge. We'll need to add a new edge from the function back - // to the CU record. - Fn.getBlock().addEdge(Edge::KeepAlive, 0, CURecSym, 0); - } - } - - return Error::success(); - } - - /// Process all __compact_unwind records and reserve space for __unwind_info. - Error processAndReserveUnwindInfo(LinkGraph &G) { - // Bail out early if no unwind info. - Section *CUSec = G.findSectionByName(CompactUnwindSectionName); - if (!CUSec) - return Error::success(); - - // The __LD/__compact_unwind section is only used as input for the linker. - // We'll create a new __TEXT,__unwind_info section for unwind info output. - CUSec->setMemLifetime(orc::MemLifetime::NoAlloc); - - // Find / make a mach-header to act as the base for unwind-info offsets - // (and to report the arch / subarch to libunwind). - if (auto Err = getOrCreateCompactUnwindBase(G)) - return Err; - - // Error out if there's already unwind-info in the graph: We have no idea - // how to merge unwind-info sections. 
- if (G.findSectionByName(UnwindInfoSectionName)) - return make_error<JITLinkError>("In " + G.getName() + ", " + - UnwindInfoSectionName + - " already exists"); - - // Process the __compact_unwind section to build the Records vector that - // we'll use for writing the __unwind_info section. - if (auto Err = processCompactUnwind(G, *CUSec)) - return Err; - - // Calculate the size of __unwind_info. - size_t UnwindInfoSectionSize = - UnwindInfoSectionHeaderSize + - Personalities.size() * PersonalityEntrySize + - (NumSecondLevelPages + 1) * IndexEntrySize + NumLSDAs * LSDAEntrySize + - NumSecondLevelPages * SecondLevelPageHeaderSize + - Records.size() * SecondLevelPageEntrySize; - - LLVM_DEBUG({ - dbgs() << "In " << G.getName() << ", reserving " - << formatv("{0:x}", UnwindInfoSectionSize) << " bytes for " - << UnwindInfoSectionName << "\n"; - }); - - // Create the __unwind_info section and reserve space for it. - Section &UnwindInfoSec = - G.createSection(UnwindInfoSectionName, orc::MemProt::Read); - - auto UnwindInfoSectionContent = G.allocateBuffer(UnwindInfoSectionSize); - memset(UnwindInfoSectionContent.data(), 0, UnwindInfoSectionContent.size()); - auto &B = G.createMutableContentBlock( - UnwindInfoSec, UnwindInfoSectionContent, orc::ExecutorAddr(), 8, 0); - - // Add Keep-alive edges from the __unwind_info block to all of the target - // functions. - for (auto &R : Records) - B.addEdge(Edge::KeepAlive, 0, *R.Fn, 0); - - return Error::success(); - } - - Error writeUnwindInfo(LinkGraph &G) { - Section *CUSec = G.findSectionByName(CompactUnwindSectionName); - if (!CUSec || CUSec->empty()) - return Error::success(); - - Section *UnwindInfoSec = G.findSectionByName(UnwindInfoSectionName); - if (!UnwindInfoSec) - return make_error<JITLinkError>("In " + G.getName() + ", " + - UnwindInfoSectionName + - " missing after allocation"); - - if (UnwindInfoSec->blocks_size() != 1) - return make_error<JITLinkError>( - "In " + G.getName() + ", " + UnwindInfoSectionName + - " contains more than one block post-allocation"); - - LLVM_DEBUG( - { dbgs() << "Writing unwind info for " << G.getName() << "...\n"; }); - - mergeRecords(); - - auto &UnwindInfoBlock = **UnwindInfoSec->blocks().begin(); - auto Content = UnwindInfoBlock.getMutableContent(G); - BinaryStreamWriter Writer( - {reinterpret_cast<uint8_t *>(Content.data()), Content.size()}, - CURecTraits::Endianness); - - // __unwind_info format, from mach-o/compact_unwind_encoding.h on Darwin: - // - // #define UNWIND_SECTION_VERSION 1 - // struct unwind_info_section_header - // { - // uint32_t version; // UNWIND_SECTION_VERSION - // uint32_t commonEncodingsArraySectionOffset; - // uint32_t commonEncodingsArrayCount; - // uint32_t personalityArraySectionOffset; - // uint32_t personalityArrayCount; - // uint32_t indexSectionOffset; - // uint32_t indexCount; - // // compact_unwind_encoding_t[] - // // uint32_t personalities[] - // // unwind_info_section_header_index_entry[] - // // unwind_info_section_header_lsda_index_entry[] - // }; - - if (auto Err = writeHeader(G, Writer)) - return Err; - - // Skip common encodings: JITLink doesn't use them. - - if (auto Err = writePersonalities(G, Writer)) - return Err; - - // Calculate the offset to the LSDAs. - size_t SectionOffsetToLSDAs = - Writer.getOffset() + (NumSecondLevelPages + 1) * IndexEntrySize; - - // Calculate offset to the 1st second-level page. 
- size_t SectionOffsetToSecondLevelPages = - SectionOffsetToLSDAs + NumLSDAs * LSDAEntrySize; - - if (auto Err = writeIndexes(G, Writer, SectionOffsetToLSDAs, - SectionOffsetToSecondLevelPages)) - return Err; - - if (auto Err = writeLSDAs(G, Writer)) - return Err; - - if (auto Err = writeSecondLevelPages(G, Writer)) - return Err; - - LLVM_DEBUG({ - dbgs() << " Wrote " << formatv("{0:x}", Writer.getOffset()) - << " bytes of unwind info.\n"; - }); - - return Error::success(); - } - -private: - // Calculate the size of unwind-info. - static constexpr size_t MaxPersonalities = 4; - static constexpr size_t PersonalityShift = 28; - - static constexpr size_t UnwindInfoSectionHeaderSize = 4 * 7; - static constexpr size_t PersonalityEntrySize = 4; - static constexpr size_t IndexEntrySize = 3 * 4; - static constexpr size_t LSDAEntrySize = 2 * 4; - static constexpr size_t SecondLevelPageSize = 4096; - static constexpr size_t SecondLevelPageHeaderSize = 8; - static constexpr size_t SecondLevelPageEntrySize = 8; - static constexpr size_t NumRecordsPerSecondLevelPage = - (SecondLevelPageSize - SecondLevelPageHeaderSize) / - SecondLevelPageEntrySize; - - struct CompactUnwindRecord { - Symbol *Fn = nullptr; - uint32_t Size = 0; - uint32_t Encoding = 0; - Symbol *LSDA = nullptr; - Symbol *FDE = nullptr; - }; - - Error processCompactUnwind(LinkGraph &G, Section &CUSec) { - // TODO: Reset NumLSDAs, Personalities and CompactUnwindRecords if - // processing more than once. - assert(NumLSDAs == 0 && "NumLSDAs should be zero"); - assert(Records.empty() && "CompactUnwindRecords vector should be empty."); - assert(Personalities.empty() && "Personalities vector should be empty."); - - SmallVector<CompactUnwindRecord> NonUniquedRecords; - NonUniquedRecords.reserve(CUSec.blocks_size()); - - // Process __compact_unwind blocks. - for (auto *B : CUSec.blocks()) { - CompactUnwindRecord R; - R.Encoding = CURecTraits::readEncoding(B->getContent()); - for (auto &E : B->edges()) { - switch (E.getOffset()) { - case CURecTraits::FnFieldOffset: - // This could be the function-pointer, or the FDE keep-alive. Check - // the type to decide. - if (E.getKind() == Edge::KeepAlive) - R.FDE = &E.getTarget(); - else - R.Fn = &E.getTarget(); - break; - case CURecTraits::PersonalityFieldOffset: { - // Add the Personality to the Personalities map and update the - // encoding. - size_t PersonalityIdx = 0; - for (; PersonalityIdx != Personalities.size(); ++PersonalityIdx) - if (Personalities[PersonalityIdx] == &E.getTarget()) - break; - if (PersonalityIdx == MaxPersonalities) - return make_error<JITLinkError>( - "In " + G.getName() + - ", __compact_unwind contains too many personalities (max " + - formatv("{}", MaxPersonalities) + ")"); - if (PersonalityIdx == Personalities.size()) - Personalities.push_back(&E.getTarget()); - - R.Encoding |= (PersonalityIdx + 1) << PersonalityShift; - break; - } - case CURecTraits::LSDAFieldOffset: - ++NumLSDAs; - R.LSDA = &E.getTarget(); - break; - default: - return make_error<JITLinkError>("In " + G.getName() + - ", compact unwind record at " + - formatv("{0:x}", B->getAddress()) + - " has unrecognized edge at offset " + - formatv("{0:x}", E.getOffset())); - } - } - Records.push_back(R); - } - - // Sort the records into ascending order. - llvm::sort(Records, [](const CompactUnwindRecord &LHS, - const CompactUnwindRecord &RHS) { - return LHS.Fn->getAddress() < RHS.Fn->getAddress(); - }); - - // Calculate the number of second-level pages required. 
- NumSecondLevelPages = (Records.size() + NumRecordsPerSecondLevelPage - 1) / - NumRecordsPerSecondLevelPage; - - // Convert personality symbols to GOT entry pointers. - typename CURecTraits::GOTManager GOT(G); - for (auto &Personality : Personalities) - Personality = &GOT.getEntryForTarget(G, *Personality); - - LLVM_DEBUG({ - dbgs() << " In " << G.getName() << ", " << CompactUnwindSectionName - << ": raw records = " << Records.size() - << ", personalities = " << Personalities.size() - << ", lsdas = " << NumLSDAs << "\n"; - }); - - return Error::success(); - } - - void mergeRecords() { - SmallVector<CompactUnwindRecord> NonUniqued = std::move(Records); - Records.reserve(NonUniqued.size()); - - Records.push_back(NonUniqued.front()); - for (size_t I = 1; I != NonUniqued.size(); ++I) { - auto &Next = NonUniqued[I]; - auto &Last = Records.back(); - - bool NextNeedsDWARF = CURecTraits::encodingSpecifiesDWARF(Next.Encoding); - bool CannotBeMerged = CURecTraits::encodingCannotBeMerged(Next.Encoding); - if (NextNeedsDWARF || (Next.Encoding != Last.Encoding) || - CannotBeMerged || Next.LSDA || Last.LSDA) - Records.push_back(Next); - } - - // Recalculate derived values that may have changed. - NumSecondLevelPages = (Records.size() + NumRecordsPerSecondLevelPage - 1) / - NumRecordsPerSecondLevelPage; - } - - Error writeHeader(LinkGraph &G, BinaryStreamWriter &W) { - if (!isUInt<32>(NumSecondLevelPages + 1)) - return make_error<JITLinkError>("In " + G.getName() + ", too many " + - UnwindInfoSectionName + - "second-level pages required"); - - // Write __unwind_info header. - size_t IndexArrayOffset = UnwindInfoSectionHeaderSize + - Personalities.size() * PersonalityEntrySize; - - cantFail(W.writeInteger<uint32_t>(1)); - cantFail(W.writeInteger<uint32_t>(UnwindInfoSectionHeaderSize)); - cantFail(W.writeInteger<uint32_t>(0)); - cantFail(W.writeInteger<uint32_t>(UnwindInfoSectionHeaderSize)); - cantFail(W.writeInteger<uint32_t>(Personalities.size())); - cantFail(W.writeInteger<uint32_t>(IndexArrayOffset)); - cantFail(W.writeInteger<uint32_t>(NumSecondLevelPages + 1)); - - return Error::success(); - } - - Error writePersonalities(LinkGraph &G, BinaryStreamWriter &W) { - // Write personalities. - for (auto *PSym : Personalities) { - auto Delta = PSym->getAddress() - CompactUnwindBase->getAddress(); - if (!isUInt<32>(Delta)) - return makePersonalityRangeError(G, *PSym); - cantFail(W.writeInteger<uint32_t>(Delta)); - } - return Error::success(); - } - - Error writeIndexes(LinkGraph &G, BinaryStreamWriter &W, - size_t SectionOffsetToLSDAs, - size_t SectionOffsetToSecondLevelPages) { - // Assume that function deltas are ok in this method -- we'll error - // check all of them when we write the second level pages. - - // Write the header index entries. - size_t RecordIdx = 0; - size_t NumPreviousLSDAs = 0; - for (auto &R : Records) { - // If this record marks the start of a new second level page. - if (RecordIdx % NumRecordsPerSecondLevelPage == 0) { - auto FnDelta = R.Fn->getAddress() - CompactUnwindBase->getAddress(); - auto SecondLevelPageOffset = SectionOffsetToSecondLevelPages + - (RecordIdx / NumRecordsPerSecondLevelPage); - auto LSDAOffset = - SectionOffsetToLSDAs + NumPreviousLSDAs * LSDAEntrySize; - - cantFail(W.writeInteger<uint32_t>(FnDelta)); - cantFail(W.writeInteger<uint32_t>(SecondLevelPageOffset)); - cantFail(W.writeInteger<uint32_t>(LSDAOffset)); - } - if (R.LSDA) - ++NumPreviousLSDAs; - ++RecordIdx; - } - - // Write the index array terminator. 
- { - auto FnEndDelta = - Records.back().Fn->getRange().End - CompactUnwindBase->getAddress(); - - if (LLVM_UNLIKELY(!isUInt<32>(FnEndDelta))) - return make_error<JITLinkError>( - "In " + G.getName() + " " + UnwindInfoSectionName + - ", delta to end of functions " + - formatv("{0:x}", Records.back().Fn->getRange().End) + - " exceeds 32 bits"); - - cantFail(W.writeInteger<uint32_t>(FnEndDelta)); - cantFail(W.writeInteger<uint32_t>(0)); - cantFail(W.writeInteger<uint32_t>(SectionOffsetToSecondLevelPages)); - } - - return Error::success(); - } - - Error writeLSDAs(LinkGraph &G, BinaryStreamWriter &W) { - // As with writeIndexes, assume that function deltas are ok for now. - for (auto &R : Records) { - if (R.LSDA) { - auto FnDelta = R.Fn->getAddress() - CompactUnwindBase->getAddress(); - auto LSDADelta = R.LSDA->getAddress() - CompactUnwindBase->getAddress(); - - if (LLVM_UNLIKELY(!isUInt<32>(LSDADelta))) - return make_error<JITLinkError>( - "In " + G.getName() + " " + UnwindInfoSectionName + - ", delta to lsda at " + formatv("{0:x}", R.LSDA->getAddress()) + - " exceeds 32 bits"); - - cantFail(W.writeInteger<uint32_t>(FnDelta)); - cantFail(W.writeInteger<uint32_t>(LSDADelta)); - } - } - - return Error::success(); - } - - Error writeSecondLevelPages(LinkGraph &G, BinaryStreamWriter &W) { - size_t RecordIdx = 0; - - for (auto &R : Records) { - // When starting a new second-level page, write the page header: - // - // 2 : uint32_t -- UNWIND_SECOND_LEVEL_REGULAR - // 8 : uint16_t -- size of second level page table header - // count : uint16_t -- num entries in this second-level page - if (RecordIdx % NumRecordsPerSecondLevelPage == 0) { - constexpr uint32_t SecondLevelPageHeaderKind = 2; - constexpr uint16_t SecondLevelPageHeaderSize = 8; - uint16_t SecondLevelPageNumEntries = - std::min(Records.size() - RecordIdx, NumRecordsPerSecondLevelPage); - - cantFail(W.writeInteger<uint32_t>(SecondLevelPageHeaderKind)); - cantFail(W.writeInteger<uint16_t>(SecondLevelPageHeaderSize)); - cantFail(W.writeInteger<uint16_t>(SecondLevelPageNumEntries)); - } - - // Write entry. 
- auto FnDelta = R.Fn->getAddress() - CompactUnwindBase->getAddress(); - - if (LLVM_UNLIKELY(!isUInt<32>(FnDelta))) - return make_error<JITLinkError>( - "In " + G.getName() + " " + UnwindInfoSectionName + - ", delta to function at " + formatv("{0:x}", R.Fn->getAddress()) + - " exceeds 32 bits"); - - cantFail(W.writeInteger<uint32_t>(FnDelta)); - cantFail(W.writeInteger<uint32_t>(R.Encoding)); - - ++RecordIdx; - } - - return Error::success(); - } - - Error getOrCreateCompactUnwindBase(LinkGraph &G) { - auto Name = G.intern("__jitlink$libunwind_dso_base"); - CompactUnwindBase = G.findAbsoluteSymbolByName(Name); - if (!CompactUnwindBase) { - if (auto LocalCUBase = getOrCreateLocalMachOHeader(G)) { - CompactUnwindBase = &*LocalCUBase; - auto &B = LocalCUBase->getBlock(); - G.addDefinedSymbol(B, 0, *Name, B.getSize(), Linkage::Strong, - Scope::Local, false, true); - } else - return LocalCUBase.takeError(); - } - CompactUnwindBase->setLive(true); - return Error::success(); - } - - Error makePersonalityRangeError(LinkGraph &G, Symbol &PSym) { - std::string ErrMsg; - { - raw_string_ostream ErrStream(ErrMsg); - ErrStream << "In " << G.getName() << " " << UnwindInfoSectionName - << ", personality "; - if (PSym.hasName()) - ErrStream << PSym.getName() << " "; - ErrStream << "at " << PSym.getAddress() - << " is out of 32-bit delta range of compact-unwind base at " - << CompactUnwindBase->getAddress(); - } - return make_error<JITLinkError>(std::move(ErrMsg)); - } - - StringRef CompactUnwindSectionName; - StringRef UnwindInfoSectionName; - StringRef EHFrameSectionName; - Symbol *CompactUnwindBase = nullptr; - - size_t NumLSDAs = 0; - size_t NumSecondLevelPages = 0; - SmallVector<Symbol *, MaxPersonalities> Personalities; - SmallVector<CompactUnwindRecord> Records; -}; - -} // end namespace jitlink -} // end namespace llvm - -#undef DEBUG_TYPE - -#endif // LIB_EXECUTIONENGINE_JITLINK_COMPACTUNWINDSUPPORTIMPL_H diff --git llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp index 179e458c3cd1..3e757f780b55 100644 --- llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp +++ llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp @@ -733,5 +733,121 @@ Error MachOLinkGraphBuilder::graphifyCStringSection( return Error::success(); } +Error CompactUnwindSplitter::operator()(LinkGraph &G) { + auto *CUSec = G.findSectionByName(CompactUnwindSectionName); + if (!CUSec) + return Error::success(); + + if (!G.getTargetTriple().isOSBinFormatMachO()) + return make_error<JITLinkError>( + "Error linking " + G.getName() + + ": compact unwind splitting not supported on non-macho target " + + G.getTargetTriple().str()); + + unsigned CURecordSize = 0; + unsigned PersonalityEdgeOffset = 0; + unsigned LSDAEdgeOffset = 0; + switch (G.getTargetTriple().getArch()) { + case Triple::aarch64: + case Triple::x86_64: + // 64-bit compact-unwind record format: + // Range start: 8 bytes. + // Range size: 4 bytes. + // CU encoding: 4 bytes. + // Personality: 8 bytes. + // LSDA: 8 bytes. 
+ CURecordSize = 32; + PersonalityEdgeOffset = 16; + LSDAEdgeOffset = 24; + break; + default: + return make_error<JITLinkError>( + "Error linking " + G.getName() + + ": compact unwind splitting not supported on " + + G.getTargetTriple().getArchName()); + } + + std::vector<Block *> OriginalBlocks(CUSec->blocks().begin(), + CUSec->blocks().end()); + LLVM_DEBUG({ + dbgs() << "In " << G.getName() << " splitting compact unwind section " + << CompactUnwindSectionName << " containing " + << OriginalBlocks.size() << " initial blocks...\n"; + }); + + while (!OriginalBlocks.empty()) { + auto *B = OriginalBlocks.back(); + OriginalBlocks.pop_back(); + + if (B->getSize() == 0) { + LLVM_DEBUG({ + dbgs() << " Skipping empty block at " + << formatv("{0:x16}", B->getAddress()) << "\n"; + }); + continue; + } + + unsigned NumBlocks = B->getSize() / CURecordSize; + + LLVM_DEBUG({ + dbgs() << " Splitting block at " << formatv("{0:x16}", B->getAddress()) + << " into " << NumBlocks << " compact unwind record(s)\n"; + }); + + if (B->getSize() % CURecordSize) + return make_error<JITLinkError>( + "Error splitting compact unwind record in " + G.getName() + + ": block at " + formatv("{0:x}", B->getAddress()) + " has size " + + formatv("{0:x}", B->getSize()) + + " (not a multiple of CU record size of " + + formatv("{0:x}", CURecordSize) + ")"); + + auto Blocks = + G.splitBlock(*B, map_range(seq(1U, NumBlocks), [=](Edge::OffsetT Idx) { + return Idx * CURecordSize; + })); + + for (auto *CURec : Blocks) { + bool AddedKeepAlive = false; + + for (auto &E : CURec->edges()) { + if (E.getOffset() == 0) { + LLVM_DEBUG({ + dbgs() << " Updating compact unwind record at " + << CURec->getAddress() << " to point to " + << (E.getTarget().hasName() ? *E.getTarget().getName() + : StringRef()) + << " (at " << E.getTarget().getAddress() << ")\n"; + }); + + if (E.getTarget().isExternal()) + return make_error<JITLinkError>( + "Error adding keep-alive edge for compact unwind record at " + + formatv("{0:x}", CURec->getAddress()) + ": target " + + *E.getTarget().getName() + " is an external symbol"); + auto &TgtBlock = E.getTarget().getBlock(); + auto &CURecSym = + G.addAnonymousSymbol(*CURec, 0, CURecordSize, false, false); + TgtBlock.addEdge(Edge::KeepAlive, 0, CURecSym, 0); + AddedKeepAlive = true; + } else if (E.getOffset() != PersonalityEdgeOffset && + E.getOffset() != LSDAEdgeOffset) + return make_error<JITLinkError>( + "Unexpected edge at offset " + formatv("{0:x}", E.getOffset()) + + " in compact unwind record at " + + formatv("{0:x}", CURec->getAddress())); + } + + if (!AddedKeepAlive) + return make_error<JITLinkError>( + "Error adding keep-alive edge for compact unwind record at " + + formatv("{0:x}", CURec->getAddress()) + + ": no outgoing target edge at offset 0"); + } + } + + return Error::success(); +} + } // end namespace jitlink } // end namespace llvm diff --git llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.h llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.h index 343218ec9ad1..6afa01250f62 100644 --- llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.h +++ llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.h @@ -236,6 +236,17 @@ private: StringMap<SectionParserFunction> CustomSectionParserFunctions; }; +/// A pass to split up __LD,__compact_unwind sections. 
+class CompactUnwindSplitter { +public: + CompactUnwindSplitter(StringRef CompactUnwindSectionName) + : CompactUnwindSectionName(CompactUnwindSectionName) {} + Error operator()(LinkGraph &G); + +private: + StringRef CompactUnwindSectionName; +}; + } // end namespace jitlink } // end namespace llvm diff --git llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp index f9f2f4ebb2c8..29061fff9c2a 100644 --- llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp +++ llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp @@ -14,7 +14,6 @@ #include "llvm/ExecutionEngine/JITLink/DWARFRecordSectionSplitter.h" #include "llvm/ExecutionEngine/JITLink/aarch64.h" -#include "CompactUnwindSupport.h" #include "DefineExternalSectionStartAndEndSymbols.h" #include "MachOLinkGraphBuilder.h" @@ -626,27 +625,6 @@ static Error applyPACSigningToModInitPointers(LinkGraph &G) { return Error::success(); } -struct CompactUnwindTraits_MachO_arm64 - : public CompactUnwindTraits<CompactUnwindTraits_MachO_arm64, - /* PointerSize = */ 8> { - // FIXME: Reinstate once we no longer need the MSVC workaround. See - // FIXME for CompactUnwindTraits in CompactUnwindSupport.h. - // constexpr static size_t PointerSize = 8; - - constexpr static endianness Endianness = endianness::little; - - constexpr static uint32_t EncodingModeMask = 0x0f000000; - - using GOTManager = aarch64::GOTTableManager; - - static bool encodingSpecifiesDWARF(uint32_t Encoding) { - constexpr uint32_t DWARFMode = 0x03000000; - return (Encoding & EncodingModeMask) == DWARFMode; - } - - static bool encodingCannotBeMerged(uint32_t Encoding) { return false; } -}; - void link_MachO_arm64(std::unique_ptr<LinkGraph> G, std::unique_ptr<JITLinkContext> Ctx) { @@ -659,21 +637,16 @@ void link_MachO_arm64(std::unique_ptr<LinkGraph> G, else Config.PrePrunePasses.push_back(markAllSymbolsLive); + // Add compact unwind splitter pass. + Config.PrePrunePasses.push_back( + CompactUnwindSplitter("__LD,__compact_unwind")); + // Add eh-frame passes. + // FIXME: Prune eh-frames for which compact-unwind is available once + // we support compact-unwind registration with libunwind. Config.PrePrunePasses.push_back(createEHFrameSplitterPass_MachO_arm64()); Config.PrePrunePasses.push_back(createEHFrameEdgeFixerPass_MachO_arm64()); - // Create a compact-unwind manager for use in passes below. - auto CompactUnwindMgr = - std::make_shared<CompactUnwindManager<CompactUnwindTraits_MachO_arm64>>( - "__LD,__compact_unwind", "__TEXT,__unwind_info", - "__TEXT,__eh_frame"); - - // Add compact unwind prepare pass. - Config.PrePrunePasses.push_back([CompactUnwindMgr](LinkGraph &G) { - return CompactUnwindMgr->prepareForPrune(G); - }); - // Resolve any external section start / end symbols. Config.PostAllocationPasses.push_back( createDefineExternalSectionStartAndEndSymbolsPass( @@ -690,16 +663,6 @@ void link_MachO_arm64(std::unique_ptr<LinkGraph> G, Config.PreFixupPasses.push_back( aarch64::lowerPointer64AuthEdgesToSigningFunction); } - - // Reserve unwind-info space. - Config.PostPrunePasses.push_back([CompactUnwindMgr](LinkGraph &G) { - return CompactUnwindMgr->processAndReserveUnwindInfo(G); - }); - - // Translate compact-unwind to unwind-info. 
- Config.PreFixupPasses.push_back([CompactUnwindMgr](LinkGraph &G) { - return CompactUnwindMgr->writeUnwindInfo(G); - }); } if (auto Err = Ctx->modifyPassConfig(*G, Config)) diff --git llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp index 218f8ce97ef0..9547266dc978 100644 --- llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp +++ llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp @@ -14,7 +14,6 @@ #include "llvm/ExecutionEngine/JITLink/DWARFRecordSectionSplitter.h" #include "llvm/ExecutionEngine/JITLink/x86_64.h" -#include "CompactUnwindSupport.h" #include "DefineExternalSectionStartAndEndSymbols.h" #include "MachOLinkGraphBuilder.h" @@ -501,55 +500,25 @@ Expected<std::unique_ptr<LinkGraph>> createLinkGraphFromMachOObject_x86_64( .buildGraph(); } -struct CompactUnwindTraits_MachO_x86_64 - : public CompactUnwindTraits<CompactUnwindTraits_MachO_x86_64, - /* PointerSize = */ 8> { - // FIXME: Reinstate once we no longer need the MSVC workaround. See - // FIXME for CompactUnwindTraits in CompactUnwindSupport.h. - // constexpr static size_t PointerSize = 8; - - constexpr static endianness Endianness = endianness::little; - - constexpr static uint32_t EncodingModeMask = 0x0f000000; - - using GOTManager = x86_64::GOTTableManager; - - static bool encodingSpecifiesDWARF(uint32_t Encoding) { - constexpr uint32_t DWARFMode = 0x04000000; - return (Encoding & EncodingModeMask) == DWARFMode; - } - - static bool encodingCannotBeMerged(uint32_t Encoding) { - constexpr uint32_t StackIndirectMode = 0x03000000; - return (Encoding & EncodingModeMask) == StackIndirectMode; - } -}; - void link_MachO_x86_64(std::unique_ptr<LinkGraph> G, std::unique_ptr<JITLinkContext> Ctx) { PassConfiguration Config; if (Ctx->shouldAddDefaultTargetPasses(G->getTargetTriple())) { - // Add a mark-live pass. - if (auto MarkLive = Ctx->getMarkLivePass(G->getTargetTriple())) - Config.PrePrunePasses.push_back(std::move(MarkLive)); - else - Config.PrePrunePasses.push_back(markAllSymbolsLive); - // Add eh-frame passes. Config.PrePrunePasses.push_back(createEHFrameSplitterPass_MachO_x86_64()); Config.PrePrunePasses.push_back(createEHFrameEdgeFixerPass_MachO_x86_64()); - // Create a compact-unwind manager for use in passes below. - auto CompactUnwindMgr = std::make_shared< - CompactUnwindManager<CompactUnwindTraits_MachO_x86_64>>( - "__LD,__compact_unwind", "__TEXT,__unwind_info", "__TEXT,__eh_frame"); + // Add compact unwind splitter pass. + Config.PrePrunePasses.push_back( + CompactUnwindSplitter("__LD,__compact_unwind")); - // Add compact unwind prepare pass. - Config.PrePrunePasses.push_back([CompactUnwindMgr](LinkGraph &G) { - return CompactUnwindMgr->prepareForPrune(G); - }); + // Add a mark-live pass. + if (auto MarkLive = Ctx->getMarkLivePass(G->getTargetTriple())) + Config.PrePrunePasses.push_back(std::move(MarkLive)); + else + Config.PrePrunePasses.push_back(markAllSymbolsLive); // Resolve any external section start / end symbols. Config.PostAllocationPasses.push_back( @@ -559,16 +528,6 @@ void link_MachO_x86_64(std::unique_ptr<LinkGraph> G, // Add an in-place GOT/Stubs pass. Config.PostPrunePasses.push_back(buildGOTAndStubs_MachO_x86_64); - // Reserve space for unwind-info. - Config.PostPrunePasses.push_back([CompactUnwindMgr](LinkGraph &G) { - return CompactUnwindMgr->processAndReserveUnwindInfo(G); - }); - - // Translate compact-unwind to unwind-info. 
- Config.PreFixupPasses.push_back([CompactUnwindMgr](LinkGraph &G) { - return CompactUnwindMgr->writeUnwindInfo(G); - }); - // Add GOT/Stubs optimizer pass. Config.PreFixupPasses.push_back(x86_64::optimizeGOTAndStubAccesses); } diff --git llvm/lib/ExecutionEngine/Orc/CMakeLists.txt llvm/lib/ExecutionEngine/Orc/CMakeLists.txt index 8a866294eee2..2ab5d6dd39b6 100644 --- llvm/lib/ExecutionEngine/Orc/CMakeLists.txt +++ llvm/lib/ExecutionEngine/Orc/CMakeLists.txt @@ -57,7 +57,6 @@ add_llvm_component_library(LLVMOrcJIT ExecutorProcessControl.cpp TaskDispatch.cpp ThreadSafeModule.cpp - UnwindInfoRegistrationPlugin.cpp RedirectionManager.cpp JITLinkRedirectableSymbolManager.cpp ReOptimizeLayer.cpp diff --git llvm/lib/ExecutionEngine/Orc/CompileUtils.cpp llvm/lib/ExecutionEngine/Orc/CompileUtils.cpp index c4d65af1b57f..5d2f3cd4a8be 100644 --- llvm/lib/ExecutionEngine/Orc/CompileUtils.cpp +++ llvm/lib/ExecutionEngine/Orc/CompileUtils.cpp @@ -33,9 +33,6 @@ irManglingOptionsFromTargetOptions(const TargetOptions &Opts) { /// Compile a Module to an ObjectFile. Expected<SimpleCompiler::CompileResult> SimpleCompiler::operator()(Module &M) { - if (M.getDataLayout().isDefault()) - M.setDataLayout(TM.createDataLayout()); - CompileResult CachedObject = tryToLoadFromObjectCache(M); if (CachedObject) return std::move(CachedObject); diff --git llvm/lib/ExecutionEngine/Orc/Core.cpp llvm/lib/ExecutionEngine/Orc/Core.cpp index 9f466e725668..d47eb4416d3c 100644 --- llvm/lib/ExecutionEngine/Orc/Core.cpp +++ llvm/lib/ExecutionEngine/Orc/Core.cpp @@ -1251,7 +1251,9 @@ JITDylib::JITDylib(ExecutionSession &ES, std::string Name) LinkOrder.push_back({this, JITDylibLookupFlags::MatchAllSymbols}); } -JITDylib::RemoveTrackerResult JITDylib::IL_removeTracker(ResourceTracker &RT) { +std::pair<JITDylib::AsynchronousSymbolQuerySet, + std::shared_ptr<SymbolDependenceMap>> +JITDylib::IL_removeTracker(ResourceTracker &RT) { // Note: Should be called under the session lock. assert(State != Closed && "JD is defunct"); @@ -1290,10 +1292,7 @@ JITDylib::RemoveTrackerResult JITDylib::IL_removeTracker(ResourceTracker &RT) { SymbolsToFail.push_back(Sym); } - auto [QueriesToFail, FailedSymbols] = - ES.IL_failSymbols(*this, std::move(SymbolsToFail)); - - std::vector<std::unique_ptr<MaterializationUnit>> DefunctMUs; + auto Result = ES.IL_failSymbols(*this, std::move(SymbolsToFail)); // Removed symbols should be taken out of the table altogether. for (auto &Sym : SymbolsToRemove) { @@ -1303,12 +1302,7 @@ JITDylib::RemoveTrackerResult JITDylib::IL_removeTracker(ResourceTracker &RT) { // Remove Materializer if present. if (I->second.hasMaterializerAttached()) { // FIXME: Should this discard the symbols? 
- auto J = UnmaterializedInfos.find(Sym); - assert(J != UnmaterializedInfos.end() && - "Symbol table indicates MU present, but no UMI record"); - if (J->second->MU) - DefunctMUs.push_back(std::move(J->second->MU)); - UnmaterializedInfos.erase(J); + UnmaterializedInfos.erase(Sym); } else { assert(!UnmaterializedInfos.count(Sym) && "Symbol has materializer attached"); @@ -1319,8 +1313,7 @@ JITDylib::RemoveTrackerResult JITDylib::IL_removeTracker(ResourceTracker &RT) { shrinkMaterializationInfoMemory(); - return {std::move(QueriesToFail), std::move(FailedSymbols), - std::move(DefunctMUs)}; + return Result; } void JITDylib::transferTracker(ResourceTracker &DstRT, ResourceTracker &SrcRT) { @@ -2187,17 +2180,16 @@ Error ExecutionSession::removeResourceTracker(ResourceTracker &RT) { }); std::vector<ResourceManager *> CurrentResourceManagers; - JITDylib::RemoveTrackerResult R; + JITDylib::AsynchronousSymbolQuerySet QueriesToFail; + std::shared_ptr<SymbolDependenceMap> FailedSymbols; runSessionLocked([&] { CurrentResourceManagers = ResourceManagers; RT.makeDefunct(); - R = RT.getJITDylib().IL_removeTracker(RT); + std::tie(QueriesToFail, FailedSymbols) = + RT.getJITDylib().IL_removeTracker(RT); }); - // Release any defunct MaterializationUnits. - R.DefunctMUs.clear(); - Error Err = Error::success(); auto &JD = RT.getJITDylib(); @@ -2205,9 +2197,9 @@ Error ExecutionSession::removeResourceTracker(ResourceTracker &RT) { Err = joinErrors(std::move(Err), L->handleRemoveResources(JD, RT.getKeyUnsafe())); - for (auto &Q : R.QueriesToFail) - Q->handleFailed(make_error<FailedToMaterialize>(getSymbolStringPool(), - R.FailedSymbols)); + for (auto &Q : QueriesToFail) + Q->handleFailed( + make_error<FailedToMaterialize>(getSymbolStringPool(), FailedSymbols)); return Err; } diff --git llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp index aae7369fc29c..7f0a45941cf9 100644 --- llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp +++ llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp @@ -1,5 +1,4 @@ -//===------ ELFNixPlatform.cpp - Utilities for executing ELFNix in Orc -//-----===// +//===----- ELFNixPlatform.cpp - Utilities for executing ELFNix in Orc -----===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp index b51fa24be76d..aa799687e6d5 100644 --- llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp +++ llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp @@ -45,7 +45,6 @@ SelfExecutorProcessControl::SelfExecutorProcessControl( this->DylibMgr = this; this->JDI = {ExecutorAddr::fromPtr(jitDispatchViaWrapperFunctionManager), ExecutorAddr::fromPtr(this)}; - if (this->TargetTriple.isOSBinFormatMachO()) GlobalManglingPrefix = '_'; @@ -53,12 +52,6 @@ SelfExecutorProcessControl::SelfExecutorProcessControl( ExecutorAddr::fromPtr(&llvm_orc_registerEHFrameSectionWrapper); this->BootstrapSymbols[rt::DeregisterEHFrameSectionWrapperName] = ExecutorAddr::fromPtr(&llvm_orc_deregisterEHFrameSectionWrapper); - -#ifdef __APPLE__ - this->UnwindInfoMgr = UnwindInfoManager::TryCreate(); - if (this->UnwindInfoMgr) - this->UnwindInfoMgr->addBootstrapSymbols(this->BootstrapSymbols); -#endif // __APPLE__ } Expected<std::unique_ptr<SelfExecutorProcessControl>> diff --git llvm/lib/ExecutionEngine/Orc/LLJIT.cpp llvm/lib/ExecutionEngine/Orc/LLJIT.cpp index ab7f854187b3..80500d0fdd9b 100644 --- llvm/lib/ExecutionEngine/Orc/LLJIT.cpp +++ llvm/lib/ExecutionEngine/Orc/LLJIT.cpp @@ -21,7 +21,6 @@ #include "llvm/ExecutionEngine/Orc/ObjectTransformLayer.h" #include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h" #include "llvm/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.h" -#include "llvm/ExecutionEngine/Orc/UnwindInfoRegistrationPlugin.h" #include "llvm/ExecutionEngine/SectionMemoryManager.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" @@ -1221,28 +1220,12 @@ Expected<JITDylibSP> setUpGenericLLVMIRPlatform(LLJIT &J) { if (auto *OLL = dyn_cast<ObjectLinkingLayer>(&J.getObjLinkingLayer())) { - bool CompactUnwindInfoSupported = false; - - // Enable compact-unwind support if possible. - if (J.getTargetTriple().isOSDarwin() || - J.getTargetTriple().isOSBinFormatMachO()) { - if (auto UIRP = UnwindInfoRegistrationPlugin::Create( - J.getIRCompileLayer(), PlatformJD)) { - CompactUnwindInfoSupported = true; - OLL->addPlugin(std::move(*UIRP)); - } else - consumeError(UIRP.takeError()); - } - - // Otherwise fall back to standard unwind registration. 
- if (!CompactUnwindInfoSupported) { - auto &ES = J.getExecutionSession(); - if (auto EHFrameRegistrar = EPCEHFrameRegistrar::Create(ES)) - OLL->addPlugin(std::make_unique<EHFrameRegistrationPlugin>( - ES, std::move(*EHFrameRegistrar))); - else - return EHFrameRegistrar.takeError(); - } + auto &ES = J.getExecutionSession(); + if (auto EHFrameRegistrar = EPCEHFrameRegistrar::Create(ES)) + OLL->addPlugin(std::make_unique<EHFrameRegistrationPlugin>( + ES, std::move(*EHFrameRegistrar))); + else + return EHFrameRegistrar.takeError(); } J.setPlatformSupport( diff --git llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp index 48d54190fafb..9479a69d4f0b 100644 --- llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp +++ llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp @@ -1278,8 +1278,7 @@ MachOPlatform::MachOPlatformPlugin::findUnwindSectionInfo( if (Section *EHFrameSec = G.findSectionByName(MachOEHFrameSectionName)) ScanUnwindInfoSection(*EHFrameSec, US.DwarfSection); - if (Section *CUInfoSec = - G.findSectionByName(MachOCompactUnwindInfoSectionName)) + if (Section *CUInfoSec = G.findSectionByName(MachOCompactUnwindSectionName)) ScanUnwindInfoSection(*CUInfoSec, US.CompactUnwindSection); // If we didn't find any pointed-to code-blocks then there's no need to diff --git llvm/lib/ExecutionEngine/Orc/Shared/MachOObjectFormat.cpp llvm/lib/ExecutionEngine/Orc/Shared/MachOObjectFormat.cpp index 11e8eb7bc3a1..be92acd37aa8 100644 --- llvm/lib/ExecutionEngine/Orc/Shared/MachOObjectFormat.cpp +++ llvm/lib/ExecutionEngine/Orc/Shared/MachOObjectFormat.cpp @@ -18,7 +18,7 @@ namespace orc { StringRef MachODataCommonSectionName = "__DATA,__common"; StringRef MachODataDataSectionName = "__DATA,__data"; StringRef MachOEHFrameSectionName = "__TEXT,__eh_frame"; -StringRef MachOCompactUnwindInfoSectionName = "__TEXT,__unwind_info"; +StringRef MachOCompactUnwindSectionName = "__TEXT,__unwind_info"; StringRef MachOCStringSectionName = "__TEXT,__cstring"; StringRef MachOModInitFuncSectionName = "__DATA,__mod_init_func"; StringRef MachOObjCCatListSectionName = "__DATA,__objc_catlist"; diff --git llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp index fef3ff989a52..54a25c007c58 100644 --- llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp +++ llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp @@ -64,19 +64,5 @@ const char *RunAsIntFunctionWrapperName = "__llvm_orc_bootstrap_run_as_int_function_wrapper"; } // end namespace rt -namespace rt_alt { -const char *UnwindInfoManagerInstanceName = - "orc_rt_alt_UnwindInfoManager_Instance"; -const char *UnwindInfoManagerFindSectionsHelperName = - "orc_rt_alt_UnwindInfoManager_findSectionsHelper"; -const char *UnwindInfoManagerEnableWrapperName = - "orc_rt_alt_UnwindInfoManager_enable"; -const char *UnwindInfoManagerDisableWrapperName = - "orc_rt_alt_UnwindInfoManager_disable"; -const char *UnwindInfoManagerRegisterActionName = - "orc_rt_alt_UnwindInfoManager_register"; -const char *UnwindInfoManagerDeregisterActionName = - "orc_rt_alt_UnwindInfoManager_deregister"; -} // end namespace rt_alt } // end namespace orc } // end namespace llvm diff --git llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt index ffc1bbfa121b..03677d610cbb 100644 --- llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt +++ llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt @@ -6,7 +6,10 @@ set(intel_jit_profiling ) if( 
LLVM_USE_INTEL_JITEVENTS ) set(intel_jit_profiling IntelJITProfiling) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../IntelJITProfiling) - include_directories(${PROJECT_BINARY_DIR}/ittapi/include/ ) + if(NOT DEFINED ITTAPI_SOURCE_DIR) + set(ITTAPI_SOURCE_DIR ${PROJECT_BINARY_DIR}) + endif() + include_directories(${ITTAPI_SOURCE_DIR}/ittapi/include/ ) endif() add_llvm_component_library(LLVMOrcTargetProcess @@ -20,7 +23,6 @@ add_llvm_component_library(LLVMOrcTargetProcess SimpleExecutorMemoryManager.cpp SimpleRemoteEPCServer.cpp TargetExecutionUtils.cpp - UnwindInfoManager.cpp ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/ExecutionEngine/Orc diff --git llvm/lib/ExecutionEngine/Orc/TargetProcess/UnwindInfoManager.cpp llvm/lib/ExecutionEngine/Orc/TargetProcess/UnwindInfoManager.cpp deleted file mode 100644 index 68bba9520c19..000000000000 --- llvm/lib/ExecutionEngine/Orc/TargetProcess/UnwindInfoManager.cpp +++ /dev/null @@ -1,188 +0,0 @@ -//===------- UnwindInfoManager.cpp - Register unwind info sections --------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "llvm/ExecutionEngine/Orc/TargetProcess/UnwindInfoManager.h" -#include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h" -#include "llvm/ExecutionEngine/Orc/Shared/WrapperFunctionUtils.h" -#include "llvm/Support/DynamicLibrary.h" - -#define DEBUG_TYPE "orc" - -using namespace llvm; -using namespace llvm::orc; -using namespace llvm::orc::shared; - -static orc::shared::CWrapperFunctionResult -llvm_orc_rt_alt_UnwindInfoManager_enable(const char *Data, uint64_t Size) { - return WrapperFunction<SPSError(SPSExecutorAddr, SPSExecutorAddr)>::handle( - Data, Size, - [](ExecutorAddr Instance, ExecutorAddr FindFn) { - return Instance.toPtr<UnwindInfoManager *>()->enable( - FindFn.toPtr<void *>()); - }) - .release(); -} - -static orc::shared::CWrapperFunctionResult -llvm_orc_rt_alt_UnwindInfoManager_disable(const char *Data, uint64_t Size) { - return WrapperFunction<SPSError(SPSExecutorAddr)>::handle( - Data, Size, - [](ExecutorAddr Instance) { - return Instance.toPtr<UnwindInfoManager *>()->disable(); - }) - .release(); -} - -static orc::shared::CWrapperFunctionResult -llvm_orc_rt_alt_UnwindInfoManager_register(const char *Data, uint64_t Size) { - using SPSSig = - SPSError(SPSExecutorAddr, SPSSequence<SPSExecutorAddrRange>, - SPSExecutorAddr, SPSExecutorAddrRange, SPSExecutorAddrRange); - - return WrapperFunction<SPSSig>::handle( - Data, Size, - [](ExecutorAddr Instance, - std::vector<ExecutorAddrRange> CodeRanges, ExecutorAddr DSOBase, - ExecutorAddrRange DWARFRange, - ExecutorAddrRange CompactUnwindRange) { - return Instance.toPtr<UnwindInfoManager *>()->registerSections( - CodeRanges, DSOBase, DWARFRange, CompactUnwindRange); - }) - .release(); -} - -static orc::shared::CWrapperFunctionResult -llvm_orc_rt_alt_UnwindInfoManager_deregister(const char *Data, uint64_t Size) { - using SPSSig = SPSError(SPSExecutorAddr, SPSSequence<SPSExecutorAddrRange>); - - return WrapperFunction<SPSSig>::handle( - Data, Size, - [](ExecutorAddr Instance, - std::vector<ExecutorAddrRange> CodeRanges) { - return Instance.toPtr<UnwindInfoManager *>()->deregisterSections( - CodeRanges); - }) - .release(); -} - -namespace llvm::orc { - -const char *UnwindInfoManager::AddFnName = - 
"__unw_add_find_dynamic_unwind_sections"; -const char *UnwindInfoManager::RemoveFnName = - "__unw_remove_find_dynamic_unwind_sections"; - -std::unique_ptr<UnwindInfoManager> UnwindInfoManager::TryCreate() { - std::string ErrMsg; - auto DL = sys::DynamicLibrary::getPermanentLibrary(nullptr, &ErrMsg); - if (!DL.isValid()) - return nullptr; - - auto AddFindDynamicUnwindSections = - (int (*)(void *))DL.getAddressOfSymbol(AddFnName); - if (!AddFindDynamicUnwindSections) - return nullptr; - - auto RemoveFindDynamicUnwindSections = - (int (*)(void *))DL.getAddressOfSymbol(RemoveFnName); - if (!RemoveFindDynamicUnwindSections) - return nullptr; - - return std::unique_ptr<UnwindInfoManager>(new UnwindInfoManager( - AddFindDynamicUnwindSections, RemoveFindDynamicUnwindSections)); -} - -Error UnwindInfoManager::shutdown() { return Error::success(); } - -void UnwindInfoManager::addBootstrapSymbols(StringMap<ExecutorAddr> &M) { - M[rt_alt::UnwindInfoManagerInstanceName] = ExecutorAddr::fromPtr(this); - M[rt_alt::UnwindInfoManagerFindSectionsHelperName] = - ExecutorAddr::fromPtr(&findSectionsHelper); - M[rt_alt::UnwindInfoManagerEnableWrapperName] = - ExecutorAddr::fromPtr(llvm_orc_rt_alt_UnwindInfoManager_enable); - M[rt_alt::UnwindInfoManagerDisableWrapperName] = - ExecutorAddr::fromPtr(llvm_orc_rt_alt_UnwindInfoManager_disable); - M[rt_alt::UnwindInfoManagerRegisterActionName] = - ExecutorAddr::fromPtr(llvm_orc_rt_alt_UnwindInfoManager_register); - M[rt_alt::UnwindInfoManagerDeregisterActionName] = - ExecutorAddr::fromPtr(llvm_orc_rt_alt_UnwindInfoManager_deregister); -} - -Error UnwindInfoManager::enable(void *FindDynamicUnwindSections) { - LLVM_DEBUG(dbgs() << "Enabling UnwindInfoManager.\n"); - - if (auto Err = AddFindDynamicUnwindSections(FindDynamicUnwindSections)) - return make_error<StringError>(Twine("Could not register function via ") + - AddFnName + - ", error code = " + Twine(Err), - inconvertibleErrorCode()); - - this->FindDynamicUnwindSections = FindDynamicUnwindSections; - return Error::success(); -} - -Error UnwindInfoManager::disable(void) { - LLVM_DEBUG(dbgs() << "Disabling UnwindInfoManager.\n"); - - if (FindDynamicUnwindSections) - if (auto Err = RemoveFindDynamicUnwindSections(FindDynamicUnwindSections)) - return make_error<StringError>( - Twine("Could not deregister function via ") + RemoveFnName + - "error code = " + Twine(Err), - inconvertibleErrorCode()); - - FindDynamicUnwindSections = nullptr; - return Error::success(); -} - -Error UnwindInfoManager::registerSections( - ArrayRef<ExecutorAddrRange> CodeRanges, ExecutorAddr DSOBase, - ExecutorAddrRange DWARFEHFrame, ExecutorAddrRange CompactUnwind) { - std::lock_guard<std::mutex> Lock(M); - for (auto &R : CodeRanges) - UWSecs[R.Start.getValue()] = UnwindSections{ - static_cast<uintptr_t>(DSOBase.getValue()), - static_cast<uintptr_t>(DWARFEHFrame.Start.getValue()), - static_cast<size_t>(DWARFEHFrame.size()), - static_cast<uintptr_t>(CompactUnwind.Start.getValue()), - static_cast<size_t>(CompactUnwind.size())}; - return Error::success(); -} - -Error UnwindInfoManager::deregisterSections( - ArrayRef<ExecutorAddrRange> CodeRanges) { - std::lock_guard<std::mutex> Lock(M); - for (auto &R : CodeRanges) { - auto I = UWSecs.find(R.Start.getValue()); - if (I == UWSecs.end()) - return make_error<StringError>( - "No unwind-info sections registered for range " + - formatv("{0:x} - {1:x}", R.Start, R.End), - inconvertibleErrorCode()); - UWSecs.erase(I); - } - return Error::success(); -} - -int UnwindInfoManager::findSections(uintptr_t Addr, 
UnwindSections *Info) { - std::lock_guard<std::mutex> Lock(M); - auto I = UWSecs.upper_bound(Addr); - if (I == UWSecs.begin()) - return 0; - --I; - *Info = I->second; - return 1; -} - -int UnwindInfoManager::findSectionsHelper(UnwindInfoManager *Instance, - uintptr_t Addr, - UnwindSections *Info) { - return Instance->findSections(Addr, Info); -} - -} // namespace llvm::orc diff --git llvm/lib/ExecutionEngine/Orc/UnwindInfoRegistrationPlugin.cpp llvm/lib/ExecutionEngine/Orc/UnwindInfoRegistrationPlugin.cpp deleted file mode 100644 index 0073f3daf7f2..000000000000 --- llvm/lib/ExecutionEngine/Orc/UnwindInfoRegistrationPlugin.cpp +++ /dev/null @@ -1,238 +0,0 @@ -//===----- UnwindInfoRegistrationPlugin.cpp - libunwind registration ------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "llvm/ExecutionEngine/Orc/UnwindInfoRegistrationPlugin.h" - -#include "llvm/ADT/ScopeExit.h" -#include "llvm/ExecutionEngine/Orc/AbsoluteSymbols.h" -#include "llvm/ExecutionEngine/Orc/Shared/MachOObjectFormat.h" -#include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/Module.h" - -#define DEBUG_TYPE "orc" - -using namespace llvm::jitlink; - -static const char *FindDynamicUnwindSectionsFunctionName = - "_orc_rt_alt_find_dynamic_unwind_sections"; - -namespace llvm::orc { - -Expected<std::shared_ptr<UnwindInfoRegistrationPlugin>> -UnwindInfoRegistrationPlugin::Create(IRLayer &IRL, JITDylib &PlatformJD, - ExecutorAddr Instance, - ExecutorAddr FindHelper, - ExecutorAddr Enable, ExecutorAddr Disable, - ExecutorAddr Register, - ExecutorAddr Deregister) { - - auto &ES = IRL.getExecutionSession(); - - // Build bouncer module. 
- auto M = makeBouncerModule(ES); - if (!M) - return M.takeError(); - - auto BouncerRT = PlatformJD.createResourceTracker(); - auto RemoveBouncerModule = make_scope_exit([&]() { - if (auto Err = BouncerRT->remove()) - ES.reportError(std::move(Err)); - }); - - if (auto Err = PlatformJD.define(absoluteSymbols( - {{ES.intern(rt_alt::UnwindInfoManagerInstanceName), - ExecutorSymbolDef(Instance, JITSymbolFlags())}, - {ES.intern(rt_alt::UnwindInfoManagerFindSectionsHelperName), - ExecutorSymbolDef(FindHelper, JITSymbolFlags::Callable)}}))) - return std::move(Err); - - if (auto Err = IRL.add(BouncerRT, std::move(*M))) - return Err; - - auto FindUnwindSections = - ES.lookup({&PlatformJD}, FindDynamicUnwindSectionsFunctionName); - if (!FindUnwindSections) - return FindUnwindSections.takeError(); - - using namespace shared; - using SPSEnableSig = SPSError(SPSExecutorAddr, SPSExecutorAddr); - Error CallErr = Error::success(); - if (auto Err = ES.callSPSWrapper<SPSEnableSig>( - Enable, CallErr, Instance, FindUnwindSections->getAddress())) { - consumeError(std::move(CallErr)); - return std::move(Err); - } - - if (CallErr) - return std::move(CallErr); - - RemoveBouncerModule.release(); - - return std::shared_ptr<UnwindInfoRegistrationPlugin>( - new UnwindInfoRegistrationPlugin(ES, Instance, Disable, Register, - Deregister)); -} - -Expected<std::shared_ptr<UnwindInfoRegistrationPlugin>> -UnwindInfoRegistrationPlugin::Create(IRLayer &IRL, JITDylib &PlatformJD) { - - ExecutorAddr Instance, FindHelper, Enable, Disable, Register, Deregister; - - auto &EPC = IRL.getExecutionSession().getExecutorProcessControl(); - if (auto Err = EPC.getBootstrapSymbols( - {{Instance, rt_alt::UnwindInfoManagerInstanceName}, - {FindHelper, rt_alt::UnwindInfoManagerFindSectionsHelperName}, - {Enable, rt_alt::UnwindInfoManagerEnableWrapperName}, - {Disable, rt_alt::UnwindInfoManagerDisableWrapperName}, - {Register, rt_alt::UnwindInfoManagerRegisterActionName}, - {Deregister, rt_alt::UnwindInfoManagerDeregisterActionName}})) - return std::move(Err); - - return Create(IRL, PlatformJD, Instance, FindHelper, Enable, Disable, - Register, Deregister); -} - -UnwindInfoRegistrationPlugin::~UnwindInfoRegistrationPlugin() { - using namespace shared; - using SPSDisableSig = SPSError(SPSExecutorAddr); - Error CallErr = Error::success(); - if (auto Err = ES.callSPSWrapper<SPSDisableSig>(Disable, CallErr, Instance)) { - consumeError(std::move(CallErr)); - ES.reportError(std::move(Err)); - } - if (CallErr) - ES.reportError(std::move(CallErr)); -} - -void UnwindInfoRegistrationPlugin::modifyPassConfig( - MaterializationResponsibility &MR, LinkGraph &G, - PassConfiguration &PassConfig) { - - PassConfig.PostFixupPasses.push_back( - [this](LinkGraph &G) { return addUnwindInfoRegistrationActions(G); }); -} - -Expected<ThreadSafeModule> -UnwindInfoRegistrationPlugin::makeBouncerModule(ExecutionSession &ES) { - auto Ctx = std::make_unique<LLVMContext>(); - auto M = std::make_unique<Module>("__libunwind_find_unwind_bouncer", *Ctx); - M->setTargetTriple(ES.getTargetTriple().str()); - - auto EscapeName = [](const char *N) { return std::string("\01") + N; }; - - auto *PtrTy = PointerType::getUnqual(*Ctx); - auto *OpaqueStructTy = StructType::create(*Ctx, "UnwindInfoMgr"); - auto *UnwindMgrInstance = new GlobalVariable( - *M, OpaqueStructTy, true, GlobalValue::ExternalLinkage, nullptr, - EscapeName(rt_alt::UnwindInfoManagerInstanceName)); - - auto *Int64Ty = Type::getInt64Ty(*Ctx); - auto *FindHelperTy = FunctionType::get(Int64Ty, {PtrTy, PtrTy, PtrTy}, 
false); - auto *FindHelperFn = Function::Create( - FindHelperTy, GlobalValue::ExternalLinkage, - EscapeName(rt_alt::UnwindInfoManagerFindSectionsHelperName), *M); - - auto *FindFnTy = FunctionType::get(Int64Ty, {PtrTy, PtrTy}, false); - auto *FindFn = - Function::Create(FindFnTy, GlobalValue::ExternalLinkage, - EscapeName(FindDynamicUnwindSectionsFunctionName), *M); - auto *EntryBlock = BasicBlock::Create(M->getContext(), StringRef(), FindFn); - IRBuilder<> IB(EntryBlock); - - std::vector<Value *> FindHelperArgs; - FindHelperArgs.push_back(UnwindMgrInstance); - for (auto &Arg : FindFn->args()) - FindHelperArgs.push_back(&Arg); - - IB.CreateRet(IB.CreateCall(FindHelperFn, FindHelperArgs)); - - return ThreadSafeModule(std::move(M), std::move(Ctx)); -} - -Error UnwindInfoRegistrationPlugin::addUnwindInfoRegistrationActions( - LinkGraph &G) { - ExecutorAddrRange EHFrameRange, UnwindInfoRange; - - std::vector<Block *> CodeBlocks; - - auto ScanUnwindInfoSection = [&](Section &Sec, ExecutorAddrRange &SecRange) { - if (Sec.empty()) - return; - - SecRange.Start = (*Sec.blocks().begin())->getAddress(); - for (auto *B : Sec.blocks()) { - auto R = B->getRange(); - SecRange.Start = std::min(SecRange.Start, R.Start); - SecRange.End = std::max(SecRange.End, R.End); - for (auto &E : B->edges()) { - if (E.getKind() != Edge::KeepAlive || !E.getTarget().isDefined()) - continue; - auto &TargetBlock = E.getTarget().getBlock(); - auto &TargetSection = TargetBlock.getSection(); - if ((TargetSection.getMemProt() & MemProt::Exec) == MemProt::Exec) - CodeBlocks.push_back(&TargetBlock); - } - } - }; - - if (auto *EHFrame = G.findSectionByName(MachOEHFrameSectionName)) - ScanUnwindInfoSection(*EHFrame, EHFrameRange); - - if (auto *UnwindInfo = G.findSectionByName(MachOCompactUnwindInfoSectionName)) - ScanUnwindInfoSection(*UnwindInfo, UnwindInfoRange); - - if (CodeBlocks.empty()) - return Error::success(); - - if ((EHFrameRange == ExecutorAddrRange() && - UnwindInfoRange == ExecutorAddrRange())) - return Error::success(); - - llvm::sort(CodeBlocks, [](const Block *LHS, const Block *RHS) { - return LHS->getAddress() < RHS->getAddress(); - }); - - SmallVector<ExecutorAddrRange> CodeRanges; - for (auto *B : CodeBlocks) { - if (CodeRanges.empty() || CodeRanges.back().End != B->getAddress()) - CodeRanges.push_back(B->getRange()); - else - CodeRanges.back().End = B->getRange().End; - } - - ExecutorAddr DSOBase; - if (auto *DSOBaseSym = G.findAbsoluteSymbolByName(DSOBaseName)) - DSOBase = DSOBaseSym->getAddress(); - else if (auto *DSOBaseSym = G.findExternalSymbolByName(DSOBaseName)) - DSOBase = DSOBaseSym->getAddress(); - else if (auto *DSOBaseSym = G.findDefinedSymbolByName(DSOBaseName)) - DSOBase = DSOBaseSym->getAddress(); - else - return make_error<StringError>("In " + G.getName() + - " could not find dso base symbol", - inconvertibleErrorCode()); - - using namespace shared; - using SPSRegisterArgs = - SPSArgList<SPSExecutorAddr, SPSSequence<SPSExecutorAddrRange>, - SPSExecutorAddr, SPSExecutorAddrRange, SPSExecutorAddrRange>; - using SPSDeregisterArgs = - SPSArgList<SPSExecutorAddr, SPSSequence<SPSExecutorAddrRange>>; - - G.allocActions().push_back( - {cantFail(WrapperFunctionCall::Create<SPSRegisterArgs>( - Register, Instance, CodeRanges, DSOBase, EHFrameRange, - UnwindInfoRange)), - cantFail(WrapperFunctionCall::Create<SPSDeregisterArgs>( - Deregister, Instance, CodeRanges))}); - - return Error::success(); -} - -} // namespace llvm::orc diff --git llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp 
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 4c4a0d25906c..695b15ac31f3 100644 --- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -7438,10 +7438,15 @@ emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, // '@.omp_target_task_proxy_func' in the pseudo code above) // "@.omp_target_task_proxy_func' is generated by // emitTargetTaskProxyFunction. - if (OutlinedFnID) + if (OutlinedFnID && DeviceID) return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID, EmitTargetCallFallbackCB, KArgs, DeviceID, RTLoc, TargetTaskAllocaIP); + + // We only need to do the outlining if `DeviceID` is set to avoid calling + // `emitKernelLaunch` if we want to code-gen for the host; e.g. if we are + // generating the `else` branch of an `if` clause. + // // When OutlinedFnID is set to nullptr, then it's not an offloading call. // In this case, we execute the host implementation directly. return EmitTargetCallFallbackCB(OMPBuilder.Builder.saveIP()); diff --git llvm/lib/IR/Value.cpp llvm/lib/IR/Value.cpp index eddb67282fca..b5a69b9ecdde 100644 --- llvm/lib/IR/Value.cpp +++ llvm/lib/IR/Value.cpp @@ -714,7 +714,8 @@ const Value *Value::stripPointerCastsForAliasAnalysis() const { const Value *Value::stripAndAccumulateConstantOffsets( const DataLayout &DL, APInt &Offset, bool AllowNonInbounds, bool AllowInvariantGroup, - function_ref<bool(Value &, APInt &)> ExternalAnalysis) const { + function_ref<bool(Value &, APInt &)> ExternalAnalysis, + bool LookThroughIntToPtr) const { if (!getType()->isPtrOrPtrVectorTy()) return this; @@ -775,6 +776,24 @@ const Value *Value::stripAndAccumulateConstantOffsets( V = RV; if (AllowInvariantGroup && Call->isLaunderOrStripInvariantGroup()) V = Call->getArgOperand(0); + } else if (auto *Int2Ptr = dyn_cast<Operator>(V)) { + // Try to accumulate across (inttoptr (add (ptrtoint p), off)). 
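+      // For example (an illustrative sketch; %p and the constant 4 are
+      // arbitrary):
+      //   %pi = ptrtoint ptr %p to i64
+      //   %ai = add i64 %pi, 4
+      //   %q  = inttoptr i64 %ai to ptr
+      // is stripped back to %p with Offset += 4.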
+ if (!AllowNonInbounds || !LookThroughIntToPtr || !Int2Ptr || + Int2Ptr->getOpcode() != Instruction::IntToPtr || + Int2Ptr->getOperand(0)->getType()->getScalarSizeInBits() != BitWidth) + return V; + + auto *Add = dyn_cast<AddOperator>(Int2Ptr->getOperand(0)); + if (!Add) + return V; + + auto *Ptr2Int = dyn_cast<PtrToIntOperator>(Add->getOperand(0)); + auto *CI = dyn_cast<ConstantInt>(Add->getOperand(1)); + if (!Ptr2Int || !CI) + return V; + + Offset += CI->getValue(); + V = Ptr2Int->getOperand(0); } assert(V->getType()->isPtrOrPtrVectorTy() && "Unexpected operand type!"); } while (Visited.insert(V).second); diff --git llvm/lib/MC/MCAsmStreamer.cpp llvm/lib/MC/MCAsmStreamer.cpp index dd8058c6d5cd..550246565662 100644 --- llvm/lib/MC/MCAsmStreamer.cpp +++ llvm/lib/MC/MCAsmStreamer.cpp @@ -398,6 +398,8 @@ public: SMLoc Loc) override; void emitWinCFIPushFrame(bool Code, SMLoc Loc) override; void emitWinCFIEndProlog(SMLoc Loc) override; + void emitWinCFIBeginEpilogue(SMLoc Loc) override; + void emitWinCFIEndEpilogue(SMLoc Loc) override; void emitWinEHHandler(const MCSymbol *Sym, bool Unwind, bool Except, SMLoc Loc) override; @@ -2348,6 +2350,20 @@ void MCAsmStreamer::emitWinCFIEndProlog(SMLoc Loc) { EmitEOL(); } +void MCAsmStreamer::emitWinCFIBeginEpilogue(SMLoc Loc) { + MCStreamer::emitWinCFIBeginEpilogue(Loc); + + OS << "\t.seh_startepilogue"; + EmitEOL(); +} + +void MCAsmStreamer::emitWinCFIEndEpilogue(SMLoc Loc) { + MCStreamer::emitWinCFIEndEpilogue(Loc); + + OS << "\t.seh_endepilogue"; + EmitEOL(); +} + void MCAsmStreamer::emitCGProfileEntry(const MCSymbolRefExpr *From, const MCSymbolRefExpr *To, uint64_t Count) { diff --git llvm/lib/MC/MCParser/COFFAsmParser.cpp llvm/lib/MC/MCParser/COFFAsmParser.cpp index dd5ce9964a19..4618e5675e47 100644 --- llvm/lib/MC/MCParser/COFFAsmParser.cpp +++ llvm/lib/MC/MCParser/COFFAsmParser.cpp @@ -92,6 +92,10 @@ class COFFAsmParser : public MCAsmParserExtension { ".seh_stackalloc"); addDirectiveHandler<&COFFAsmParser::parseSEHDirectiveEndProlog>( ".seh_endprologue"); + addDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveBeginEpilog>( + ".seh_startepilogue"); + addDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveEndEpilog>( + ".seh_endepilogue"); } bool parseSectionDirectiveText(StringRef, SMLoc) { @@ -141,6 +145,8 @@ class COFFAsmParser : public MCAsmParserExtension { bool parseSEHDirectiveHandlerData(StringRef, SMLoc); bool parseSEHDirectiveAllocStack(StringRef, SMLoc); bool parseSEHDirectiveEndProlog(StringRef, SMLoc); + bool ParseSEHDirectiveBeginEpilog(StringRef, SMLoc); + bool ParseSEHDirectiveEndEpilog(StringRef, SMLoc); bool parseAtUnwindOrAtExcept(bool &unwind, bool &except); bool parseDirectiveSymbolAttribute(StringRef Directive, SMLoc); @@ -749,6 +755,18 @@ bool COFFAsmParser::parseSEHDirectiveEndProlog(StringRef, SMLoc Loc) { return false; } +bool COFFAsmParser::ParseSEHDirectiveBeginEpilog(StringRef, SMLoc Loc) { + Lex(); + getStreamer().emitWinCFIBeginEpilogue(Loc); + return false; +} + +bool COFFAsmParser::ParseSEHDirectiveEndEpilog(StringRef, SMLoc Loc) { + Lex(); + getStreamer().emitWinCFIEndEpilogue(Loc); + return false; +} + bool COFFAsmParser::parseAtUnwindOrAtExcept(bool &unwind, bool &except) { StringRef identifier; if (getLexer().isNot(AsmToken::At) && getLexer().isNot(AsmToken::Percent)) diff --git llvm/lib/MC/MCStreamer.cpp llvm/lib/MC/MCStreamer.cpp index e690723c0e50..462ebfedeba0 100644 --- llvm/lib/MC/MCStreamer.cpp +++ llvm/lib/MC/MCStreamer.cpp @@ -1013,6 +1013,36 @@ void MCStreamer::emitWinCFIEndProlog(SMLoc Loc) { 
CurFrame->PrologEnd = Label; } +void MCStreamer::emitWinCFIBeginEpilogue(SMLoc Loc) { + WinEH::FrameInfo *CurFrame = EnsureValidWinFrameInfo(Loc); + if (!CurFrame) + return; + + if (!CurFrame->PrologEnd) + return getContext().reportError( + Loc, "starting epilogue (.seh_startepilogue) before prologue has ended " + "(.seh_endprologue) in " + + CurFrame->Function->getName()); + + InEpilogCFI = true; + CurrentEpilog = emitCFILabel(); +} + +void MCStreamer::emitWinCFIEndEpilogue(SMLoc Loc) { + WinEH::FrameInfo *CurFrame = EnsureValidWinFrameInfo(Loc); + if (!CurFrame) + return; + + if (!InEpilogCFI) + return getContext().reportError(Loc, "Stray .seh_endepilogue in " + + CurFrame->Function->getName()); + + InEpilogCFI = false; + MCSymbol *Label = emitCFILabel(); + CurFrame->EpilogMap[CurrentEpilog].End = Label; + CurrentEpilog = nullptr; +} + void MCStreamer::emitCOFFSafeSEH(MCSymbol const *Symbol) {} void MCStreamer::emitCOFFSymbolIndex(MCSymbol const *Symbol) {} diff --git llvm/lib/Passes/PassBuilder.cpp llvm/lib/Passes/PassBuilder.cpp index 9b93ebc36ae1..d9096edd3ba0 100644 --- llvm/lib/Passes/PassBuilder.cpp +++ llvm/lib/Passes/PassBuilder.cpp @@ -127,6 +127,7 @@ #include "llvm/CodeGen/RegAllocFast.h" #include "llvm/CodeGen/RegUsageInfoCollector.h" #include "llvm/CodeGen/RegUsageInfoPropagate.h" +#include "llvm/CodeGen/RegisterCoalescerPass.h" #include "llvm/CodeGen/RegisterUsageInfo.h" #include "llvm/CodeGen/SafeStack.h" #include "llvm/CodeGen/SelectOptimize.h" diff --git llvm/lib/SandboxIR/Region.cpp llvm/lib/SandboxIR/Region.cpp index 8c84d0c46fa1..dbb000e5dd92 100644 --- llvm/lib/SandboxIR/Region.cpp +++ llvm/lib/SandboxIR/Region.cpp @@ -94,16 +94,12 @@ Region::createRegionsFromMD(Function &F, TargetTransformInfo &TTI) { for (BasicBlock &BB : F) { for (Instruction &Inst : BB) { if (auto *MDN = cast<llvm::Instruction>(Inst.Val)->getMetadata(MDKind)) { - Region *R = nullptr; - auto It = MDNToRegion.find(MDN); - if (It == MDNToRegion.end()) { + auto [It, Inserted] = MDNToRegion.try_emplace(MDN); + if (Inserted) { Regions.push_back(std::make_unique<Region>(Ctx, TTI)); - R = Regions.back().get(); - MDNToRegion[MDN] = R; - } else { - R = It->second; + It->second = Regions.back().get(); } - R->add(&Inst); + It->second->add(&Inst); } } } diff --git llvm/lib/Support/BalancedPartitioning.cpp llvm/lib/Support/BalancedPartitioning.cpp index 19977c57c08d..7b807b167c0e 100644 --- llvm/lib/Support/BalancedPartitioning.cpp +++ llvm/lib/Support/BalancedPartitioning.cpp @@ -305,7 +305,7 @@ void BalancedPartitioning::split(const FunctionNodeRange Nodes, unsigned NumNodes = std::distance(Nodes.begin(), Nodes.end()); auto NodesMid = Nodes.begin() + (NumNodes + 1) / 2; - std::nth_element(Nodes.begin(), NodesMid, Nodes.end(), [](auto &L, auto &R) { + llvm::sort(Nodes.begin(), Nodes.end(), [](auto &L, auto &R) { return L.InputOrderIndex < R.InputOrderIndex; }); diff --git llvm/lib/Support/Unix/Signals.inc llvm/lib/Support/Unix/Signals.inc index 9a12663228a3..2e7b467a14bb 100644 --- llvm/lib/Support/Unix/Signals.inc +++ llvm/lib/Support/Unix/Signals.inc @@ -468,7 +468,8 @@ void llvm::sys::AddSignalHandler(sys::SignalHandlerCallback FnPtr, #if ENABLE_BACKTRACES && defined(HAVE_BACKTRACE) && \ (defined(__linux__) || defined(__FreeBSD__) || \ - defined(__FreeBSD_kernel__) || defined(__NetBSD__)) + defined(__FreeBSD_kernel__) || defined(__NetBSD__) || \ + defined(__OpenBSD__) || defined(__DragonFly__)) struct DlIteratePhdrData { void **StackTrace; int depth; diff --git llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp 
llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index c6b4a219d201..f1f25b65fc53 100644
--- llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -78,11 +78,6 @@ static cl::opt<PtrauthCheckMode> PtrauthAuthChecks(
     cl::desc("Check pointer authentication auth/resign failures"),
     cl::init(Default));

-static cl::opt<bool> EnableImportCallOptimization(
-    "aarch64-win-import-call-optimization", cl::Hidden,
-    cl::desc("Enable import call optimization for AArch64 Windows"),
-    cl::init(false));
-
 #define DEBUG_TYPE "asm-printer"

 namespace {
@@ -95,6 +90,7 @@ class AArch64AsmPrinter : public AsmPrinter {
 #ifndef NDEBUG
   unsigned InstsEmitted;
 #endif
+  bool EnableImportCallOptimization = false;
   DenseMap<MCSection *, std::vector<std::pair<MCSymbol *, MCSymbol *>>>
       SectionToImportedFunctionCalls;

@@ -344,6 +340,9 @@ void AArch64AsmPrinter::emitStartOfAsmFile(Module &M) {
     OutStreamer->emitSymbolAttribute(S, MCSA_Global);
     OutStreamer->emitAssignment(
         S, MCConstantExpr::create(Feat00Value, MMI->getContext()));
+
+    if (M.getModuleFlag("import-call-optimization"))
+      EnableImportCallOptimization = true;
   }

   if (!TT.isOSBinFormatELF())
@@ -3172,8 +3171,7 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) {

 void AArch64AsmPrinter::recordIfImportCall(
     const llvm::MachineInstr *BranchInst) {
-  if (!EnableImportCallOptimization ||
-      !TM.getTargetTriple().isOSBinFormatCOFF())
+  if (!EnableImportCallOptimization)
     return;

   auto [GV, OpFlags] = BranchInst->getMF()->tryGetCalledGlobal(BranchInst);
diff --git llvm/lib/Target/AArch64/AArch64FrameLowering.cpp llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index a082a1ebe95b..81523adeefce 100644
--- llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -1634,6 +1634,9 @@ static bool IsSVECalleeSave(MachineBasicBlock::iterator I) {
   case AArch64::STR_PXI:
   case AArch64::LDR_ZXI:
   case AArch64::LDR_PXI:
+  case AArch64::PTRUE_B:
+  case AArch64::CPY_ZPzI_B:
+  case AArch64::CMPNE_PPzZI_B:
     return I->getFlag(MachineInstr::FrameSetup) ||
            I->getFlag(MachineInstr::FrameDestroy);
   }
@@ -3265,7 +3268,8 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
       StrOpc = RPI.isPaired() ? AArch64::ST1B_2Z_IMM : AArch64::STR_ZXI;
       break;
     case RegPairInfo::PPR:
-      StrOpc = AArch64::STR_PXI;
+      StrOpc =
+          Size == 16 ? AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO : AArch64::STR_PXI;
       break;
     case RegPairInfo::VG:
       StrOpc = AArch64::STRXui;
@@ -3494,7 +3498,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
       LdrOpc = RPI.isPaired() ? AArch64::LD1B_2Z_IMM : AArch64::LDR_ZXI;
       break;
     case RegPairInfo::PPR:
-      LdrOpc = AArch64::LDR_PXI;
+      LdrOpc = Size == 16 ? AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO
+                          : AArch64::LDR_PXI;
       break;
     case RegPairInfo::VG:
       continue;
@@ -3720,6 +3725,14 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
       continue;
     }

+    // Always save P4 when PPR spills are ZPR-sized and a predicate in the
+    // range p8-p15 is spilled. If all of p0-p3 are used as return values,
+    // p4 must be free to reload p8-p15.
+    if (RegInfo->getSpillSize(AArch64::PPRRegClass) == 16 &&
+        AArch64::PPR_p8to15RegClass.contains(Reg)) {
+      SavedRegs.set(AArch64::P4);
+    }
+
     // MachO's compact unwind format relies on all registers being stored in
     // pairs.
     // FIXME: the usual format is actually better if unwinding isn't needed.
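Why p4 in particular must stay free becomes concrete in the expansion machinery added by the next hunk: a predicate spilled as a ZPR is reloaded through a compare, and the governing predicate operand of CMPNE_PPzZI_B is restricted to p0-p7 (the PPR_3b class), so restoring a predicate in p8-p15 always needs one low predicate. A minimal sketch of such a reload (register and stack-slot numbers are illustrative):

    $z0 = LDR_ZXI %stack.0, 0
    $p4 = PTRUE_B 31, implicit $vg
    $p8 = CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv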
@@ -4159,8 +4172,295 @@ int64_t AArch64FrameLowering::assignSVEStackObjectOffsets(
                                         true);
 }

+/// Attempts to scavenge a register from \p ScavengeableRegs given the used
+/// registers in \p UsedRegs.
+static Register tryScavengeRegister(LiveRegUnits const &UsedRegs,
+                                    BitVector const &ScavengeableRegs) {
+  for (auto Reg : ScavengeableRegs.set_bits()) {
+    if (UsedRegs.available(Reg))
+      return Reg;
+  }
+  return AArch64::NoRegister;
+}
+
+/// Propagates frame-setup/destroy flags from \p SourceMI to all instructions
+/// in \p MachineInstrs.
+static void propagateFrameFlags(MachineInstr &SourceMI,
+                                ArrayRef<MachineInstr *> MachineInstrs) {
+  for (MachineInstr *MI : MachineInstrs) {
+    if (SourceMI.getFlag(MachineInstr::FrameSetup))
+      MI->setFlag(MachineInstr::FrameSetup);
+    if (SourceMI.getFlag(MachineInstr::FrameDestroy))
+      MI->setFlag(MachineInstr::FrameDestroy);
+  }
+}
+
+/// RAII helper class for scavenging or spilling a register. On construction,
+/// attempts to find a free register of class \p RC (given \p UsedRegs and
+/// \p AllocatableRegs); if no register can be found, spills \p SpillCandidate
+/// to \p MaybeSpillFI to free a register. The freed register is returned via
+/// the \p FreeReg output parameter. On destruction, if there is a spill, its
+/// previous value is reloaded. The spilling and scavenging are only valid at
+/// the insertion point \p MBBI; this class should _not_ be used in places
+/// that create or manipulate basic blocks, moving the expected insertion
+/// point.
+struct ScopedScavengeOrSpill {
+  ScopedScavengeOrSpill(const ScopedScavengeOrSpill &) = delete;
+  ScopedScavengeOrSpill(ScopedScavengeOrSpill &&) = delete;
+
+  ScopedScavengeOrSpill(MachineFunction &MF, MachineBasicBlock &MBB,
+                        MachineBasicBlock::iterator MBBI,
+                        Register SpillCandidate, const TargetRegisterClass &RC,
+                        LiveRegUnits const &UsedRegs,
+                        BitVector const &AllocatableRegs,
+                        std::optional<int> *MaybeSpillFI)
+      : MBB(MBB), MBBI(MBBI), RC(RC),
+        TII(static_cast<const AArch64InstrInfo &>(
+            *MF.getSubtarget().getInstrInfo())),
+        TRI(*MF.getSubtarget().getRegisterInfo()) {
+    FreeReg = tryScavengeRegister(UsedRegs, AllocatableRegs);
+    if (FreeReg != AArch64::NoRegister)
+      return;
+    assert(MaybeSpillFI && "Expected emergency spill slot FI information "
+                           "(attempted to spill in prologue/epilogue?)");
+    if (!MaybeSpillFI->has_value()) {
+      MachineFrameInfo &MFI = MF.getFrameInfo();
+      *MaybeSpillFI = MFI.CreateSpillStackObject(TRI.getSpillSize(RC),
+                                                 TRI.getSpillAlign(RC));
+    }
+    FreeReg = SpillCandidate;
+    SpillFI = MaybeSpillFI->value();
+    TII.storeRegToStackSlot(MBB, MBBI, FreeReg, false, *SpillFI, &RC, &TRI,
+                            Register());
+  }
+
+  bool hasSpilled() const { return SpillFI.has_value(); }
+
+  /// Returns the free register (found from scavenging or spilling a register).
+  Register freeRegister() const { return FreeReg; }
+
+  Register operator*() const { return freeRegister(); }
+
+  ~ScopedScavengeOrSpill() {
+    if (hasSpilled())
+      TII.loadRegFromStackSlot(MBB, MBBI, FreeReg, *SpillFI, &RC, &TRI,
+                               Register());
+  }
+
+private:
+  MachineBasicBlock &MBB;
+  MachineBasicBlock::iterator MBBI;
+  const TargetRegisterClass &RC;
+  const AArch64InstrInfo &TII;
+  const TargetRegisterInfo &TRI;
+  Register FreeReg = AArch64::NoRegister;
+  std::optional<int> SpillFI;
+};
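+
+// Example usage (a sketch, not part of the original patch): free a ZPR at
+// MI, spilling Z0 only if nothing in SR.ZPRRegs is available:
+//
+//   ScopedScavengeOrSpill ZReg(MF, MBB, MI, AArch64::Z0,
+//                              AArch64::ZPRRegClass, UsedRegs, SR.ZPRRegs,
+//                              &SpillSlots.ZPRSpillFI);
+//   ... use *ZReg ...
+//   // any emergency reload is emitted when ZReg goes out of scope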
+
+/// Emergency stack slots for expanding SPILL_PPR_TO_ZPR_SLOT_PSEUDO and
+/// FILL_PPR_FROM_ZPR_SLOT_PSEUDO.
+struct EmergencyStackSlots {
+  std::optional<int> ZPRSpillFI;
+  std::optional<int> PPRSpillFI;
+  std::optional<int> GPRSpillFI;
+};
+
+/// Registers available for scavenging (ZPR, PPR3b, GPR).
+struct ScavengeableRegs {
+  BitVector ZPRRegs;
+  BitVector PPR3bRegs;
+  BitVector GPRRegs;
+};
+
+static bool isInPrologueOrEpilogue(const MachineInstr &MI) {
+  return MI.getFlag(MachineInstr::FrameSetup) ||
+         MI.getFlag(MachineInstr::FrameDestroy);
+}
+
+/// Expands:
+/// ```
+/// SPILL_PPR_TO_ZPR_SLOT_PSEUDO $p0, %stack.0, 0
+/// ```
+/// To:
+/// ```
+/// $z0 = CPY_ZPzI_B $p0, 1, 0
+/// STR_ZXI $z0, %stack.0, 0
+/// ```
+/// While ensuring a ZPR ($z0 in this example) is free for the predicate
+/// (spilling if necessary).
+static void expandSpillPPRToZPRSlotPseudo(MachineBasicBlock &MBB,
+                                          MachineInstr &MI,
+                                          const TargetRegisterInfo &TRI,
+                                          LiveRegUnits const &UsedRegs,
+                                          ScavengeableRegs const &SR,
+                                          EmergencyStackSlots &SpillSlots) {
+  MachineFunction &MF = *MBB.getParent();
+  auto *TII =
+      static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
+
+  ScopedScavengeOrSpill ZPredReg(
+      MF, MBB, MI, AArch64::Z0, AArch64::ZPRRegClass, UsedRegs, SR.ZPRRegs,
+      isInPrologueOrEpilogue(MI) ? nullptr : &SpillSlots.ZPRSpillFI);
+
+  SmallVector<MachineInstr *, 2> MachineInstrs;
+  const DebugLoc &DL = MI.getDebugLoc();
+  MachineInstrs.push_back(BuildMI(MBB, MI, DL, TII->get(AArch64::CPY_ZPzI_B))
+                              .addReg(*ZPredReg, RegState::Define)
+                              .add(MI.getOperand(0))
+                              .addImm(1)
+                              .addImm(0)
+                              .getInstr());
+  MachineInstrs.push_back(BuildMI(MBB, MI, DL, TII->get(AArch64::STR_ZXI))
+                              .addReg(*ZPredReg)
+                              .add(MI.getOperand(1))
+                              .addImm(MI.getOperand(2).getImm())
+                              .setMemRefs(MI.memoperands())
+                              .getInstr());
+  propagateFrameFlags(MI, MachineInstrs);
+}
+
+/// Expands:
+/// ```
+/// $p0 = FILL_PPR_FROM_ZPR_SLOT_PSEUDO %stack.0, 0
+/// ```
+/// To:
+/// ```
+/// $z0 = LDR_ZXI %stack.0, 0
+/// $p0 = PTRUE_B 31, implicit $vg
+/// $p0 = CMPNE_PPzZI_B $p0, $z0, 0, implicit-def $nzcv
+/// ```
+/// While ensuring a ZPR ($z0 in this example) is free for the predicate
+/// (spilling if necessary). If the status flags are in use at the point of
+/// expansion they are preserved (by moving them to/from a GPR). This may cause
+/// an additional spill if no GPR is free at the expansion point.
+static bool expandFillPPRFromZPRSlotPseudo(MachineBasicBlock &MBB,
+                                           MachineInstr &MI,
+                                           const TargetRegisterInfo &TRI,
+                                           LiveRegUnits const &UsedRegs,
+                                           ScavengeableRegs const &SR,
+                                           EmergencyStackSlots &SpillSlots) {
+  MachineFunction &MF = *MBB.getParent();
+  auto *TII =
+      static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
+
+  ScopedScavengeOrSpill ZPredReg(
+      MF, MBB, MI, AArch64::Z0, AArch64::ZPRRegClass, UsedRegs, SR.ZPRRegs,
+      isInPrologueOrEpilogue(MI) ? nullptr : &SpillSlots.ZPRSpillFI);
+
+  ScopedScavengeOrSpill PredReg(
+      MF, MBB, MI, AArch64::P0, AArch64::PPR_3bRegClass, UsedRegs, SR.PPR3bRegs,
+      isInPrologueOrEpilogue(MI) ? nullptr : &SpillSlots.PPRSpillFI);
+
+  // Elide NZCV spills if we know it is not used.
+  bool IsNZCVUsed = !UsedRegs.available(AArch64::NZCV);
+  std::optional<ScopedScavengeOrSpill> NZCVSaveReg;
+  if (IsNZCVUsed)
+    NZCVSaveReg.emplace(
+        MF, MBB, MI, AArch64::X0, AArch64::GPR64RegClass, UsedRegs, SR.GPRRegs,
+        isInPrologueOrEpilogue(MI) ? nullptr : &SpillSlots.GPRSpillFI);
+  SmallVector<MachineInstr *, 4> MachineInstrs;
+  const DebugLoc &DL = MI.getDebugLoc();
+  MachineInstrs.push_back(BuildMI(MBB, MI, DL, TII->get(AArch64::LDR_ZXI))
+                              .addReg(*ZPredReg, RegState::Define)
+                              .add(MI.getOperand(1))
+                              .addImm(MI.getOperand(2).getImm())
+                              .setMemRefs(MI.memoperands())
+                              .getInstr());
+  if (IsNZCVUsed)
+    MachineInstrs.push_back(
+        BuildMI(MBB, MI, DL, TII->get(AArch64::MRS))
+            .addReg(NZCVSaveReg->freeRegister(), RegState::Define)
+            .addImm(AArch64SysReg::NZCV)
+            .addReg(AArch64::NZCV, RegState::Implicit)
+            .getInstr());
+  MachineInstrs.push_back(BuildMI(MBB, MI, DL, TII->get(AArch64::PTRUE_B))
+                              .addReg(*PredReg, RegState::Define)
+                              .addImm(31));
+  MachineInstrs.push_back(
+      BuildMI(MBB, MI, DL, TII->get(AArch64::CMPNE_PPzZI_B))
+          .addReg(MI.getOperand(0).getReg(), RegState::Define)
+          .addReg(*PredReg)
+          .addReg(*ZPredReg)
+          .addImm(0)
+          .addReg(AArch64::NZCV, RegState::ImplicitDefine)
+          .getInstr());
+  if (IsNZCVUsed)
+    MachineInstrs.push_back(BuildMI(MBB, MI, DL, TII->get(AArch64::MSR))
+                                .addImm(AArch64SysReg::NZCV)
+                                .addReg(NZCVSaveReg->freeRegister())
+                                .addReg(AArch64::NZCV, RegState::ImplicitDefine)
+                                .getInstr());
+
+  propagateFrameFlags(MI, MachineInstrs);
+  return PredReg.hasSpilled();
+}
+
+/// Expands all FILL_PPR_FROM_ZPR_SLOT_PSEUDO and SPILL_PPR_TO_ZPR_SLOT_PSEUDO
+/// operations within the MachineBasicBlock \p MBB.
+static bool expandSMEPPRToZPRSpillPseudos(MachineBasicBlock &MBB,
+                                          const TargetRegisterInfo &TRI,
+                                          ScavengeableRegs const &SR,
+                                          EmergencyStackSlots &SpillSlots) {
+  LiveRegUnits UsedRegs(TRI);
+  UsedRegs.addLiveOuts(MBB);
+  bool HasPPRSpills = false;
+  for (MachineInstr &MI : make_early_inc_range(reverse(MBB))) {
+    UsedRegs.stepBackward(MI);
+    switch (MI.getOpcode()) {
+    case AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO:
+      HasPPRSpills |= expandFillPPRFromZPRSlotPseudo(MBB, MI, TRI, UsedRegs,
+                                                     SR, SpillSlots);
+      MI.eraseFromParent();
+      break;
+    case AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO:
+      expandSpillPPRToZPRSlotPseudo(MBB, MI, TRI, UsedRegs, SR, SpillSlots);
+      MI.eraseFromParent();
+      break;
+    default:
+      break;
+    }
+  }
+
+  return HasPPRSpills;
+}
+
 void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
     MachineFunction &MF, RegScavenger *RS) const {
+
+  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+  const TargetSubtargetInfo &TSI = MF.getSubtarget();
+  const TargetRegisterInfo &TRI = *TSI.getRegisterInfo();
+
+  // If predicate spills are 16 bytes, we may need to expand
+  // SPILL_PPR_TO_ZPR_SLOT_PSEUDO/FILL_PPR_FROM_ZPR_SLOT_PSEUDO.
+  if (AFI->hasStackFrame() && TRI.getSpillSize(AArch64::PPRRegClass) == 16) {
+    auto ComputeScavengeableRegisters = [&](unsigned RegClassID) {
+      BitVector Regs = TRI.getAllocatableSet(MF, TRI.getRegClass(RegClassID));
+      assert(Regs.count() > 0 && "Expected scavengeable registers");
+      return Regs;
+    };
+
+    ScavengeableRegs SR{};
+    SR.ZPRRegs = ComputeScavengeableRegisters(AArch64::ZPRRegClassID);
+    // Only p0-7 are possible as the second operand of cmpne (needed for fills).
+    SR.PPR3bRegs = ComputeScavengeableRegisters(AArch64::PPR_3bRegClassID);
+    SR.GPRRegs = ComputeScavengeableRegisters(AArch64::GPR64RegClassID);
+
+    EmergencyStackSlots SpillSlots;
+    for (MachineBasicBlock &MBB : MF) {
+      // In the case we had to spill a predicate (in the range p0-p7) to
+      // reload a predicate (>= p8), additional spill/fill pseudos will be
+      // created. These need an additional expansion pass. Note: there will
+      // be at most two expansion passes, as spilling/filling a predicate in
+      // the range p0-p7 never requires spilling another predicate.
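+      // An illustrative scenario (registers and stack slots hypothetical):
+      // reloading $p8 while p0-p7 are all live frees $p0 around the fill:
+      //   SPILL_PPR_TO_ZPR_SLOT_PSEUDO $p0, %stack.1, 0
+      //   $z0 = LDR_ZXI %stack.0, 0
+      //   $p0 = PTRUE_B 31, implicit $vg
+      //   $p8 = CMPNE_PPzZI_B $p0, $z0, 0, implicit-def $nzcv
+      //   $p0 = FILL_PPR_FROM_ZPR_SLOT_PSEUDO %stack.1, 0
+      // The emergency spill/fill pseudos here are what the second pass
+      // expands.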
+      for (int Pass = 0; Pass < 2; Pass++) {
+        bool HasPPRSpills =
+            expandSMEPPRToZPRSpillPseudos(MBB, TRI, SR, SpillSlots);
+        assert((Pass == 0 || !HasPPRSpills) && "Did not expect PPR spills");
+        if (!HasPPRSpills)
+          break;
+      }
+    }
+  }
+
   MachineFrameInfo &MFI = MF.getFrameInfo();

   assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown &&
@@ -4170,7 +4470,6 @@ void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
   int64_t SVEStackSize =
       assignSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex);

-  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
   AFI->setStackSizeSVE(alignTo(SVEStackSize, 16U));
   AFI->setMinMaxSVECSFrameIndex(MinCSFrameIndex, MaxCSFrameIndex);

@@ -5204,9 +5503,13 @@ void AArch64FrameLowering::emitRemarks(
       unsigned RegTy = StackAccess::AccessType::GPR;
       if (MFI.getStackID(FrameIdx) == TargetStackID::ScalableVector) {
-        if (AArch64::PPRRegClass.contains(MI.getOperand(0).getReg()))
+        // SPILL_PPR_TO_ZPR_SLOT_PSEUDO and FILL_PPR_FROM_ZPR_SLOT_PSEUDO
+        // spill/fill the predicate as a data vector (so count as FPR access).
+        if (MI.getOpcode() != AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO &&
+            MI.getOpcode() != AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO &&
+            AArch64::PPRRegClass.contains(MI.getOperand(0).getReg())) {
           RegTy = StackAccess::PPR;
-        else
+        } else
           RegTy = StackAccess::FPR;
       } else if (AArch64InstrInfo::isFpOrNEON(MI)) {
         RegTy = StackAccess::FPR;
diff --git llvm/lib/Target/AArch64/AArch64ISelLowering.cpp llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index bd9994bcb669..84f6d421b70f 100644
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -530,6 +530,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::XOR, MVT::i32, Custom);
   setOperationAction(ISD::XOR, MVT::i64, Custom);

+  setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
+  setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
+
   // Virtually no operation on f128 is legal, but LLVM can't expand them when
   // there's a valid register class, so we need custom operations in most cases.
setOperationAction(ISD::FABS, MVT::f128, Expand); @@ -6880,6 +6883,38 @@ static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST, ST->getBasePtr(), ST->getMemOperand()); } +static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) { + SDLoc dl(Op); + SDValue Src = Op.getOperand(0); + MVT DestVT = Op.getSimpleValueType(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode()); + + unsigned SrcAS = N->getSrcAddressSpace(); + unsigned DestAS = N->getDestAddressSpace(); + assert(SrcAS != DestAS && + "addrspacecast must be between different address spaces"); + assert(TLI.getTargetMachine().getPointerSize(SrcAS) != + TLI.getTargetMachine().getPointerSize(DestAS) && + "addrspacecast must be between different ptr sizes"); + (void)TLI; + + if (SrcAS == ARM64AS::PTR32_SPTR) { + return DAG.getNode(ISD::SIGN_EXTEND, dl, DestVT, Src, + DAG.getTargetConstant(0, dl, DestVT)); + } else if (SrcAS == ARM64AS::PTR32_UPTR) { + return DAG.getNode(ISD::ZERO_EXTEND, dl, DestVT, Src, + DAG.getTargetConstant(0, dl, DestVT)); + } else if ((DestAS == ARM64AS::PTR32_SPTR) || + (DestAS == ARM64AS::PTR32_UPTR)) { + SDValue Ext = DAG.getAnyExtOrTrunc(Src, dl, DestVT); + SDValue Trunc = DAG.getZeroExtendInReg(Ext, dl, DestVT); + return Trunc; + } else { + return Src; + } +} + // Custom lowering for any store, vector or scalar and/or default or with // a truncate operations. Currently only custom lower truncate operation // from vector v4i16 to v4i8 or volatile stores of i128. @@ -7541,6 +7576,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: return LowerFixedLengthVectorIntExtendToSVE(Op, DAG); + case ISD::ADDRSPACECAST: + return LowerADDRSPACECAST(Op, DAG); case ISD::SIGN_EXTEND_INREG: { // Only custom lower when ExtraVT has a legal byte based element type. 
EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); @@ -8765,17 +8802,9 @@ static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) { bool shouldUseFormStridedPseudo(MachineInstr &MI) { MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); - const TargetRegisterClass *RegClass = nullptr; - switch (MI.getOpcode()) { - case AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO: - RegClass = &AArch64::ZPR2StridedOrContiguousRegClass; - break; - case AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO: - RegClass = &AArch64::ZPR4StridedOrContiguousRegClass; - break; - default: - llvm_unreachable("Unexpected opcode."); - } + assert((MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO || + MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO) && + "Unexpected opcode."); MCRegister SubReg = MCRegister::NoRegister; for (unsigned I = 1; I < MI.getNumOperands(); ++I) { @@ -8792,8 +8821,11 @@ bool shouldUseFormStridedPseudo(MachineInstr &MI) { SubReg = OpSubReg; MachineOperand *CopySrcOp = MRI.getOneDef(CopySrc.getReg()); + const TargetRegisterClass *CopySrcClass = + CopySrcOp ? MRI.getRegClass(CopySrcOp->getReg()) : nullptr; if (!CopySrcOp || !CopySrcOp->isReg() || OpSubReg != SubReg || - MRI.getRegClass(CopySrcOp->getReg()) != RegClass) + (CopySrcClass != &AArch64::ZPR2StridedOrContiguousRegClass && + CopySrcClass != &AArch64::ZPR4StridedOrContiguousRegClass)) return false; } @@ -9578,7 +9610,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge); DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo)); - if (CalledGlobal) + if (CalledGlobal && + MF.getFunction().getParent()->getModuleFlag("import-call-optimization")) DAG.addCalledGlobal(Ret.getNode(), CalledGlobal, OpFlags); return Ret; } @@ -9591,7 +9624,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge); InGlue = Chain.getValue(1); DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); - if (CalledGlobal) + if (CalledGlobal && + MF.getFunction().getParent()->getModuleFlag("import-call-optimization")) DAG.addCalledGlobal(Chain.getNode(), CalledGlobal, OpFlags); uint64_t CalleePopBytes = @@ -23558,6 +23592,26 @@ static SDValue performLOADCombine(SDNode *N, performTBISimplification(N->getOperand(1), DCI, DAG); LoadSDNode *LD = cast<LoadSDNode>(N); + EVT RegVT = LD->getValueType(0); + EVT MemVT = LD->getMemoryVT(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDLoc DL(LD); + + // Cast ptr32 and ptr64 pointers to the default address space before a load.
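The LowerCall hunks above now record called-global information only when the module opts into import call optimization. A hedged sketch of how a front end might set that opt-in; the flag name comes from the diff, the helper itself is illustrative:

```cpp
#include "llvm/IR/Module.h"

// Attach the module flag that the LowerCall gate above checks for.
void requestImportCallOptimization(llvm::Module &M) {
  M.addModuleFlag(llvm::Module::Warning, "import-call-optimization", 1);
}
```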
+ unsigned AddrSpace = LD->getAddressSpace(); + if (AddrSpace == ARM64AS::PTR64 || AddrSpace == ARM64AS::PTR32_SPTR || + AddrSpace == ARM64AS::PTR32_UPTR) { + MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); + if (PtrVT != LD->getBasePtr().getSimpleValueType()) { + SDValue Cast = + DAG.getAddrSpaceCast(DL, PtrVT, LD->getBasePtr(), AddrSpace, 0); + return DAG.getExtLoad(LD->getExtensionType(), DL, RegVT, LD->getChain(), + Cast, LD->getPointerInfo(), MemVT, + LD->getOriginalAlign(), + LD->getMemOperand()->getFlags()); + } + } + if (LD->isVolatile() || !Subtarget->isLittleEndian()) return SDValue(N, 0); @@ -23567,13 +23621,11 @@ static SDValue performLOADCombine(SDNode *N, if (!LD->isNonTemporal()) return SDValue(N, 0); - EVT MemVT = LD->getMemoryVT(); if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 || MemVT.getSizeInBits() % 256 == 0 || 256 % MemVT.getScalarSizeInBits() != 0) return SDValue(N, 0); - SDLoc DL(LD); SDValue Chain = LD->getChain(); SDValue BasePtr = LD->getBasePtr(); SDNodeFlags Flags = LD->getFlags(); @@ -23833,12 +23885,28 @@ static SDValue performSTORECombine(SDNode *N, SDValue Value = ST->getValue(); SDValue Ptr = ST->getBasePtr(); EVT ValueVT = Value.getValueType(); + EVT MemVT = ST->getMemoryVT(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDLoc DL(ST); auto hasValidElementTypeForFPTruncStore = [](EVT VT) { EVT EltVT = VT.getVectorElementType(); return EltVT == MVT::f32 || EltVT == MVT::f64; }; + // Cast ptr32 and ptr64 pointers to the default address space before a store. + unsigned AddrSpace = ST->getAddressSpace(); + if (AddrSpace == ARM64AS::PTR64 || AddrSpace == ARM64AS::PTR32_SPTR || + AddrSpace == ARM64AS::PTR32_UPTR) { + MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); + if (PtrVT != Ptr.getSimpleValueType()) { + SDValue Cast = DAG.getAddrSpaceCast(DL, PtrVT, Ptr, AddrSpace, 0); + return DAG.getStore(Chain, DL, Value, Cast, ST->getPointerInfo(), + ST->getOriginalAlign(), + ST->getMemOperand()->getFlags(), ST->getAAInfo()); + } + } + if (SDValue Res = combineI8TruncStore(ST, DAG, Subtarget)) return Res; @@ -23852,8 +23920,8 @@ static SDValue performSTORECombine(SDNode *N, ValueVT.isFixedLengthVector() && ValueVT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits() && hasValidElementTypeForFPTruncStore(Value.getOperand(0).getValueType())) - return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0), Ptr, - ST->getMemoryVT(), ST->getMemOperand()); + return DAG.getTruncStore(Chain, DL, Value.getOperand(0), Ptr, MemVT, + ST->getMemOperand()); if (SDValue Split = splitStores(N, DCI, DAG, Subtarget)) return Split; @@ -27394,6 +27462,11 @@ void AArch64TargetLowering::ReplaceNodeResults( ReplaceATOMIC_LOAD_128Results(N, Results, DAG, Subtarget); return; } + case ISD::ADDRSPACECAST: { + SDValue V = LowerADDRSPACECAST(SDValue(N, 0), DAG); + Results.push_back(V); + return; + } case ISD::ATOMIC_LOAD: case ISD::LOAD: { MemSDNode *LoadNode = cast<MemSDNode>(N); diff --git llvm/lib/Target/AArch64/AArch64ISelLowering.h llvm/lib/Target/AArch64/AArch64ISelLowering.h index 470ed2a06b70..b26f28dc79f8 100644 --- llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -563,6 +563,10 @@ const unsigned StackProbeMaxLoopUnroll = 4; } // namespace AArch64 +namespace ARM64AS { +enum : unsigned { PTR32_SPTR = 270, PTR32_UPTR = 271, PTR64 = 272 }; +} + class AArch64Subtarget; class AArch64TargetLowering : public TargetLowering { @@ -594,11 +598,19 @@ public: unsigned Depth) const override; MVT 
getPointerTy(const DataLayout &DL, uint32_t AS = 0) const override { - // Returning i64 unconditionally here (i.e. even for ILP32) means that the - // *DAG* representation of pointers will always be 64-bits. They will be - // truncated and extended when transferred to memory, but the 64-bit DAG - // allows us to use AArch64's addressing modes much more easily. - return MVT::getIntegerVT(64); + if ((AS == ARM64AS::PTR32_SPTR) || (AS == ARM64AS::PTR32_UPTR)) { + // These are 32-bit pointers created using the `__ptr32` extension or + // similar. They are handled by marking them as being in a different + // address space, and will be extended to 64-bits when used as the target + // of a load or store operation, or cast to a 64-bit pointer type. + return MVT::i32; + } else { + // Returning i64 unconditionally here (i.e. even for ILP32) means that the + // *DAG* representation of pointers will always be 64-bits. They will be + // truncated and extended when transferred to memory, but the 64-bit DAG + // allows us to use AArch64's addressing modes much more easily. + return MVT::i64; + } } bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, diff --git llvm/lib/Target/AArch64/AArch64InstrInfo.cpp llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 17dd8a073eff..0f2b969fba35 100644 --- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -81,7 +81,7 @@ static cl::opt<unsigned> AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI) : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP, AArch64::CATCHRET), - RI(STI.getTargetTriple()), Subtarget(STI) {} + RI(STI.getTargetTriple(), STI.getHwMode()), Subtarget(STI) {} /// GetInstSize - Return the number of bytes of code the specified /// instruction may be. This returns the maximum number of bytes. 
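After the getPointerTy change above, pointer width is no longer uniformly 64-bit; it depends on the address space (270 and 271 are 32-bit, 272 and the default space are 64-bit), so generic code should query the width per address space instead of assuming 64 bits. An illustrative helper:

```cpp
#include "llvm/IR/DataLayout.h"

// Returns 32 for ARM64AS::PTR32_SPTR/PTR32_UPTR, 64 for the default
// address space and ARM64AS::PTR64.
unsigned pointerWidthInBits(const llvm::DataLayout &DL, unsigned AddrSpace) {
  return DL.getPointerSizeInBits(AddrSpace);
}
```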
@@ -2438,6 +2438,8 @@ unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) { case AArch64::STZ2Gi: case AArch64::STZGi: case AArch64::TAGPstack: + case AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO: + case AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO: return 2; case AArch64::LD1B_D_IMM: case AArch64::LD1B_H_IMM: @@ -4223,6 +4225,8 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale, MinOffset = -256; MaxOffset = 254; break; + case AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO: + case AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO: case AArch64::LDR_ZXI: case AArch64::STR_ZXI: Scale = TypeSize::getScalable(16); @@ -5355,6 +5359,11 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, "Unexpected register store without SVE store instructions"); Opc = AArch64::STR_ZXI; StackID = TargetStackID::ScalableVector; + } else if (AArch64::PPRRegClass.hasSubClassEq(RC)) { + assert(Subtarget.isSVEorStreamingSVEAvailable() && + "Unexpected predicate store without SVE store instructions"); + Opc = AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO; + StackID = TargetStackID::ScalableVector; } break; case 24: @@ -5527,6 +5536,11 @@ void AArch64InstrInfo::loadRegFromStackSlot( "Unexpected register load without SVE load instructions"); Opc = AArch64::LDR_ZXI; StackID = TargetStackID::ScalableVector; + } else if (AArch64::PPRRegClass.hasSubClassEq(RC)) { + assert(Subtarget.isSVEorStreamingSVEAvailable() && + "Unexpected predicate load without SVE load instructions"); + Opc = AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO; + StackID = TargetStackID::ScalableVector; } break; case 24: diff --git llvm/lib/Target/AArch64/AArch64InstrInfo.td llvm/lib/Target/AArch64/AArch64InstrInfo.td index b77246200db6..3c57ba414b2b 100644 --- llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -1027,9 +1027,9 @@ def add_and_or_is_add : PatFrags<(ops node:$lhs, node:$rhs), return CurDAG->isADDLike(SDValue(N,0)); }]> { let GISelPredicateCode = [{ - // Only handle G_ADD for now. FIXME. build capability to compute whether - // operands of G_OR have common bits set or not. 
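The getMemOpInfo and storeRegToStackSlot/loadRegFromStackSlot changes above give the new predicate spill/fill pseudos the same addressing scale as full Z-register loads and stores: the slot is one scalable 16-byte granule, not the 2 bytes a P register occupies. A minimal sketch of that slot size, using LLVM's TypeSize API:

```cpp
#include "llvm/Support/TypeSize.h"
#include <cassert>

int main() {
  // One ZPR-sized spill slot: 16 bytes scaled by the runtime vscale.
  llvm::TypeSize Slot = llvm::TypeSize::getScalable(16);
  assert(Slot.isScalable() && Slot.getKnownMinValue() == 16);
  return 0;
}
```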
- return MI.getOpcode() == TargetOpcode::G_ADD; + return MI.getOpcode() == TargetOpcode::G_ADD || + (MI.getOpcode() == TargetOpcode::G_OR && + MI.getFlag(MachineInstr::MIFlag::Disjoint)); }]; } diff --git llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 5973b63b5a80..8fd34325bb00 100644 --- llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -20,6 +20,7 @@ #include "MCTargetDesc/AArch64InstPrinter.h" #include "llvm/ADT/BitVector.h" #include "llvm/BinaryFormat/Dwarf.h" +#include "llvm/CodeGen/LiveRegMatrix.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -38,8 +39,8 @@ using namespace llvm; #define GET_REGINFO_TARGET_DESC #include "AArch64GenRegisterInfo.inc" -AArch64RegisterInfo::AArch64RegisterInfo(const Triple &TT) - : AArch64GenRegisterInfo(AArch64::LR), TT(TT) { +AArch64RegisterInfo::AArch64RegisterInfo(const Triple &TT, unsigned HwMode) + : AArch64GenRegisterInfo(AArch64::LR, 0, 0, 0, HwMode), TT(TT) { AArch64_MC::initLLVMToCVRegMapping(this); } @@ -1097,7 +1098,11 @@ bool AArch64RegisterInfo::getRegAllocationHints( Register VirtReg, ArrayRef<MCPhysReg> Order, SmallVectorImpl<MCPhysReg> &Hints, const MachineFunction &MF, const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const { - const MachineRegisterInfo &MRI = MF.getRegInfo(); + + auto &ST = MF.getSubtarget<AArch64Subtarget>(); + if (!ST.hasSME() || !ST.isStreaming()) + return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF, + VRM); // The SVE calling convention preserves registers Z8-Z23. As a result, there // are no ZPR2Strided or ZPR4Strided registers that do not overlap with the @@ -1107,26 +1112,127 @@ bool AArch64RegisterInfo::getRegAllocationHints( // FORM_TRANSPOSED_REG_TUPLE pseudo, we want to favour reducing copy // instructions over reducing the number of clobbered callee-save registers, // so we add the strided registers as a hint. + const MachineRegisterInfo &MRI = MF.getRegInfo(); unsigned RegID = MRI.getRegClass(VirtReg)->getID(); - // Look through uses of the register for FORM_TRANSPOSED_REG_TUPLE. - if ((RegID == AArch64::ZPR2StridedOrContiguousRegClassID || - RegID == AArch64::ZPR4StridedOrContiguousRegClassID) && - any_of(MRI.use_nodbg_instructions(VirtReg), [](const MachineInstr &Use) { - return Use.getOpcode() == - AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO || - Use.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO; - })) { - const TargetRegisterClass *StridedRC = - RegID == AArch64::ZPR2StridedOrContiguousRegClassID - ? &AArch64::ZPR2StridedRegClass - : &AArch64::ZPR4StridedRegClass; - - for (MCPhysReg Reg : Order) - if (StridedRC->contains(Reg)) - Hints.push_back(Reg); + if (RegID == AArch64::ZPR2StridedOrContiguousRegClassID || + RegID == AArch64::ZPR4StridedOrContiguousRegClassID) { + + // Look through uses of the register for FORM_TRANSPOSED_REG_TUPLE. 
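The GISelPredicateCode change above lets a G_OR carrying the disjoint flag stand in for G_ADD. The justification is plain bit arithmetic: with no common set bits there are no carries, so OR and ADD produce the same value. A standalone check:

```cpp
#include <cassert>

int main() {
  unsigned A = 0b0101, B = 0b1010;
  assert((A & B) == 0u);    // disjoint: the operands share no set bits
  assert((A | B) == A + B); // hence OR and ADD agree (both are 15)
  return 0;
}
```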
+ for (const MachineInstr &Use : MRI.use_nodbg_instructions(VirtReg)) { + if (Use.getOpcode() != AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO && + Use.getOpcode() != AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO) + continue; + + unsigned UseOps = Use.getNumOperands() - 1; + const TargetRegisterClass *StridedRC; + switch (RegID) { + case AArch64::ZPR2StridedOrContiguousRegClassID: + StridedRC = &AArch64::ZPR2StridedRegClass; + break; + case AArch64::ZPR4StridedOrContiguousRegClassID: + StridedRC = &AArch64::ZPR4StridedRegClass; + break; + default: + llvm_unreachable("Unexpected RegID"); + } - return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF, - VRM); + SmallVector<MCPhysReg, 4> StridedOrder; + for (MCPhysReg Reg : Order) + if (StridedRC->contains(Reg)) + StridedOrder.push_back(Reg); + + int OpIdx = Use.findRegisterUseOperandIdx(VirtReg, this); + assert(OpIdx != -1 && "Expected operand index from register use."); + + unsigned TupleID = MRI.getRegClass(Use.getOperand(0).getReg())->getID(); + bool IsMulZPR = TupleID == AArch64::ZPR2Mul2RegClassID || + TupleID == AArch64::ZPR4Mul4RegClassID; + + const MachineOperand *AssignedRegOp = llvm::find_if( + make_range(Use.operands_begin() + 1, Use.operands_end()), + [&VRM](const MachineOperand &Op) { + return VRM->hasPhys(Op.getReg()); + }); + + // Example: + // + // When trying to find a suitable register allocation for VirtReg %v2 in: + // + // %v0:zpr2stridedorcontiguous = ld1 p0/z, [...] + // %v1:zpr2stridedorcontiguous = ld1 p0/z, [...] + // %v2:zpr2stridedorcontiguous = ld1 p0/z, [...] + // %v3:zpr2stridedorcontiguous = ld1 p0/z, [...] + // %v4:zpr4mul4 = FORM_TRANSPOSED_X4 %v0:0, %v1:0, %v2:0, %v3:0 + // + // One such suitable allocation would be: + // + // { z0, z8 } = ld1 p0/z, [...] + // { z1, z9 } = ld1 p0/z, [...] + // { z2, z10 } = ld1 p0/z, [...] + // { z3, z11 } = ld1 p0/z, [...] + // { z0, z1, z2, z3 } = + // FORM_TRANSPOSED_X4 {z0, z8}:0, {z1, z9}:0, {z2, z10}:0, {z3, z11}:0 + // + // Below we distinguish two cases when trying to find a register: + // * None of the registers used by FORM_TRANSPOSED_X4 have been assigned + // yet. In this case the code must ensure that there are at least UseOps + // free consecutive registers. If IsMulZPR is true, then the first of the + // registers must also be a multiple of UseOps, e.g. { z0, z1, z2, z3 } + // is valid but { z1, z2, z3, z5 } is not. + // * One or more of the registers used by FORM_TRANSPOSED_X4 is already + // assigned a physical register, which means only checking that a + // consecutive range of free tuple registers exists which includes + // the assigned register. + // e.g. in the example above, if { z0, z8 } is already allocated for + // %v0, we just need to ensure that { z1, z9 }, { z2, z10 } and + // { z3, z11 } are also free. If so, we add { z2, z10 }. + + if (AssignedRegOp == Use.operands_end()) { + // There are no registers already assigned to any of the pseudo + // operands. Look for a valid starting register for the group. + for (unsigned I = 0; I < StridedOrder.size(); ++I) { + MCPhysReg Reg = StridedOrder[I]; + SmallVector<MCPhysReg> Regs; + + // If the FORM_TRANSPOSE nodes use the ZPRMul classes, the starting + // register of the first load should be a multiple of 2 or 4.
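A worked form of the multiple-of-UseOps rule described above, written as a standalone check for the 4-tuple case (illustrative only):

```cpp
#include <cassert>

// A Mul4 tuple must start at a Z register index divisible by 4, so
// { z0..z3 } and { z4..z7 } are valid groups while { z1..z4 } is not.
bool validMul4Start(unsigned ZIdx) { return ZIdx % 4 == 0; }

int main() {
  assert(validMul4Start(0) && validMul4Start(4) && !validMul4Start(1));
  return 0;
}
```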
+ unsigned SubRegIdx = Use.getOperand(OpIdx).getSubReg(); + if (IsMulZPR && (getSubReg(Reg, SubRegIdx) - AArch64::Z0) % UseOps != + ((unsigned)OpIdx - 1)) + continue; + + // In the example above, if VirtReg is the third operand of the + // tuple (%v2) and Reg == Z2_Z10, then we need to make sure that + // Z0_Z8, Z1_Z9 and Z3_Z11 are also available. + auto IsFreeConsecutiveReg = [&](unsigned UseOp) { + unsigned R = Reg - (OpIdx - 1) + UseOp; + return StridedRC->contains(R) && + (UseOp == 0 || + ((getSubReg(R, AArch64::zsub0) - AArch64::Z0) == + (getSubReg(R - 1, AArch64::zsub0) - AArch64::Z0) + 1)) && + !Matrix->isPhysRegUsed(R); + }; + if (all_of(iota_range<unsigned>(0U, UseOps, /*Inclusive=*/false), + IsFreeConsecutiveReg)) + Hints.push_back(Reg); + } + } else { + // At least one operand already has a physical register assigned. + // Find the starting sub-register of this and use it to work out the + // correct strided register to suggest based on the current op index. + MCPhysReg TargetStartReg = + getSubReg(VRM->getPhys(AssignedRegOp->getReg()), AArch64::zsub0) + + (OpIdx - AssignedRegOp->getOperandNo()); + + for (unsigned I = 0; I < StridedOrder.size(); ++I) + if (getSubReg(StridedOrder[I], AArch64::zsub0) == TargetStartReg) + Hints.push_back(StridedOrder[I]); + } + + if (!Hints.empty()) + return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, + MF, VRM); + } } for (MachineInstr &MI : MRI.def_instructions(VirtReg)) { diff --git llvm/lib/Target/AArch64/AArch64RegisterInfo.h llvm/lib/Target/AArch64/AArch64RegisterInfo.h index 11da624af488..898a509f7590 100644 --- llvm/lib/Target/AArch64/AArch64RegisterInfo.h +++ llvm/lib/Target/AArch64/AArch64RegisterInfo.h @@ -27,7 +27,7 @@ class AArch64RegisterInfo final : public AArch64GenRegisterInfo { const Triple &TT; public: - AArch64RegisterInfo(const Triple &TT); + AArch64RegisterInfo(const Triple &TT, unsigned HwMode); // FIXME: This should be tablegen'd like getDwarfRegNum is int getSEHRegNum(unsigned i) const { diff --git llvm/lib/Target/AArch64/AArch64RegisterInfo.td llvm/lib/Target/AArch64/AArch64RegisterInfo.td index dd4f2549929f..fed9b7b173e9 100644 --- llvm/lib/Target/AArch64/AArch64RegisterInfo.td +++ llvm/lib/Target/AArch64/AArch64RegisterInfo.td @@ -979,10 +979,19 @@ class ZPRRegOp <string Suffix, AsmOperandClass C, ElementSizeEnum Size, //****************************************************************************** // SVE predicate register classes. + +// Note: This hardware mode is enabled in AArch64Subtarget::getHwModeSet() +// (without the use of the table-gen'd predicates). +def SMEWithZPRPredicateSpills : HwMode<"", [Predicate<"false">]>; + +def PPRSpillFillRI : RegInfoByHwMode< + [DefaultMode, SMEWithZPRPredicateSpills], + [RegInfo<16,16,16>, RegInfo<16,128,128>]>; + class PPRClass<int firstreg, int lastreg, int step = 1> : RegisterClass<"AArch64", [ nxv16i1, nxv8i1, nxv4i1, nxv2i1, nxv1i1 ], 16, (sequence "P%u", firstreg, lastreg, step)> { - let Size = 16; + let RegInfos = PPRSpillFillRI; } def PPR : PPRClass<0, 15> { diff --git llvm/lib/Target/AArch64/AArch64StackTagging.cpp llvm/lib/Target/AArch64/AArch64StackTagging.cpp index 694ee17f8e5f..fad83bbebd5d 100644 --- llvm/lib/Target/AArch64/AArch64StackTagging.cpp +++ llvm/lib/Target/AArch64/AArch64StackTagging.cpp @@ -575,7 +575,7 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { TagPCall->setName(Info.AI->getName() + ".tag"); // Does not replace metadata, so we don't have to handle DbgVariableRecords. 
Info.AI->replaceUsesWithIf(TagPCall, [&](const Use &U) { - return !memtag::isLifetimeIntrinsic(U.getUser()); + return !isa<LifetimeIntrinsic>(U.getUser()); }); TagPCall->setOperand(0, Info.AI); diff --git llvm/lib/Target/AArch64/AArch64Subtarget.cpp llvm/lib/Target/AArch64/AArch64Subtarget.cpp index bc921f07e1db..68c386585a79 100644 --- llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -86,6 +86,11 @@ static cl::alias AArch64StreamingStackHazardSize( cl::desc("alias for -aarch64-streaming-hazard-size"), cl::aliasopt(AArch64StreamingHazardSize)); +static cl::opt<bool> EnableZPRPredicateSpills( + "aarch64-enable-zpr-predicate-spills", cl::init(false), cl::Hidden, + cl::desc( + "Enables spilling/reloading SVE predicates as data vectors (ZPRs)")); + // Subreg liveness tracking is disabled by default for now until all issues // are ironed out. This option allows the feature to be used in tests. static cl::opt<bool> @@ -400,6 +405,20 @@ AArch64Subtarget::AArch64Subtarget(const Triple &TT, StringRef CPU, EnableSubregLiveness = EnableSubregLivenessTracking.getValue(); } +unsigned AArch64Subtarget::getHwModeSet() const { + AArch64HwModeBits Modes = AArch64HwModeBits::DefaultMode; + + // Use a special hardware mode in streaming[-compatible] functions with + // aarch64-enable-zpr-predicate-spills. This changes the spill size (and + // alignment) for the predicate register class. + if (EnableZPRPredicateSpills.getValue() && + (isStreaming() || isStreamingCompatible())) { + Modes |= AArch64HwModeBits::SMEWithZPRPredicateSpills; + } + + return to_underlying(Modes); +} + const CallLowering *AArch64Subtarget::getCallLowering() const { return CallLoweringInfo.get(); } diff --git llvm/lib/Target/AArch64/AArch64Subtarget.h llvm/lib/Target/AArch64/AArch64Subtarget.h index d22991224d49..e7757907a664 100644 --- llvm/lib/Target/AArch64/AArch64Subtarget.h +++ llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -130,6 +130,8 @@ public: bool IsStreaming = false, bool IsStreamingCompatible = false, bool HasMinSize = false); + virtual unsigned getHwModeSet() const override; + // Getters for SubtargetFeatures defined in tablegen #define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \ bool GETTER() const { return ATTRIBUTE; } diff --git llvm/lib/Target/AArch64/AArch64TargetMachine.h llvm/lib/Target/AArch64/AArch64TargetMachine.h index 76b1c9d917ec..621adb380dbc 100644 --- llvm/lib/Target/AArch64/AArch64TargetMachine.h +++ llvm/lib/Target/AArch64/AArch64TargetMachine.h @@ -68,8 +68,7 @@ public: /// Returns true if a cast between SrcAS and DestAS is a noop. bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override { - // Addrspacecasts are always noops. - return true; + return getPointerSize(SrcAS) == getPointerSize(DestAS); } private: diff --git llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 43f07be15e9d..335b46b76688 100644 --- llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -4621,7 +4621,7 @@ ParseStatus AArch64AsmParser::tryParseVectorList(OperandVector &Operands, return ParseStatus::NoMatch; }; - int NumRegs = getNumRegsForRegKind(VectorKind); + unsigned NumRegs = getNumRegsForRegKind(VectorKind); SMLoc S = getLoc(); auto LCurly = getTok(); Lex(); // Eat left bracket token. 
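AArch64Subtarget::getHwModeSet above selects between the two PPR RegInfo variants: the default 2-byte predicate spill slot, or a full 16-byte ZPR-sized slot with 128-bit alignment. A hedged restatement of the gating condition; the helper name is illustrative:

```cpp
// The alternate spill size is active only in streaming or
// streaming-compatible functions, and only when the hidden
// aarch64-enable-zpr-predicate-spills flag is set.
bool useZPRPredicateSpillMode(bool FlagEnabled, bool IsStreaming,
                              bool IsStreamingCompatible) {
  return FlagEnabled && (IsStreaming || IsStreamingCompatible);
}
```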
@@ -4638,10 +4638,10 @@ ParseStatus AArch64AsmParser::tryParseVectorList(OperandVector &Operands, if (!ParseRes.isSuccess()) return ParseRes; - int64_t PrevReg = FirstReg; + MCRegister PrevReg = FirstReg; unsigned Count = 1; - int Stride = 1; + unsigned Stride = 1; if (parseOptionalToken(AsmToken::Minus)) { SMLoc Loc = getLoc(); StringRef NextKind; @@ -4656,7 +4656,7 @@ ParseStatus AArch64AsmParser::tryParseVectorList(OperandVector &Operands, return Error(Loc, "mismatched register size suffix"); unsigned Space = - (PrevReg < Reg) ? (Reg - PrevReg) : (Reg + NumRegs - PrevReg); + (PrevReg < Reg) ? (Reg - PrevReg) : (NumRegs - (PrevReg - Reg)); if (Space == 0 || Space > 3) return Error(Loc, "invalid number of vectors"); @@ -4682,7 +4682,7 @@ ParseStatus AArch64AsmParser::tryParseVectorList(OperandVector &Operands, getContext().getRegisterInfo()->getEncodingValue(PrevReg); if (!HasCalculatedStride) { Stride = (PrevRegVal < RegVal) ? (RegVal - PrevRegVal) - : (RegVal + NumRegs - PrevRegVal); + : (NumRegs - (PrevRegVal - RegVal)); HasCalculatedStride = true; } diff --git llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index 07f03644336c..467094e9befe 100644 --- llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -2999,9 +2999,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { bool IsZExtLoad = I.getOpcode() == TargetOpcode::G_ZEXTLOAD; LLT PtrTy = MRI.getType(LdSt.getPointerReg()); + // Can only handle AddressSpace 0, 64-bit pointers. if (PtrTy != LLT::pointer(0, 64)) { - LLVM_DEBUG(dbgs() << "Load/Store pointer has type: " << PtrTy - << ", expected: " << LLT::pointer(0, 64) << '\n'); return false; } diff --git llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 93461e39f955..fdedf44e0ba1 100644 --- llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -1215,11 +1215,13 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {s32, v4s32}, {s32, v2s32}, {s64, v2s64}}) + .moreElementsToNextPow2(1) .clampMaxNumElements(1, s64, 2) .clampMaxNumElements(1, s32, 4) .clampMaxNumElements(1, s16, 8) .clampMaxNumElements(1, s8, 16) - .lower(); + .widenVectorEltsToVectorMinSize(1, 64) + .scalarize(1); getActionDefinitionsBuilder({G_VECREDUCE_FMIN, G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM}) diff --git llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h index a33f0bc78c21..9183fb4cc5f5 100644 --- llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h +++ llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h @@ -137,12 +137,6 @@ public: }; class AArch64TargetWinCOFFStreamer : public llvm::AArch64TargetStreamer { -private: - // True if we are processing SEH directives in an epilogue. - bool InEpilogCFI = false; - - // Symbol of the current epilog for which we are processing SEH directives. 
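The revised distance computation in tryParseVectorList above handles register lists that wrap around the end of the register file. A worked standalone example, assuming a 32-register file:

```cpp
#include <cassert>

int main() {
  // A list written as "{ v30 - v1 }" wraps through v31 and v0, so the
  // distance from v30 to v1 is 3.
  unsigned NumRegs = 32, Prev = 30, Cur = 1;
  unsigned Space = (Prev < Cur) ? (Cur - Prev) : (NumRegs - (Prev - Cur));
  assert(Space == 3);
  return 0;
}
```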
- MCSymbol *CurrentEpilog = nullptr; public: AArch64TargetWinCOFFStreamer(llvm::MCStreamer &S) : AArch64TargetStreamer(S) {} diff --git llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp index 208d43502cb8..8075f53ae308 100644 --- llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp +++ llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp @@ -73,8 +73,8 @@ void AArch64TargetWinCOFFStreamer::emitARM64WinUnwindCode(unsigned UnwindCode, if (!CurFrame) return; auto Inst = WinEH::Instruction(UnwindCode, /*Label=*/nullptr, Reg, Offset); - if (InEpilogCFI) - CurFrame->EpilogMap[CurrentEpilog].Instructions.push_back(Inst); + if (S.isInEpilogCFI()) + CurFrame->EpilogMap[S.getCurrentEpilog()].Instructions.push_back(Inst); else CurFrame->Instructions.push_back(Inst); } @@ -183,13 +183,7 @@ void AArch64TargetWinCOFFStreamer::emitARM64WinCFIPrologEnd() { } void AArch64TargetWinCOFFStreamer::emitARM64WinCFIEpilogStart() { - auto &S = getStreamer(); - WinEH::FrameInfo *CurFrame = S.EnsureValidWinFrameInfo(SMLoc()); - if (!CurFrame) - return; - - InEpilogCFI = true; - CurrentEpilog = S.emitCFILabel(); + getStreamer().emitWinCFIBeginEpilogue(); } void AArch64TargetWinCOFFStreamer::emitARM64WinCFIEpilogEnd() { @@ -198,13 +192,12 @@ void AArch64TargetWinCOFFStreamer::emitARM64WinCFIEpilogEnd() { if (!CurFrame) return; - InEpilogCFI = false; - WinEH::Instruction Inst = - WinEH::Instruction(Win64EH::UOP_End, /*Label=*/nullptr, -1, 0); - CurFrame->EpilogMap[CurrentEpilog].Instructions.push_back(Inst); - MCSymbol *Label = S.emitCFILabel(); - CurFrame->EpilogMap[CurrentEpilog].End = Label; - CurrentEpilog = nullptr; + if (S.isInEpilogCFI()) { + WinEH::Instruction Inst = + WinEH::Instruction(Win64EH::UOP_End, /*Label=*/nullptr, -1, 0); + CurFrame->EpilogMap[S.getCurrentEpilog()].Instructions.push_back(Inst); + } + S.emitWinCFIEndEpilogue(); } void AArch64TargetWinCOFFStreamer::emitARM64WinCFITrapFrame() { diff --git llvm/lib/Target/AArch64/SMEInstrFormats.td llvm/lib/Target/AArch64/SMEInstrFormats.td index a01d59d0e5c4..0ac131e48c4f 100644 --- llvm/lib/Target/AArch64/SMEInstrFormats.td +++ llvm/lib/Target/AArch64/SMEInstrFormats.td @@ -59,6 +59,20 @@ def FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO : let hasPostISelHook = 1; } +def SPILL_PPR_TO_ZPR_SLOT_PSEUDO : + Pseudo<(outs), (ins PPRorPNRAny:$Pt, GPR64sp:$Rn, simm9:$imm9), []>, Sched<[]> +{ + let mayStore = 1; + let hasSideEffects = 0; +} + +def FILL_PPR_FROM_ZPR_SLOT_PSEUDO : + Pseudo<(outs PPRorPNRAny:$Pt), (ins GPR64sp:$Rn, simm9:$imm9), []>, Sched<[]> +{ + let mayLoad = 1; + let hasSideEffects = 0; +} + def SDTZALoadStore : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisInt<2>]>; def AArch64SMELdr : SDNode<"AArch64ISD::SME_ZA_LDR", SDTZALoadStore, [SDNPHasChain, SDNPSideEffect, SDNPMayLoad]>; diff --git llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 40eaba2c0920..3bbbbcf71d8a 100644 --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1207,9 +1207,8 @@ static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size, unsigned FakeS16Opc, unsigned S32Opc, unsigned S64Opc) { if (Size == 16) - // FIXME-TRUE16 use TrueS16Opc when realtrue16 is supported for CMP code return ST.hasTrue16BitInsts() - ? ST.useRealTrue16Insts() ? FakeS16Opc : FakeS16Opc + ? ST.useRealTrue16Insts() ? 
TrueS16Opc : FakeS16Opc : S16Opc; if (Size == 32) return S32Opc; diff --git llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index d8f441d1ccfe..ed922245b3e2 100644 --- llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -4853,7 +4853,10 @@ bool AMDGPUAsmParser::validateAGPRLdSt(const MCInst &Inst) const { bool AMDGPUAsmParser::validateVGPRAlign(const MCInst &Inst) const { auto FB = getFeatureBits(); - if (!FB[AMDGPU::FeatureGFX90AInsts]) + unsigned Opc = Inst.getOpcode(); + // DS_READ_B96_TR_B6 is the only DS instruction in GFX950 that allows + // an unaligned VGPR. All others only allow even aligned VGPRs. + if (!(FB[AMDGPU::FeatureGFX90AInsts]) || Opc == AMDGPU::DS_READ_B96_TR_B6_vi) return true; const MCRegisterInfo *MRI = getMRI(); diff --git llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index de2095fa60ff..3d6419778f4b 100644 --- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -53,6 +53,11 @@ static cl::opt<bool> "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden); +static cl::opt<bool> ForceEmitZeroLoadFlag( + "amdgpu-waitcnt-load-forcezero", + cl::desc("Force all waitcnt load counters to wait until 0"), + cl::init(false), cl::Hidden); + namespace { // Class of object that encapsulates latest instruction counter score // associated with the operand. Used for determining whether @@ -1850,6 +1855,9 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, Wait.BvhCnt = 0; } + if (ForceEmitZeroLoadFlag && Wait.LoadCnt != ~0u) + Wait.LoadCnt = 0; + return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets, OldWaitcntInstr); } diff --git llvm/lib/Target/AMDGPU/SIInstrInfo.cpp llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 5727d14ec49e..35667801c809 100644 --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2366,11 +2366,11 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { assert(ST.useVGPRIndexMode()); Register VecReg = MI.getOperand(0).getReg(); bool IsUndef = MI.getOperand(1).isUndef(); - Register Idx = MI.getOperand(3).getReg(); + MachineOperand &Idx = MI.getOperand(3); Register SubReg = MI.getOperand(4).getImm(); MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON)) - .addReg(Idx) + .add(Idx) .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE); SetOn->getOperand(3).setIsUndef(); diff --git llvm/lib/Target/AMDGPU/SIInstructions.td llvm/lib/Target/AMDGPU/SIInstructions.td index 5af46989aca9..bee4c47a23ba 100644 --- llvm/lib/Target/AMDGPU/SIInstructions.td +++ llvm/lib/Target/AMDGPU/SIInstructions.td @@ -228,16 +228,39 @@ def S_INVERSE_BALLOT_U64 : SPseudoInstSI< // Pseudo instructions used for @llvm.fptrunc.round. The final codegen is done // in the ModeRegister pass.
let Uses = [MODE, EXEC] in { +let True16Predicate = NotHasTrue16BitInsts in def FPTRUNC_ROUND_F16_F32_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst), (ins VGPR_32:$src0, i32imm:$round)>; +let True16Predicate = UseFakeTrue16Insts in +def FPTRUNC_ROUND_F16_F32_PSEUDO_fake16_e32 : VPseudoInstSI <(outs VGPR_32:$vdst), + (ins VGPR_32:$src0, i32imm:$round)>; + +let True16Predicate = UseRealTrue16Insts in +// The operands of these pseudos should match V_CVT_F16_F32_t16_e64 +def FPTRUNC_ROUND_F16_F32_PSEUDO_t16_e64 : VPseudoInstSI <(outs VOPDstOperand_t16:$vdst), + (ins FP32InputMods:$src0_modifiers, VSrc_f32:$src0, Clamp0:$clamp, omod0:$omod, op_sel0:$op_sel, i32imm:$round)> { + let FPClamp = 1; + let ClampLo = 1; + let UseNamedOperandTable = 1; +} + def FPTRUNC_ROUND_F32_F64_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst), (ins VReg_64:$src0, i32imm:$round)>; } // End Uses = [MODE, EXEC] +let True16Predicate = NotHasTrue16BitInsts in def : GCNPat <(f16 (fptrunc_round f32:$src0, (i32 SupportedRoundMode:$round))), (FPTRUNC_ROUND_F16_F32_PSEUDO $src0, (as_hw_round_mode $round))>; +let True16Predicate = UseFakeTrue16Insts in +def : GCNPat <(f16 (fptrunc_round f32:$src0, (i32 SupportedRoundMode:$round))), + (FPTRUNC_ROUND_F16_F32_PSEUDO_fake16_e32 $src0, (as_hw_round_mode $round))>; + +let True16Predicate = UseRealTrue16Insts in +def : GCNPat <(f16 (fptrunc_round (f32 (VOP3OpSelMods f32:$src0, i32:$src0_modifiers)), (i32 SupportedRoundMode:$round))), + (FPTRUNC_ROUND_F16_F32_PSEUDO_t16_e64 $src0_modifiers, $src0, (as_hw_round_mode $round))>; + def : GCNPat <(f32 (fptrunc_round f64:$src0, (i32 SupportedRoundMode:$round))), (FPTRUNC_ROUND_F32_F64_PSEUDO $src0, (as_hw_round_mode $round))>; diff --git llvm/lib/Target/AMDGPU/SIModeRegister.cpp llvm/lib/Target/AMDGPU/SIModeRegister.cpp index 412e2f2fe45d..99aea52c184d 100644 --- llvm/lib/Target/AMDGPU/SIModeRegister.cpp +++ llvm/lib/Target/AMDGPU/SIModeRegister.cpp @@ -166,6 +166,8 @@ Status SIModeRegister::getInstructionMode(MachineInstr &MI, unsigned Opcode = MI.getOpcode(); if (TII->usesFPDPRounding(MI) || Opcode == AMDGPU::FPTRUNC_ROUND_F16_F32_PSEUDO || + Opcode == AMDGPU::FPTRUNC_ROUND_F16_F32_PSEUDO_fake16_e32 || + Opcode == AMDGPU::FPTRUNC_ROUND_F16_F32_PSEUDO_t16_e64 || Opcode == AMDGPU::FPTRUNC_ROUND_F32_F64_PSEUDO) { switch (Opcode) { case AMDGPU::V_INTERP_P1LL_F16: @@ -177,19 +179,19 @@ Status SIModeRegister::getInstructionMode(MachineInstr &MI, case AMDGPU::FPTRUNC_ROUND_F16_F32_PSEUDO: { unsigned Mode = MI.getOperand(2).getImm(); MI.removeOperand(2); - // Replacing the pseudo by a real instruction in place - if (TII->getSubtarget().hasTrue16BitInsts()) { - MachineBasicBlock &MBB = *MI.getParent(); - MachineInstrBuilder B(*MBB.getParent(), MI); - MI.setDesc(TII->get(AMDGPU::V_CVT_F16_F32_fake16_e64)); - MachineOperand Src0 = MI.getOperand(1); - MI.removeOperand(1); - B.addImm(0); // src0_modifiers - B.add(Src0); // re-add src0 operand - B.addImm(0); // clamp - B.addImm(0); // omod - } else - MI.setDesc(TII->get(AMDGPU::V_CVT_F16_F32_e32)); + MI.setDesc(TII->get(AMDGPU::V_CVT_F16_F32_e32)); + return Status(FP_ROUND_MODE_DP(3), FP_ROUND_MODE_DP(Mode)); + } + case AMDGPU::FPTRUNC_ROUND_F16_F32_PSEUDO_fake16_e32: { + unsigned Mode = MI.getOperand(2).getImm(); + MI.removeOperand(2); + MI.setDesc(TII->get(AMDGPU::V_CVT_F16_F32_fake16_e32)); + return Status(FP_ROUND_MODE_DP(3), FP_ROUND_MODE_DP(Mode)); + } + case AMDGPU::FPTRUNC_ROUND_F16_F32_PSEUDO_t16_e64: { + unsigned Mode = MI.getOperand(6).getImm(); + MI.removeOperand(6); + 
MI.setDesc(TII->get(AMDGPU::V_CVT_F16_F32_t16_e64)); return Status(FP_ROUND_MODE_DP(3), FP_ROUND_MODE_DP(Mode)); } case AMDGPU::FPTRUNC_ROUND_F32_F64_PSEUDO: { diff --git llvm/lib/Target/AMDGPU/VOP3Instructions.td llvm/lib/Target/AMDGPU/VOP3Instructions.td index c06c932a5375..ce73e0ca361d 100644 --- llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -645,8 +645,8 @@ defm V_ADD_I16 : VOP3Inst_t16 <"v_add_i16", VOP_I16_I16_I16>; defm V_SUB_I16 : VOP3Inst_t16 <"v_sub_i16", VOP_I16_I16_I16>; let isCommutable = 1 in { - defm V_MAD_U32_U16 : VOP3Inst <"v_mad_u32_u16", VOP3_Profile<VOP_I32_I16_I16_I32, VOP3_OPSEL>>; - defm V_MAD_I32_I16 : VOP3Inst <"v_mad_i32_i16", VOP3_Profile<VOP_I32_I16_I16_I32, VOP3_OPSEL>>; + defm V_MAD_U32_U16 : VOP3Inst_t16 <"v_mad_u32_u16", VOP_I32_I16_I16_I32>; + defm V_MAD_I32_I16 : VOP3Inst_t16 <"v_mad_i32_i16", VOP_I32_I16_I16_I32>; } // End isCommutable = 1 defm V_CVT_PKNORM_I16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_i16_f16", VOP_B32_F16_F16>; @@ -1736,8 +1736,8 @@ defm V_ADD3_U32 : VOP3_Realtriple_gfx11_gfx12<0x255>; defm V_LSHL_OR_B32 : VOP3_Realtriple_gfx11_gfx12<0x256>; defm V_AND_OR_B32 : VOP3_Realtriple_gfx11_gfx12<0x257>; defm V_OR3_B32 : VOP3_Realtriple_gfx11_gfx12<0x258>; -defm V_MAD_U32_U16 : VOP3_Realtriple_gfx11_gfx12<0x259>; -defm V_MAD_I32_I16 : VOP3_Realtriple_gfx11_gfx12<0x25a>; +defm V_MAD_U32_U16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x259, "v_mad_u32_u16">; +defm V_MAD_I32_I16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x25a, "v_mad_i32_i16">; defm V_PERMLANE16_B32 : VOP3_Real_Base_gfx11_gfx12<0x25b>; defm V_PERMLANEX16_B32 : VOP3_Real_Base_gfx11_gfx12<0x25c>; defm V_MAXMIN_F32 : VOP3_Realtriple_gfx11<0x25e>; diff --git llvm/lib/Target/AMDGPU/VOPCInstructions.td llvm/lib/Target/AMDGPU/VOPCInstructions.td index e16ac4423265..00a3381b3fd4 100644 --- llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -1035,6 +1035,20 @@ multiclass VOPCClassPat64<string inst_name> { >; } +multiclass VOPCClassPat64_t16<string inst_name> { + defvar inst = !cast<VOP_Pseudo>(inst_name#"_t16_e64"); + defvar P = inst.Pfl; + def : GCNPat < + (i1:$sdst + (AMDGPUfp_class + (P.Src0VT (VOP3ModsNonCanonicalizing P.Src0VT:$src0, i32:$src0_modifiers)), + i32:$src1)), + (inst i32:$src0_modifiers, VSrcT_f16:$src0, + 0 /* src1_modifiers */, (f16 (EXTRACT_SUBREG VGPR_32:$src1, lo16)), + 0) /* op_sel */ + >; +} + multiclass VOPCClassPat64_fake16<string inst_name> { defvar inst = !cast<VOP_Pseudo>(inst_name#"_fake16_e64"); defvar P = inst.Pfl; @@ -1158,6 +1172,7 @@ multiclass VOPC_CLASS_F16 <string opName> { } let True16Predicate = UseRealTrue16Insts in { defm _t16 : VOPC_Class_Pseudos <opName#"_t16", VOPC_I1_F16_I16_t16, 0>; + defm : VOPCClassPat64_t16<NAME>; } let True16Predicate = UseFakeTrue16Insts in { defm _fake16 : VOPC_Class_Pseudos <opName#"_fake16", VOPC_I1_F16_I16_fake16, 0>; @@ -1207,27 +1222,30 @@ defm V_CMPX_CLASS_F16 : VOPCX_CLASS_F16 <"v_cmpx_class_f16">; // We need to use COPY_TO_REGCLASS to w/a the problem when ReplaceAllUsesWith() // complaints it cannot replace i1 <-> i64/i32 if node was not morphed in place. 
-multiclass ICMP_Pattern <PatFrags cond, Instruction inst, ValueType vt> { +multiclass ICMP_Pattern <PatFrags cond, Instruction inst, ValueType vt, dag dstInst = (inst $src0, $src1)> { let WaveSizePredicate = isWave64 in def : GCNPat < (i64 (AMDGPUsetcc vt:$src0, vt:$src1, cond)), - (i64 (COPY_TO_REGCLASS (inst $src0, $src1), SReg_64)) + (i64 (COPY_TO_REGCLASS dstInst, SReg_64)) >; let WaveSizePredicate = isWave32 in { def : GCNPat < (i32 (AMDGPUsetcc vt:$src0, vt:$src1, cond)), - (i32 (COPY_TO_REGCLASS (inst $src0, $src1), SReg_32)) + (i32 (COPY_TO_REGCLASS dstInst, SReg_32)) >; // Support codegen of i64 setcc in wave32 mode. def : GCNPat < (i64 (AMDGPUsetcc vt:$src0, vt:$src1, cond)), - (i64 (REG_SEQUENCE SReg_64, (inst $src0, $src1), sub0, (S_MOV_B32 (i32 0)), sub1)) + (i64 (REG_SEQUENCE SReg_64, dstInst, sub0, (S_MOV_B32 (i32 0)), sub1)) >; } } +multiclass ICMP_Pattern_t16<PatFrags cond, Instruction inst, ValueType vt> + : ICMP_Pattern<cond, inst, vt, (inst 0, $src0, 0, $src1)>; + defm : ICMP_Pattern <COND_EQ, V_CMP_EQ_U32_e64, i32>; defm : ICMP_Pattern <COND_NE, V_CMP_NE_U32_e64, i32>; defm : ICMP_Pattern <COND_UGT, V_CMP_GT_U32_e64, i32>; @@ -1250,6 +1268,19 @@ defm : ICMP_Pattern <COND_SGE, V_CMP_GE_I64_e64, i64>; defm : ICMP_Pattern <COND_SLT, V_CMP_LT_I64_e64, i64>; defm : ICMP_Pattern <COND_SLE, V_CMP_LE_I64_e64, i64>; +let True16Predicate = UseRealTrue16Insts in { +defm : ICMP_Pattern_t16 <COND_EQ, V_CMP_EQ_U16_t16_e64, i16>; +defm : ICMP_Pattern_t16 <COND_NE, V_CMP_NE_U16_t16_e64, i16>; +defm : ICMP_Pattern_t16 <COND_UGT, V_CMP_GT_U16_t16_e64, i16>; +defm : ICMP_Pattern_t16 <COND_UGE, V_CMP_GE_U16_t16_e64, i16>; +defm : ICMP_Pattern_t16 <COND_ULT, V_CMP_LT_U16_t16_e64, i16>; +defm : ICMP_Pattern_t16 <COND_ULE, V_CMP_LE_U16_t16_e64, i16>; +defm : ICMP_Pattern_t16 <COND_SGT, V_CMP_GT_I16_t16_e64, i16>; +defm : ICMP_Pattern_t16 <COND_SGE, V_CMP_GE_I16_t16_e64, i16>; +defm : ICMP_Pattern_t16 <COND_SLT, V_CMP_LT_I16_t16_e64, i16>; +defm : ICMP_Pattern_t16 <COND_SLE, V_CMP_LE_I16_t16_e64, i16>; +} // End True16Predicate = UseRealTrue16Insts + let True16Predicate = UseFakeTrue16Insts in { defm : ICMP_Pattern <COND_EQ, V_CMP_EQ_U16_fake16_e64, i16>; defm : ICMP_Pattern <COND_NE, V_CMP_NE_U16_fake16_e64, i16>; @@ -1335,6 +1366,24 @@ defm : FCMP_Pattern <COND_UGE, V_CMP_NLT_F64_e64, f64>; defm : FCMP_Pattern <COND_ULT, V_CMP_NGE_F64_e64, f64>; defm : FCMP_Pattern <COND_ULE, V_CMP_NGT_F64_e64, f64>; +let True16Predicate = UseRealTrue16Insts in { +defm : FCMP_Pattern <COND_O, V_CMP_O_F16_t16_e64, f16>; +defm : FCMP_Pattern <COND_UO, V_CMP_U_F16_t16_e64, f16>; +defm : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F16_t16_e64, f16>; +defm : FCMP_Pattern <COND_ONE, V_CMP_NEQ_F16_t16_e64, f16>; +defm : FCMP_Pattern <COND_OGT, V_CMP_GT_F16_t16_e64, f16>; +defm : FCMP_Pattern <COND_OGE, V_CMP_GE_F16_t16_e64, f16>; +defm : FCMP_Pattern <COND_OLT, V_CMP_LT_F16_t16_e64, f16>; +defm : FCMP_Pattern <COND_OLE, V_CMP_LE_F16_t16_e64, f16>; + +defm : FCMP_Pattern <COND_UEQ, V_CMP_NLG_F16_t16_e64, f16>; +defm : FCMP_Pattern <COND_UNE, V_CMP_NEQ_F16_t16_e64, f16>; +defm : FCMP_Pattern <COND_UGT, V_CMP_NLE_F16_t16_e64, f16>; +defm : FCMP_Pattern <COND_UGE, V_CMP_NLT_F16_t16_e64, f16>; +defm : FCMP_Pattern <COND_ULT, V_CMP_NGE_F16_t16_e64, f16>; +defm : FCMP_Pattern <COND_ULE, V_CMP_NGT_F16_t16_e64, f16>; +} // End True16Predicate = UseRealTrue16Insts + let True16Predicate = UseFakeTrue16Insts in { defm : FCMP_Pattern <COND_O, V_CMP_O_F16_fake16_e64, f16>; defm : FCMP_Pattern <COND_UO, V_CMP_U_F16_fake16_e64, f16>; diff 
--git llvm/lib/Target/AMDGPU/VOPInstructions.td llvm/lib/Target/AMDGPU/VOPInstructions.td index eb18cabe368c..d6ad01c8f9b3 100644 --- llvm/lib/Target/AMDGPU/VOPInstructions.td +++ llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -2019,6 +2019,10 @@ def : ClassPat<V_CMP_CLASS_F16_e64, f16> { let True16Predicate = NotHasTrue16BitInsts; } +def : ClassPat_t16<V_CMP_CLASS_F16_t16_e64, f16> { + let True16Predicate = UseRealTrue16Insts; +} + def : ClassPat_t16<V_CMP_CLASS_F16_fake16_e64, f16> { let True16Predicate = UseFakeTrue16Insts; } diff --git llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp index e66059c2a0e0..bb51d716311b 100644 --- llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp +++ llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp @@ -77,13 +77,6 @@ llvm::createARMWinCOFFStreamer(MCContext &Context, namespace { class ARMTargetWinCOFFStreamer : public llvm::ARMTargetStreamer { -private: - // True if we are processing SEH directives in an epilogue. - bool InEpilogCFI = false; - - // Symbol of the current epilog for which we are processing SEH directives. - MCSymbol *CurrentEpilog = nullptr; - public: ARMTargetWinCOFFStreamer(llvm::MCStreamer &S) : ARMTargetStreamer(S) {} @@ -114,8 +107,8 @@ void ARMTargetWinCOFFStreamer::emitARMWinUnwindCode(unsigned UnwindCode, return; MCSymbol *Label = S.emitCFILabel(); auto Inst = WinEH::Instruction(UnwindCode, Label, Reg, Offset); - if (InEpilogCFI) - CurFrame->EpilogMap[CurrentEpilog].Instructions.push_back(Inst); + if (S.isInEpilogCFI()) + CurFrame->EpilogMap[S.getCurrentEpilog()].Instructions.push_back(Inst); else CurFrame->Instructions.push_back(Inst); } @@ -224,9 +217,10 @@ void ARMTargetWinCOFFStreamer::emitARMWinCFIEpilogStart(unsigned Condition) { if (!CurFrame) return; - InEpilogCFI = true; - CurrentEpilog = S.emitCFILabel(); - CurFrame->EpilogMap[CurrentEpilog].Condition = Condition; + S.emitWinCFIBeginEpilogue(); + if (S.isInEpilogCFI()) { + CurFrame->EpilogMap[S.getCurrentEpilog()].Condition = Condition; + } } void ARMTargetWinCOFFStreamer::emitARMWinCFIEpilogEnd() { @@ -235,33 +229,26 @@ void ARMTargetWinCOFFStreamer::emitARMWinCFIEpilogEnd() { if (!CurFrame) return; - if (!CurrentEpilog) { - S.getContext().reportError(SMLoc(), "Stray .seh_endepilogue in " + - CurFrame->Function->getName()); - return; - } - - std::vector<WinEH::Instruction> &Epilog = - CurFrame->EpilogMap[CurrentEpilog].Instructions; - - unsigned UnwindCode = Win64EH::UOP_End; - if (!Epilog.empty()) { - WinEH::Instruction EndInstr = Epilog.back(); - if (EndInstr.Operation == Win64EH::UOP_Nop) { - UnwindCode = Win64EH::UOP_EndNop; - Epilog.pop_back(); - } else if (EndInstr.Operation == Win64EH::UOP_WideNop) { - UnwindCode = Win64EH::UOP_WideEndNop; - Epilog.pop_back(); + if (S.isInEpilogCFI()) { + std::vector<WinEH::Instruction> &Epilog = + CurFrame->EpilogMap[S.getCurrentEpilog()].Instructions; + + unsigned UnwindCode = Win64EH::UOP_End; + if (!Epilog.empty()) { + WinEH::Instruction EndInstr = Epilog.back(); + if (EndInstr.Operation == Win64EH::UOP_Nop) { + UnwindCode = Win64EH::UOP_EndNop; + Epilog.pop_back(); + } else if (EndInstr.Operation == Win64EH::UOP_WideNop) { + UnwindCode = Win64EH::UOP_WideEndNop; + Epilog.pop_back(); + } } - } - InEpilogCFI = false; - WinEH::Instruction Inst = WinEH::Instruction(UnwindCode, nullptr, -1, 0); - CurFrame->EpilogMap[CurrentEpilog].Instructions.push_back(Inst); - MCSymbol *Label = S.emitCFILabel(); - CurFrame->EpilogMap[CurrentEpilog].End = Label; - 
CurrentEpilog = nullptr; + WinEH::Instruction Inst = WinEH::Instruction(UnwindCode, nullptr, -1, 0); + CurFrame->EpilogMap[S.getCurrentEpilog()].Instructions.push_back(Inst); + } + S.emitWinCFIEndEpilogue(); } void ARMTargetWinCOFFStreamer::emitARMWinCFICustom(unsigned Opcode) { diff --git llvm/lib/Target/DirectX/DXILDataScalarization.cpp llvm/lib/Target/DirectX/DXILDataScalarization.cpp index 2ab2daaff5b5..a0dd17904f6f 100644 --- llvm/lib/Target/DirectX/DXILDataScalarization.cpp +++ llvm/lib/Target/DirectX/DXILDataScalarization.cpp @@ -91,7 +91,7 @@ bool DataScalarizerVisitor::visitLoadInst(LoadInst &LI) { if (CE && CE->getOpcode() == Instruction::GetElementPtr) { GetElementPtrInst *OldGEP = cast<GetElementPtrInst>(CE->getAsInstruction()); - OldGEP->insertBefore(&LI); + OldGEP->insertBefore(LI.getIterator()); IRBuilder<> Builder(&LI); LoadInst *NewLoad = Builder.CreateLoad(LI.getType(), OldGEP, LI.getName()); @@ -115,7 +115,7 @@ bool DataScalarizerVisitor::visitStoreInst(StoreInst &SI) { if (CE && CE->getOpcode() == Instruction::GetElementPtr) { GetElementPtrInst *OldGEP = cast<GetElementPtrInst>(CE->getAsInstruction()); - OldGEP->insertBefore(&SI); + OldGEP->insertBefore(SI.getIterator()); IRBuilder<> Builder(&SI); StoreInst *NewStore = Builder.CreateStore(SI.getValueOperand(), OldGEP); NewStore->setAlignment(SI.getAlign()); diff --git llvm/lib/Target/DirectX/DXILFlattenArrays.cpp llvm/lib/Target/DirectX/DXILFlattenArrays.cpp index 53fc1c713a8c..a3163a896964 100644 --- llvm/lib/Target/DirectX/DXILFlattenArrays.cpp +++ llvm/lib/Target/DirectX/DXILFlattenArrays.cpp @@ -164,7 +164,7 @@ bool DXILFlattenArraysVisitor::visitLoadInst(LoadInst &LI) { if (CE && CE->getOpcode() == Instruction::GetElementPtr) { GetElementPtrInst *OldGEP = cast<GetElementPtrInst>(CE->getAsInstruction()); - OldGEP->insertBefore(&LI); + OldGEP->insertBefore(LI.getIterator()); IRBuilder<> Builder(&LI); LoadInst *NewLoad = @@ -187,7 +187,7 @@ bool DXILFlattenArraysVisitor::visitStoreInst(StoreInst &SI) { if (CE && CE->getOpcode() == Instruction::GetElementPtr) { GetElementPtrInst *OldGEP = cast<GetElementPtrInst>(CE->getAsInstruction()); - OldGEP->insertBefore(&SI); + OldGEP->insertBefore(SI.getIterator()); IRBuilder<> Builder(&SI); StoreInst *NewStore = Builder.CreateStore(SI.getValueOperand(), OldGEP); diff --git llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp index d2ae2ef7bd7f..41803e66a0f8 100644 --- llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp +++ llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp @@ -1388,14 +1388,12 @@ bool PolynomialMultiplyRecognize::convertShiftsToLeft(BasicBlock *LoopB, CastMapType CastMap; - auto upcast = [] (CastMapType &CM, IRBuilder<> &IRB, Value *V, - IntegerType *Ty) -> Value* { - auto H = CM.find(std::make_pair(V, Ty)); - if (H != CM.end()) - return H->second; - Value *CV = IRB.CreateIntCast(V, Ty, false); - CM.insert(std::make_pair(std::make_pair(V, Ty), CV)); - return CV; + auto upcast = [](CastMapType &CM, IRBuilder<> &IRB, Value *V, + IntegerType *Ty) -> Value * { + auto [H, Inserted] = CM.try_emplace(std::make_pair(V, Ty)); + if (Inserted) + H->second = IRB.CreateIntCast(V, Ty, false); + return H->second; }; for (auto I = LoopB->begin(), E = LoopB->end(); I != E; ++I) { diff --git llvm/lib/Target/M68k/M68kExpandPseudo.cpp llvm/lib/Target/M68k/M68kExpandPseudo.cpp index c7fdd7d7c350..1ba265a60c3d 100644 --- llvm/lib/Target/M68k/M68kExpandPseudo.cpp +++ llvm/lib/Target/M68k/M68kExpandPseudo.cpp @@ 
-193,31 +193,23 @@ bool M68kExpandPseudo::ExpandMI(MachineBasicBlock &MBB, case M68k::MOV8dc: return TII->ExpandCCR(MIB, /*IsToCCR=*/false); - case M68k::MOVM8jm_P: - return TII->ExpandMOVEM(MIB, TII->get(M68k::MOVM32jm), /*IsRM=*/false); case M68k::MOVM16jm_P: - return TII->ExpandMOVEM(MIB, TII->get(M68k::MOVM32jm), /*IsRM=*/false); + return TII->ExpandMOVEM(MIB, TII->get(M68k::MOVM16jm), /*IsRM=*/false); case M68k::MOVM32jm_P: return TII->ExpandMOVEM(MIB, TII->get(M68k::MOVM32jm), /*IsRM=*/false); - case M68k::MOVM8pm_P: - return TII->ExpandMOVEM(MIB, TII->get(M68k::MOVM32pm), /*IsRM=*/false); case M68k::MOVM16pm_P: - return TII->ExpandMOVEM(MIB, TII->get(M68k::MOVM32pm), /*IsRM=*/false); + return TII->ExpandMOVEM(MIB, TII->get(M68k::MOVM16pm), /*IsRM=*/false); case M68k::MOVM32pm_P: return TII->ExpandMOVEM(MIB, TII->get(M68k::MOVM32pm), /*IsRM=*/false); - case M68k::MOVM8mj_P: - return TII->ExpandMOVEM(MIB, TII->get(M68k::MOVM32mj), /*IsRM=*/true); case M68k::MOVM16mj_P: - return TII->ExpandMOVEM(MIB, TII->get(M68k::MOVM32mj), /*IsRM=*/true); + return TII->ExpandMOVEM(MIB, TII->get(M68k::MOVM16mj), /*IsRM=*/true); case M68k::MOVM32mj_P: return TII->ExpandMOVEM(MIB, TII->get(M68k::MOVM32mj), /*IsRM=*/true); - case M68k::MOVM8mp_P: - return TII->ExpandMOVEM(MIB, TII->get(M68k::MOVM32mp), /*IsRM=*/true); case M68k::MOVM16mp_P: - return TII->ExpandMOVEM(MIB, TII->get(M68k::MOVM32mp), /*IsRM=*/true); + return TII->ExpandMOVEM(MIB, TII->get(M68k::MOVM16mp), /*IsRM=*/true); case M68k::MOVM32mp_P: return TII->ExpandMOVEM(MIB, TII->get(M68k::MOVM32mp), /*IsRM=*/true); diff --git llvm/lib/Target/M68k/M68kInstrData.td llvm/lib/Target/M68k/M68kInstrData.td index a7d7f1826f7f..398c55fa6da4 100644 --- llvm/lib/Target/M68k/M68kInstrData.td +++ llvm/lib/Target/M68k/M68kInstrData.td @@ -337,20 +337,16 @@ class MxMOVEM_RM_Pseudo<MxType TYPE, MxOperand MEMOp> : MxPseudo<(outs TYPE.ROp:$dst), (ins MEMOp:$src)>; // Mem <- Reg -def MOVM8jm_P : MxMOVEM_MR_Pseudo<MxType8d, MxType8.JOp>; def MOVM16jm_P : MxMOVEM_MR_Pseudo<MxType16r, MxType16.JOp>; def MOVM32jm_P : MxMOVEM_MR_Pseudo<MxType32r, MxType32.JOp>; -def MOVM8pm_P : MxMOVEM_MR_Pseudo<MxType8d, MxType8.POp>; def MOVM16pm_P : MxMOVEM_MR_Pseudo<MxType16r, MxType16.POp>; def MOVM32pm_P : MxMOVEM_MR_Pseudo<MxType32r, MxType32.POp>; // Reg <- Mem -def MOVM8mj_P : MxMOVEM_RM_Pseudo<MxType8d, MxType8.JOp>; def MOVM16mj_P : MxMOVEM_RM_Pseudo<MxType16r, MxType16.JOp>; def MOVM32mj_P : MxMOVEM_RM_Pseudo<MxType32r, MxType32.JOp>; -def MOVM8mp_P : MxMOVEM_RM_Pseudo<MxType8d, MxType8.POp>; def MOVM16mp_P : MxMOVEM_RM_Pseudo<MxType16r, MxType16.POp>; def MOVM32mp_P : MxMOVEM_RM_Pseudo<MxType32r, MxType32.POp>; diff --git llvm/lib/Target/M68k/M68kInstrInfo.cpp llvm/lib/Target/M68k/M68kInstrInfo.cpp index 182582642c50..febd020f3f2a 100644 --- llvm/lib/Target/M68k/M68kInstrInfo.cpp +++ llvm/lib/Target/M68k/M68kInstrInfo.cpp @@ -593,7 +593,6 @@ bool M68kInstrInfo::ExpandCCR(MachineInstrBuilder &MIB, bool IsToCCR) const { bool M68kInstrInfo::ExpandMOVEM(MachineInstrBuilder &MIB, const MCInstrDesc &Desc, bool IsRM) const { int Reg = 0, Offset = 0, Base = 0; - auto XR32 = RI.getRegClass(M68k::XR32RegClassID); auto DL = MIB->getDebugLoc(); auto MI = MIB.getInstr(); auto &MBB = *MIB->getParent(); @@ -608,13 +607,6 @@ bool M68kInstrInfo::ExpandMOVEM(MachineInstrBuilder &MIB, Reg = MIB->getOperand(2).getReg(); } - // If the register is not in XR32 then it is smaller than 32 bit, we - // implicitly promote it to 32 - if (!XR32->contains(Reg)) { - Reg = RI.getMatchingMegaReg(Reg, 
XR32); - assert(Reg && "Has not meaningful MEGA register"); - } - unsigned Mask = 1 << RI.getSpillRegisterOrder(Reg); if (IsRM) { BuildMI(MBB, MI, DL, Desc) @@ -799,22 +791,25 @@ namespace { unsigned getLoadStoreRegOpcode(unsigned Reg, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, const M68kSubtarget &STI, bool load) { - switch (TRI->getRegSizeInBits(*RC)) { + switch (TRI->getSpillSize(*RC)) { default: + LLVM_DEBUG( + dbgs() << "Cannot determine appropriate opcode for load/store to/from " + << TRI->getName(Reg) << " of class " << TRI->getRegClassName(RC) + << " with spill size " << TRI->getSpillSize(*RC) << '\n'); llvm_unreachable("Unknown spill size"); - case 8: + case 2: + if (M68k::XR16RegClass.hasSubClassEq(RC)) + return load ? M68k::MOVM16mp_P : M68k::MOVM16pm_P; if (M68k::DR8RegClass.hasSubClassEq(RC)) - return load ? M68k::MOV8dp : M68k::MOV8pd; + return load ? M68k::MOVM16mp_P : M68k::MOVM16pm_P; if (M68k::CCRCRegClass.hasSubClassEq(RC)) - return load ? M68k::MOV16cp : M68k::MOV16pc; - - llvm_unreachable("Unknown 1-byte regclass"); - case 16: - assert(M68k::XR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass"); - return load ? M68k::MOVM16mp_P : M68k::MOVM16pm_P; - case 32: - assert(M68k::XR32RegClass.hasSubClassEq(RC) && "Unknown 4-byte regclass"); - return load ? M68k::MOVM32mp_P : M68k::MOVM32pm_P; + return load ? M68k::MOVM16mp_P : M68k::MOVM16pm_P; + llvm_unreachable("Unknown 2-byte regclass"); + case 4: + if (M68k::XR32RegClass.hasSubClassEq(RC)) + return load ? M68k::MOVM32mp_P : M68k::MOVM32pm_P; + llvm_unreachable("Unknown 4-byte regclass"); } } diff --git llvm/lib/Target/M68k/M68kRegisterInfo.td llvm/lib/Target/M68k/M68kRegisterInfo.td index 45b492eba4ec..4942636ffd52 100644 --- llvm/lib/Target/M68k/M68kRegisterInfo.td +++ llvm/lib/Target/M68k/M68kRegisterInfo.td @@ -99,52 +99,77 @@ class MxRegClass<list<ValueType> regTypes, int alignment, dag regList> : RegisterClass<"M68k", regTypes, alignment, regList>; // Data Registers +let RegInfos = RegInfoByHwMode<[DefaultMode], [RegInfo<8,16,16>]> in def DR8 : MxRegClass<[i8], 16, (sequence "BD%u", 0, 7)>; +let RegInfos = RegInfoByHwMode<[DefaultMode], [RegInfo<16,16,16>]> in def DR16 : MxRegClass<[i16], 16, (sequence "WD%u", 0, 7)>; +let RegInfos = RegInfoByHwMode<[DefaultMode], [RegInfo<32,32,32>]> in def DR32 : MxRegClass<[i32], 32, (sequence "D%u", 0, 7)>; // Address Registers +let RegInfos = RegInfoByHwMode<[DefaultMode], [RegInfo<16,16,16>]> in def AR16 : MxRegClass<[i16], 16, (add (sequence "WA%u", 0, 6), WSP)>; +let RegInfos = RegInfoByHwMode<[DefaultMode], [RegInfo<32,32,32>]> in def AR32 : MxRegClass<[i32], 32, (add (sequence "A%u", 0, 6), SP)>; +let RegInfos = RegInfoByHwMode<[DefaultMode], [RegInfo<32,32,32>]> in def AR32_NOSP : MxRegClass<[i32], 32, (sequence "A%u", 0, 6)>; // Index Register Classes // FIXME try alternative ordering like `D0, D1, A0, A1, ...` +let RegInfos = RegInfoByHwMode<[DefaultMode], [RegInfo<16,16,16>]> in def XR16 : MxRegClass<[i16], 16, (add DR16, AR16)>; +let RegInfos = RegInfoByHwMode<[DefaultMode], [RegInfo<32,32,32>]> in def XR32 : MxRegClass<[i32], 32, (add DR32, AR32)>; +let RegInfos = RegInfoByHwMode<[DefaultMode], [RegInfo<32,32,32>]> in def SPC : MxRegClass<[i32], 32, (add SP)>; // Floating Point Data Registers +let RegInfos = RegInfoByHwMode<[DefaultMode], [RegInfo<32,32,32>]> in def FPDR32 : MxRegClass<[f32], 32, (sequence "FP%u", 0, 7)>; +let RegInfos = RegInfoByHwMode<[DefaultMode], [RegInfo<64,64,32>]> in def FPDR64 : MxRegClass<[f64], 32, (add FPDR32)>; 
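The M68k changes above key spill/reload opcode selection on the spill size from RegInfo<Size, SpillSize, SpillAlignment> rather than on the register size. A hedged mirror of the relevant entries (all values in bits), showing why 8-bit data registers now take the 16-bit MOVEM path and the 8-bit MOVEM pseudos could be dropped:

```cpp
// Illustrative mirror of the TableGen RegInfo entries; field order matches
// RegInfo<Size, SpillSize, SpillAlignment>.
struct RegInfoBits {
  unsigned Size, SpillSize, SpillAlignment;
};
constexpr RegInfoBits DR8Info{8, 16, 16};   // byte regs spill as 2-byte slots
constexpr RegInfoBits DR32Info{32, 32, 32}; // 32-bit regs spill as 4 bytes
```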
+let RegInfos = RegInfoByHwMode<[DefaultMode], [RegInfo<80,128,32>]> in def FPDR80 : MxRegClass<[f80], 32, (add FPDR32)>; let CopyCost = -1 in { + let RegInfos = RegInfoByHwMode<[DefaultMode], [RegInfo<8,16,16>]> in def CCRC : MxRegClass<[i8], 16, (add CCR)>; + let RegInfos = RegInfoByHwMode<[DefaultMode], [RegInfo<16,16,16>]> in def SRC : MxRegClass<[i16], 16, (add SR)>; // Float Point System Control Registers - def FPIC : MxRegClass<[i32], 32, (add FPIAR)>; - def FPCSC : MxRegClass<[i32], 32, (add FPC, FPS)>; - def FPSYSC : MxRegClass<[i32], 32, (add FPCSC, FPIC)>; + let RegInfos = RegInfoByHwMode<[DefaultMode], [RegInfo<32,32,32>]> in { + def FPIC : MxRegClass<[i32], 32, (add FPIAR)>; + def FPCSC : MxRegClass<[i32], 32, (add FPC, FPS)>; + def FPSYSC : MxRegClass<[i32], 32, (add FPCSC, FPIC)>; + } } let isAllocatable = 0 in { + let RegInfos = RegInfoByHwMode<[DefaultMode], [RegInfo<32,32,32>]> in def PCC : MxRegClass<[i32], 32, (add PC)>; } // Register used with tail call +let RegInfos = RegInfoByHwMode<[DefaultMode], [RegInfo<16,16,16>]> in def DR16_TC : MxRegClass<[i16], 16, (add D0, D1)>; +let RegInfos = RegInfoByHwMode<[DefaultMode], [RegInfo<32,32,32>]> in def DR32_TC : MxRegClass<[i32], 32, (add D0, D1)>; +let RegInfos = RegInfoByHwMode<[DefaultMode], [RegInfo<16,16,16>]> in def AR16_TC : MxRegClass<[i16], 16, (add A0, A1)>; +let RegInfos = RegInfoByHwMode<[DefaultMode], [RegInfo<32,32,32>]> in def AR32_TC : MxRegClass<[i32], 32, (add A0, A1)>; +let RegInfos = RegInfoByHwMode<[DefaultMode], [RegInfo<16,16,16>]> in def XR16_TC : MxRegClass<[i16], 16, (add DR16_TC, AR16_TC)>; +let RegInfos = RegInfoByHwMode<[DefaultMode], [RegInfo<32,32,32>]> in def XR32_TC : MxRegClass<[i32], 32, (add DR32_TC, AR32_TC)>; // These classes provide spill/restore order if used with MOVEM instruction -def SPILL : MxRegClass<[i32], 32, (add XR32)>; -def SPILL_R : MxRegClass<[i32], 32, (add SP, (sequence "A%u", 6, 0), (sequence "D%u", 7, 0))>; +let RegInfos = RegInfoByHwMode<[DefaultMode], [RegInfo<32,32,32>]> in { + def SPILL : MxRegClass<[i32], 32, (add XR32)>; + def SPILL_R : MxRegClass<[i32], 32, (add SP, (sequence "A%u", 6, 0), (sequence "D%u", 7, 0))>; +} diff --git llvm/lib/Target/Mips/MipsISelLowering.cpp llvm/lib/Target/Mips/MipsISelLowering.cpp index 7c4257c222c0..099a3f7dbc65 100644 --- llvm/lib/Target/Mips/MipsISelLowering.cpp +++ llvm/lib/Target/Mips/MipsISelLowering.cpp @@ -1138,13 +1138,21 @@ static SDValue performADDCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } + // When loading from a jump table, push the Lo node to the position that + // allows folding it into a load immediate. 
// (add v0, (add v1, abs_lo(tjt))) => (add (add v0, v1), abs_lo(tjt)) - SDValue Add = N->getOperand(1); - - if (Add.getOpcode() != ISD::ADD) + // (add (add abs_lo(tjt), v1), v0) => (add (add v0, v1), abs_lo(tjt)) + SDValue InnerAdd = N->getOperand(1); + SDValue Index = N->getOperand(0); + if (InnerAdd.getOpcode() != ISD::ADD) + std::swap(InnerAdd, Index); + if (InnerAdd.getOpcode() != ISD::ADD) return SDValue(); - SDValue Lo = Add.getOperand(1); + SDValue Lo = InnerAdd.getOperand(0); + SDValue Other = InnerAdd.getOperand(1); + if (Lo.getOpcode() != MipsISD::Lo) + std::swap(Lo, Other); if ((Lo.getOpcode() != MipsISD::Lo) || (Lo.getOperand(0).getOpcode() != ISD::TargetJumpTable)) @@ -1153,8 +1161,7 @@ static SDValue performADDCombine(SDNode *N, SelectionDAG &DAG, EVT ValTy = N->getValueType(0); SDLoc DL(N); - SDValue Add1 = DAG.getNode(ISD::ADD, DL, ValTy, N->getOperand(0), - Add.getOperand(0)); + SDValue Add1 = DAG.getNode(ISD::ADD, DL, ValTy, Index, Other); return DAG.getNode(ISD::ADD, DL, ValTy, Add1, Lo); } diff --git llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 773c97f7b4dc..9e7e1dbcea25 100644 --- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -286,7 +286,7 @@ static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, if (VT.isVector()) { unsigned NumElts = VT.getVectorNumElements(); EVT EltVT = VT.getVectorElementType(); - // We require power-of-2 sized vectors becuase + // We require power-of-2 sized vectors because // TargetLoweringBase::getVectorTypeBreakdown() which is invoked in // ComputePTXValueVTs() cannot currently break down non-power-of-2 sized // vectors. diff --git llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp index 3f1539da4a9c..6a42fdf3c356 100644 --- llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp +++ llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp @@ -128,8 +128,6 @@ private: int OpIdx) const; void renderImmPlus1(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const; - void renderImm(MachineInstrBuilder &MIB, const MachineInstr &MI, - int OpIdx) const; void renderFrameIndex(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const; @@ -837,15 +835,6 @@ void RISCVInstructionSelector::renderImmPlus1(MachineInstrBuilder &MIB, MIB.addImm(CstVal + 1); } -void RISCVInstructionSelector::renderImm(MachineInstrBuilder &MIB, - const MachineInstr &MI, - int OpIdx) const { - assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && - "Expected G_CONSTANT"); - int64_t CstVal = MI.getOperand(1).getCImm()->getSExtValue(); - MIB.addImm(CstVal); -} - void RISCVInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { diff --git llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 9855028ead9e..63864dd0e323 100644 --- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -3538,7 +3538,8 @@ bool RISCVDAGToDAGISel::selectVSplat(SDValue N, SDValue &SplatVal) { static bool selectVSplatImmHelper(SDValue N, SDValue &SplatVal, SelectionDAG &DAG, const RISCVSubtarget &Subtarget, - std::function<bool(int64_t)> ValidateImm) { + std::function<bool(int64_t)> ValidateImm, + bool Decrement = false) { SDValue Splat = findVSplat(N); if (!Splat || !isa<ConstantSDNode>(Splat.getOperand(1))) return false; @@ -3561,6 +3562,9 @@ static bool 
selectVSplatImmHelper(SDValue N, SDValue &SplatVal, if (!ValidateImm(SplatImm)) return false; + if (Decrement) + SplatImm -= 1; + SplatVal = DAG.getSignedTargetConstant(SplatImm, SDLoc(N), Subtarget.getXLenVT()); return true; @@ -3574,15 +3578,18 @@ bool RISCVDAGToDAGISel::selectVSplatSimm5(SDValue N, SDValue &SplatVal) { bool RISCVDAGToDAGISel::selectVSplatSimm5Plus1(SDValue N, SDValue &SplatVal) { return selectVSplatImmHelper( N, SplatVal, *CurDAG, *Subtarget, - [](int64_t Imm) { return (isInt<5>(Imm) && Imm != -16) || Imm == 16; }); + [](int64_t Imm) { return (isInt<5>(Imm) && Imm != -16) || Imm == 16; }, + /*Decrement=*/true); } bool RISCVDAGToDAGISel::selectVSplatSimm5Plus1NonZero(SDValue N, SDValue &SplatVal) { return selectVSplatImmHelper( - N, SplatVal, *CurDAG, *Subtarget, [](int64_t Imm) { + N, SplatVal, *CurDAG, *Subtarget, + [](int64_t Imm) { return Imm != 0 && ((isInt<5>(Imm) && Imm != -16) || Imm == 16); - }); + }, + /*Decrement=*/true); } bool RISCVDAGToDAGISel::selectVSplatUimm(SDValue N, unsigned Bits, diff --git llvm/lib/Target/RISCV/RISCVISelLowering.cpp llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 8d09e534b185..8e3caf51d876 100644 --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1526,18 +1526,16 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setTargetDAGCombine({ISD::ZERO_EXTEND, ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT}); if (Subtarget.hasVInstructions()) - setTargetDAGCombine({ISD::FCOPYSIGN, ISD::MGATHER, - ISD::MSCATTER, ISD::VP_GATHER, - ISD::VP_SCATTER, ISD::SRA, - ISD::SRL, ISD::SHL, - ISD::STORE, ISD::SPLAT_VECTOR, - ISD::BUILD_VECTOR, ISD::CONCAT_VECTORS, - ISD::VP_STORE, ISD::EXPERIMENTAL_VP_REVERSE, - ISD::MUL, ISD::SDIV, - ISD::UDIV, ISD::SREM, - ISD::UREM, ISD::INSERT_VECTOR_ELT, - ISD::ABS, ISD::CTPOP, - ISD::VECTOR_SHUFFLE, ISD::VSELECT}); + setTargetDAGCombine( + {ISD::FCOPYSIGN, ISD::MGATHER, ISD::MSCATTER, + ISD::VP_GATHER, ISD::VP_SCATTER, ISD::SRA, + ISD::SRL, ISD::SHL, ISD::STORE, + ISD::SPLAT_VECTOR, ISD::BUILD_VECTOR, ISD::CONCAT_VECTORS, + ISD::VP_STORE, ISD::VP_TRUNCATE, ISD::EXPERIMENTAL_VP_REVERSE, + ISD::MUL, ISD::SDIV, ISD::UDIV, + ISD::SREM, ISD::UREM, ISD::INSERT_VECTOR_ELT, + ISD::ABS, ISD::CTPOP, ISD::VECTOR_SHUFFLE, + ISD::VSELECT}); if (Subtarget.hasVendorXTHeadMemPair()) setTargetDAGCombine({ISD::LOAD, ISD::STORE}); @@ -4512,7 +4510,8 @@ static SDValue getSingleShuffleSrc(MVT VT, MVT ContainerVT, SDValue V1, // Src needs to have twice the number of elements. unsigned NumElts = VT.getVectorNumElements(); - if (Src.getValueType().getVectorNumElements() != (NumElts * 2)) + if (!Src.getValueType().isFixedLengthVector() || + Src.getValueType().getVectorNumElements() != (NumElts * 2)) return SDValue(); // The extracts must extract the two halves of the source. @@ -5603,6 +5602,23 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, if (SDValue V = lowerVECTOR_SHUFFLEAsRotate(SVN, DAG, Subtarget)) return V; + // Match a spread(4,8) which can be done via extend and shift. Spread(2) + // is fully covered in interleave(2) above, so it is ignored here. 
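The spread(4,8) comment above describes masks in which every Factor-th lane, starting at Index, reads consecutive source elements while all other lanes are undef. A hedged scalar model of that check (the real isSpreadMask lives elsewhere in the RISC-V backend; this is only an illustration of the mask shape):

#include <vector>

// Model of a spread shuffle mask check: lane i must read source element
// i / Factor when (i % Factor) == Index; every other lane is undef (-1).
bool isSpreadMaskModel(const std::vector<int> &Mask, unsigned Factor,
                       unsigned Index) {
  for (unsigned i = 0; i < Mask.size(); ++i) {
    if (i % Factor == Index) {
      if (Mask[i] != static_cast<int>(i / Factor))
        return false;
    } else if (Mask[i] != -1) {
      return false;
    }
  }
  return true;
}

int main() {
  // spread(4, 0) of a 2-element source into 8 lanes.
  std::vector<int> M = {0, -1, -1, -1, 1, -1, -1, -1};
  return isSpreadMaskModel(M, 4, 0) ? 0 : 1;
}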
+ if (VT.getScalarSizeInBits() < Subtarget.getELen()) { + unsigned MaxFactor = Subtarget.getELen() / VT.getScalarSizeInBits(); + assert(MaxFactor == 2 || MaxFactor == 4 || MaxFactor == 8); + for (unsigned Factor = 4; Factor <= MaxFactor; Factor <<= 1) { + unsigned Index; + if (isSpreadMask(Mask, Factor, Index)) { + MVT NarrowVT = + MVT::getVectorVT(VT.getVectorElementType(), NumElts / Factor); + SDValue Src = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowVT, V1, + DAG.getVectorIdxConstant(0, DL)); + return getWideningSpread(Src, Factor, Index, DL, DAG); + } + } + } + // Before hitting generic lowering fallbacks, try to widen the mask // to a wider SEW. if (SDValue V = tryWidenMaskForShuffle(Op, DAG)) @@ -5627,23 +5643,6 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, DAG.getUNDEF(VT)); } - // Match a spread(4,8) which can be done via extend and shift. Spread(2) - // is fully covered in interleave(2) above, so it is ignored here. - if (VT.getScalarSizeInBits() < Subtarget.getELen()) { - unsigned MaxFactor = Subtarget.getELen() / VT.getScalarSizeInBits(); - assert(MaxFactor == 2 || MaxFactor == 4 || MaxFactor == 8); - for (unsigned Factor = 4; Factor <= MaxFactor; Factor <<= 1) { - unsigned Index; - if (isSpreadMask(Mask, Factor, Index)) { - MVT NarrowVT = - MVT::getVectorVT(VT.getVectorElementType(), NumElts / Factor); - SDValue Src = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowVT, V1, - DAG.getVectorIdxConstant(0, DL)); - return getWideningSpread(Src, Factor, Index, DL, DAG); - } - } - } - if (VT.getScalarSizeInBits() == 8 && any_of(Mask, [&](const auto &Idx) { return Idx > 255; })) { // On such a vector we're unable to use i8 as the index type. @@ -16373,6 +16372,93 @@ static SDValue performVP_STORECombine(SDNode *N, SelectionDAG &DAG, VPStore->isTruncatingStore(), VPStore->isCompressingStore()); } +// Peephole avgceil pattern. +// %1 = zext <N x i8> %a to <N x i32> +// %2 = zext <N x i8> %b to <N x i32> +// %3 = add nuw nsw <N x i32> %1, splat (i32 1) +// %4 = add nuw nsw <N x i32> %3, %2 +// %5 = lshr <N x i32> %4, splat (i32 1) +// %6 = trunc <N x i32> %5 to <N x i8> +static SDValue performVP_TRUNCATECombine(SDNode *N, SelectionDAG &DAG, + const RISCVSubtarget &Subtarget) { + EVT VT = N->getValueType(0); + + // Ignore fixed vectors. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!VT.isScalableVector() || !TLI.isTypeLegal(VT)) + return SDValue(); + + SDValue In = N->getOperand(0); + SDValue Mask = N->getOperand(1); + SDValue VL = N->getOperand(2); + + // Input should be a vp_srl with same mask and VL. + if (In.getOpcode() != ISD::VP_SRL || In.getOperand(2) != Mask || + In.getOperand(3) != VL) + return SDValue(); + + // Shift amount should be 1. + if (!isOneOrOneSplat(In.getOperand(1))) + return SDValue(); + + // Shifted value should be a vp_add with same mask and VL. + SDValue LHS = In.getOperand(0); + if (LHS.getOpcode() != ISD::VP_ADD || LHS.getOperand(2) != Mask || + LHS.getOperand(3) != VL) + return SDValue(); + + SDValue Operands[3]; + + // Matches another VP_ADD with same VL and Mask. + auto FindAdd = [&](SDValue V, SDValue Other) { + if (V.getOpcode() != ISD::VP_ADD || V.getOperand(2) != Mask || + V.getOperand(3) != VL) + return false; + + Operands[0] = Other; + Operands[1] = V.getOperand(1); + Operands[2] = V.getOperand(0); + return true; + }; + + // We need to find another VP_ADD in one of the operands. 
+ SDValue LHS0 = LHS.getOperand(0); + SDValue LHS1 = LHS.getOperand(1); + if (!FindAdd(LHS0, LHS1) && !FindAdd(LHS1, LHS0)) + return SDValue(); + + // Now we have three operands of two additions. Check that one of them is a + // constant vector with ones. + auto I = llvm::find_if(Operands, + [](const SDValue &Op) { return isOneOrOneSplat(Op); }); + if (I == std::end(Operands)) + return SDValue(); + // We found a vector with ones, move it to the end of the Operands array. + std::swap(*I, Operands[2]); + + // Make sure the other 2 operands can be promoted from the result type. + for (SDValue Op : drop_end(Operands)) { + if (Op.getOpcode() != ISD::VP_ZERO_EXTEND || Op.getOperand(1) != Mask || + Op.getOperand(2) != VL) + return SDValue(); + // Input must be the same size or smaller than our result. + if (Op.getOperand(0).getScalarValueSizeInBits() > VT.getScalarSizeInBits()) + return SDValue(); + } + + // Pattern is detected. + // Rebuild the zero extends in case the inputs are smaller than our result. + SDValue NewOp0 = DAG.getNode(ISD::VP_ZERO_EXTEND, SDLoc(Operands[0]), VT, + Operands[0].getOperand(0), Mask, VL); + SDValue NewOp1 = DAG.getNode(ISD::VP_ZERO_EXTEND, SDLoc(Operands[1]), VT, + Operands[1].getOperand(0), Mask, VL); + // Build an AVGCEILU_VL which will be selected as a VAADDU with RNU rounding + // mode. + SDLoc DL(N); + return DAG.getNode(RISCVISD::AVGCEILU_VL, DL, VT, + {NewOp0, NewOp1, DAG.getUNDEF(VT), Mask, VL}); +} + // Convert from one FMA opcode to another based on whether we are negating the // multiply result and/or the accumulator. // NOTE: Only supports RVV operations with VL. @@ -17930,6 +18016,8 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, if (SDValue V = combineTruncOfSraSext(N, DAG)) return V; return combineTruncToVnclip(N, DAG, Subtarget); + case ISD::VP_TRUNCATE: + return performVP_TRUNCATECombine(N, DAG, Subtarget); case ISD::TRUNCATE: return performTRUNCATECombine(N, DAG, Subtarget); case ISD::SELECT: diff --git llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td index 880ea0ae0a97..8f77b2ce34d1 100644 --- llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td +++ llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td @@ -313,29 +313,10 @@ multiclass VPatIntegerSetCCSDNode_VX_Swappable<string instruction_name, SplatPat, GPR>; multiclass VPatIntegerSetCCSDNode_VI_Swappable<string instruction_name, - CondCode cc, CondCode invcc> + CondCode cc, CondCode invcc, + ComplexPattern splatpat_kind = SplatPat_simm5> : VPatIntegerSetCCSDNode_XI_Swappable<instruction_name, cc, invcc, "VI", - SplatPat_simm5, simm5>; - -multiclass VPatIntegerSetCCSDNode_VIPlus1_Swappable<string instruction_name, - CondCode cc, CondCode invcc, - ComplexPattern splatpat_kind> { - foreach vti = AllIntegerVectors in { - defvar instruction = !cast<Instruction>(instruction_name#"_VI_"#vti.LMul.MX); - let Predicates = GetVTypePredicates<vti>.Predicates in { - def : Pat<(vti.Mask (setcc (vti.Vector vti.RegClass:$rs1), - (vti.Vector (splatpat_kind simm5:$rs2)), - cc)), - (instruction vti.RegClass:$rs1, (DecImm simm5:$rs2), - vti.AVL, vti.Log2SEW)>; - def : Pat<(vti.Mask (setcc (vti.Vector (splatpat_kind simm5:$rs2)), - (vti.Vector vti.RegClass:$rs1), - invcc)), - (instruction vti.RegClass:$rs1, (DecImm simm5:$rs2), - vti.AVL, vti.Log2SEW)>; - } - } -} + splatpat_kind, simm5>; multiclass VPatFPSetCCSDNode_VV_VF_FV<CondCode cc, string inst_name, @@ -1021,14 +1002,14 @@ defm : VPatIntegerSetCCSDNode_VI_Swappable<"PseudoVMSLEU", SETULE, SETUGE>;
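The performVP_TRUNCATECombine added above recognizes the rounding-average idiom; plain scalar arithmetic shows why vaaddu with RNU rounding is a safe replacement once both inputs are zero-extended wide enough that a + b + 1 cannot wrap. A quick exhaustive self-check in standalone C++:

#include <cassert>
#include <cstdint>

// avgceil(a, b) == (a + b + 1) >> 1, computed without overflow by widening,
// which is exactly what the matched zext/add/add-1/lshr/trunc chain does.
uint8_t avgCeilU8(uint8_t A, uint8_t B) {
  uint32_t Wide = (uint32_t)A + (uint32_t)B + 1; // never wraps in 32 bits
  return (uint8_t)(Wide >> 1);
}

int main() {
  for (unsigned A = 0; A < 256; ++A)
    for (unsigned B = 0; B < 256; ++B) {
      unsigned Expect = (A + B + 1) / 2;
      assert(avgCeilU8((uint8_t)A, (uint8_t)B) == (uint8_t)Expect);
    }
  return 0;
}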
defm : VPatIntegerSetCCSDNode_VI_Swappable<"PseudoVMSGT", SETGT, SETLT>; defm : VPatIntegerSetCCSDNode_VI_Swappable<"PseudoVMSGTU", SETUGT, SETULT>; -defm : VPatIntegerSetCCSDNode_VIPlus1_Swappable<"PseudoVMSLE", SETLT, SETGT, - SplatPat_simm5_plus1>; -defm : VPatIntegerSetCCSDNode_VIPlus1_Swappable<"PseudoVMSLEU", SETULT, SETUGT, - SplatPat_simm5_plus1_nonzero>; -defm : VPatIntegerSetCCSDNode_VIPlus1_Swappable<"PseudoVMSGT", SETGE, SETLE, - SplatPat_simm5_plus1>; -defm : VPatIntegerSetCCSDNode_VIPlus1_Swappable<"PseudoVMSGTU", SETUGE, SETULE, - SplatPat_simm5_plus1_nonzero>; +defm : VPatIntegerSetCCSDNode_VI_Swappable<"PseudoVMSLE", SETLT, SETGT, + SplatPat_simm5_plus1>; +defm : VPatIntegerSetCCSDNode_VI_Swappable<"PseudoVMSLEU", SETULT, SETUGT, + SplatPat_simm5_plus1_nonzero>; +defm : VPatIntegerSetCCSDNode_VI_Swappable<"PseudoVMSGT", SETGE, SETLE, + SplatPat_simm5_plus1>; +defm : VPatIntegerSetCCSDNode_VI_Swappable<"PseudoVMSGTU", SETUGE, SETULE, + SplatPat_simm5_plus1_nonzero>; // 11.9. Vector Integer Min/Max Instructions defm : VPatBinarySDNode_VV_VX<umin, "PseudoVMINU">; diff --git llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td index 2026ba79e623..f35dc6eb2cb8 100644 --- llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td +++ llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td @@ -1052,32 +1052,8 @@ multiclass VPatIntegerSetCCVL_VX_Swappable<VTypeInfo vti, string instruction_nam } multiclass VPatIntegerSetCCVL_VI_Swappable<VTypeInfo vti, string instruction_name, - CondCode cc, CondCode invcc> { - defvar instruction_masked = !cast<Instruction>(instruction_name#"_VI_"#vti.LMul.MX#"_MASK"); - def : Pat<(vti.Mask (riscv_setcc_vl (vti.Vector vti.RegClass:$rs1), - (SplatPat_simm5 simm5:$rs2), cc, - VR:$passthru, - (vti.Mask V0), - VLOpFrag)), - (instruction_masked VR:$passthru, vti.RegClass:$rs1, - XLenVT:$rs2, (vti.Mask V0), GPR:$vl, - vti.Log2SEW)>; - - // FIXME: Can do some canonicalization to remove these patterns. - def : Pat<(vti.Mask (riscv_setcc_vl (SplatPat_simm5 simm5:$rs2), - (vti.Vector vti.RegClass:$rs1), invcc, - VR:$passthru, - (vti.Mask V0), - VLOpFrag)), - (instruction_masked VR:$passthru, vti.RegClass:$rs1, - simm5:$rs2, (vti.Mask V0), GPR:$vl, - vti.Log2SEW)>; -} - -multiclass VPatIntegerSetCCVL_VIPlus1_Swappable<VTypeInfo vti, - string instruction_name, - CondCode cc, CondCode invcc, - ComplexPattern splatpat_kind> { + CondCode cc, CondCode invcc, + ComplexPattern splatpat_kind = SplatPat_simm5> { defvar instruction_masked = !cast<Instruction>(instruction_name#"_VI_"#vti.LMul.MX#"_MASK"); def : Pat<(vti.Mask (riscv_setcc_vl (vti.Vector vti.RegClass:$rs1), (splatpat_kind simm5:$rs2), cc, @@ -1085,7 +1061,7 @@ multiclass VPatIntegerSetCCVL_VIPlus1_Swappable<VTypeInfo vti, (vti.Mask V0), VLOpFrag)), (instruction_masked VR:$passthru, vti.RegClass:$rs1, - (DecImm simm5:$rs2), (vti.Mask V0), GPR:$vl, + XLenVT:$rs2, (vti.Mask V0), GPR:$vl, vti.Log2SEW)>; // FIXME: Can do some canonicalization to remove these patterns. 
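With VPatIntegerSetCCSDNode_VIPlus1_Swappable folded into the defaulted-parameter form above, the (DecImm simm5:$rs2) transform disappears from the patterns: the subtraction now happens exactly once, inside selectVSplatImmHelper, and only after the immediate has been validated in its original "plus 1" form. A plain-C++ sketch of that validate-then-decrement contract, with illustrative names only:

#include <cstdint>
#include <functional>
#include <optional>

// Validate Imm against the "simm5 plus 1" rule first, then decrement, as the
// patched selectVSplatImmHelper does. Decrementing before validation would
// accept or reject the wrong boundary values (e.g. 16 vs -16).
std::optional<int64_t>
matchSplatImm(int64_t Imm, const std::function<bool(int64_t)> &Validate,
              bool Decrement) {
  if (!Validate(Imm))
    return std::nullopt;
  if (Decrement)
    Imm -= 1;
  return Imm;
}

int main() {
  auto Simm5Plus1 = [](int64_t I) { return (I > -16 && I <= 15) || I == 16; };
  // 16 is valid as "15 + 1" and materializes as 15 after the decrement.
  auto R = matchSplatImm(16, Simm5Plus1, /*Decrement=*/true);
  return (R && *R == 15) ? 0 : 1;
}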
@@ -1095,7 +1071,7 @@ multiclass VPatIntegerSetCCVL_VIPlus1_Swappable<VTypeInfo vti, (vti.Mask V0), VLOpFrag)), (instruction_masked VR:$passthru, vti.RegClass:$rs1, - (DecImm simm5:$rs2), (vti.Mask V0), GPR:$vl, + simm5:$rs2, (vti.Mask V0), GPR:$vl, vti.Log2SEW)>; } @@ -2173,14 +2149,14 @@ foreach vti = AllIntegerVectors in { defm : VPatIntegerSetCCVL_VI_Swappable<vti, "PseudoVMSGT", SETGT, SETLT>; defm : VPatIntegerSetCCVL_VI_Swappable<vti, "PseudoVMSGTU", SETUGT, SETULT>; - defm : VPatIntegerSetCCVL_VIPlus1_Swappable<vti, "PseudoVMSLE", SETLT, SETGT, - SplatPat_simm5_plus1>; - defm : VPatIntegerSetCCVL_VIPlus1_Swappable<vti, "PseudoVMSLEU", SETULT, SETUGT, - SplatPat_simm5_plus1_nonzero>; - defm : VPatIntegerSetCCVL_VIPlus1_Swappable<vti, "PseudoVMSGT", SETGE, SETLE, - SplatPat_simm5_plus1>; - defm : VPatIntegerSetCCVL_VIPlus1_Swappable<vti, "PseudoVMSGTU", SETUGE, SETULE, - SplatPat_simm5_plus1_nonzero>; + defm : VPatIntegerSetCCVL_VI_Swappable<vti, "PseudoVMSLE", SETLT, SETGT, + SplatPat_simm5_plus1>; + defm : VPatIntegerSetCCVL_VI_Swappable<vti, "PseudoVMSLEU", SETULT, SETUGT, + SplatPat_simm5_plus1_nonzero>; + defm : VPatIntegerSetCCVL_VI_Swappable<vti, "PseudoVMSGT", SETGE, SETLE, + SplatPat_simm5_plus1>; + defm : VPatIntegerSetCCVL_VI_Swappable<vti, "PseudoVMSGTU", SETUGE, SETULE, + SplatPat_simm5_plus1_nonzero>; } } // foreach vti = AllIntegerVectors diff --git llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td index 281829e99cc5..9be424310d66 100644 --- llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td +++ llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td @@ -109,61 +109,41 @@ class SWPFormat<dag outs, dag ins, string opcodestr, string argstr> let Predicates = [HasVendorXMIPSCMove], hasSideEffects = 0, mayLoad = 0, mayStore = 0, DecoderNamespace = "Xmipscmove" in { -def CCMOV : RVInstR4<0b11, 0b011, OPC_CUSTOM_0, (outs GPR:$rd), - (ins GPR:$rs1, GPR:$rs2, GPR:$rs3), - "mips.ccmov", "$rd, $rs2, $rs1, $rs3">, - Sched<[]>; +def MIPS_CCMOV : RVInstR4<0b11, 0b011, OPC_CUSTOM_0, (outs GPR:$rd), + (ins GPR:$rs1, GPR:$rs2, GPR:$rs3), + "mips.ccmov", "$rd, $rs2, $rs1, $rs3">, + Sched<[]>; } let Predicates = [UseCCMovInsn] in { -def : Pat<(select (XLenVT (setne (XLenVT GPR:$rs2), (XLenVT 0))), +def : Pat<(select (riscv_setne (XLenVT GPR:$rs2)), (XLenVT GPR:$rs1), (XLenVT GPR:$rs3)), - (CCMOV GPR:$rs1, GPR:$rs2, GPR:$rs3)>; -def : Pat<(select (XLenVT (setne (XLenVT GPR:$x), (XLenVT simm12_plus1:$y))), - (XLenVT GPR:$rs1), (XLenVT GPR:$rs3)), - (CCMOV GPR:$rs1, (ADDI GPR:$x, (NegImm simm12_plus1:$y)), GPR:$rs3)>; -def : Pat<(select (XLenVT (setne (XLenVT GPR:$x), (XLenVT GPR:$y))), - (XLenVT GPR:$rs1), (XLenVT GPR:$rs3)), - (CCMOV GPR:$rs1, (XOR GPR:$x, GPR:$y), GPR:$rs3)>; -def : Pat<(select (XLenVT (seteq (XLenVT GPR:$rs2), (XLenVT 0))), - (XLenVT GPR:$rs3), (XLenVT GPR:$rs1)), - (CCMOV GPR:$rs1, GPR:$rs2, GPR:$rs3)>; -def : Pat<(select (XLenVT (seteq (XLenVT GPR:$x), (XLenVT simm12_plus1:$y))), + (MIPS_CCMOV GPR:$rs1, GPR:$rs2, GPR:$rs3)>; +def : Pat<(select (riscv_seteq (XLenVT GPR:$rs2)), (XLenVT GPR:$rs3), (XLenVT GPR:$rs1)), - (CCMOV GPR:$rs1, (ADDI GPR:$x, (NegImm simm12_plus1:$y)), GPR:$rs3)>; -def : Pat<(select (XLenVT (seteq (XLenVT GPR:$x), (XLenVT GPR:$y))), - (XLenVT GPR:$rs3), (XLenVT GPR:$rs1)), - (CCMOV GPR:$rs1, (XOR GPR:$x, GPR:$y), GPR:$rs3)>; + (MIPS_CCMOV GPR:$rs1, GPR:$rs2, GPR:$rs3)>; + def : Pat<(select (XLenVT GPR:$rs2), (XLenVT GPR:$rs1), (XLenVT GPR:$rs3)), - (CCMOV GPR:$rs1, GPR:$rs2, GPR:$rs3)>; + (MIPS_CCMOV GPR:$rs1, GPR:$rs2, 
GPR:$rs3)>; } let Predicates = [HasVendorXMIPSLSP], hasSideEffects = 0, DecoderNamespace = "Xmipslsp" in { - -def LWP : LWPFormat<(outs GPR:$rd1, GPR:$rd2), (ins GPR:$rs1, uimm7_lsb00:$imm7), - "mips.lwp", "$rd1, $rd2, ${imm7}(${rs1})">, - Sched<[WriteLDW, WriteLDW, ReadMemBase]> { - let mayLoad = 1; - let mayStore = 0; -} -def LDP : LDPFormat<(outs GPR:$rd1, GPR:$rd2), (ins GPR:$rs1, uimm7_lsb000:$imm7), - "mips.ldp", "$rd1, $rd2, ${imm7}(${rs1})">, - Sched<[WriteLDD, WriteLDD, ReadMemBase]> { - let mayLoad = 1; - let mayStore = 0; -} -def SWP : SWPFormat<(outs), (ins GPR:$rs2, GPR:$rs3, GPR:$rs1, uimm7_lsb00:$imm7), - "mips.swp", "$rs2, $rs3, ${imm7}(${rs1})">, - Sched<[WriteSTW, ReadStoreData, ReadStoreData, ReadMemBase]> { - let mayLoad = 0; - let mayStore = 1; -} -def SDP : SDPFormat<(outs), (ins GPR:$rs2, GPR:$rs3, GPR:$rs1, uimm7_lsb000:$imm7), - "mips.sdp", "$rs2, $rs3, ${imm7}(${rs1})">, - Sched<[WriteSTD, ReadStoreData, ReadStoreData, ReadMemBase]> { - let mayLoad = 0; - let mayStore = 1; -} - -} +let mayLoad = 1, mayStore = 0 in { +def MIPS_LWP : LWPFormat<(outs GPR:$rd1, GPR:$rd2), (ins GPR:$rs1, uimm7_lsb00:$imm7), + "mips.lwp", "$rd1, $rd2, ${imm7}(${rs1})">, + Sched<[WriteLDW, WriteLDW, ReadMemBase]>; +def MIPS_LDP : LDPFormat<(outs GPR:$rd1, GPR:$rd2), (ins GPR:$rs1, uimm7_lsb000:$imm7), + "mips.ldp", "$rd1, $rd2, ${imm7}(${rs1})">, + Sched<[WriteLDD, WriteLDD, ReadMemBase]>; +} // mayLoad = 1, mayStore = 0 + +let mayLoad = 0, mayStore = 1 in { +def MIPS_SWP : SWPFormat<(outs), (ins GPR:$rs2, GPR:$rs3, GPR:$rs1, uimm7_lsb00:$imm7), + "mips.swp", "$rs2, $rs3, ${imm7}(${rs1})">, + Sched<[WriteSTW, ReadStoreData, ReadStoreData, ReadMemBase]>; +def MIPS_SDP : SDPFormat<(outs), (ins GPR:$rs2, GPR:$rs3, GPR:$rs1, uimm7_lsb000:$imm7), + "mips.sdp", "$rs2, $rs3, ${imm7}(${rs1})">, + Sched<[WriteSTD, ReadStoreData, ReadStoreData, ReadMemBase]>; +} // mayLoad = 0, mayStore = 1 +} // Predicates = [HasVendorXMIPSLSP], hasSideEffects = 0, DecoderNamespace = "Xmipslsp" diff --git llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp index effec2cc776d..28bee8383765 100644 --- llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp +++ llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp @@ -326,6 +326,7 @@ static bool hasAllNBitUsers(const MachineInstr &OrigMI, break; case RISCV::PseudoCCMOVGPR: + case RISCV::PseudoCCMOVGPRNoX0: // Either operand 4 or operand 5 is returned by this instruction. If // only the lower word of the result is used, then only the lower word // of operand 4 and 5 is used. @@ -538,6 +539,7 @@ static bool isSignExtendedW(Register SrcReg, const RISCVSubtarget &ST, case RISCV::MIN: case RISCV::MINU: case RISCV::PseudoCCMOVGPR: + case RISCV::PseudoCCMOVGPRNoX0: case RISCV::PseudoCCAND: case RISCV::PseudoCCOR: case RISCV::PseudoCCXOR: @@ -546,7 +548,7 @@ static bool isSignExtendedW(Register SrcReg, const RISCVSubtarget &ST, // MIN, MAX, or PHI is also sign-extended. // The input registers for PHI are operand 1, 3, ... - // The input registers for PseudoCCMOVGPR are 4 and 5. + // The input registers for PseudoCCMOVGPR(NoX0) are 4 and 5. // The input registers for PseudoCCAND/OR/XOR are 4, 5, and 6. // The input registers for others are operand 1 and 2. 
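The rewritten patterns above all funnel into the same conditional move: mips.ccmov selects between rs1 and rs3 on whether rs2 is nonzero. That semantics is also why the RISCVOptWInstrs changes nearby only track operands 4 and 5 (the two selected values) of the CCMOV-style pseudos when reasoning about sign extension; the condition never flows into the result. A scalar model:

#include <cassert>
#include <cstdint>

// mips.ccmov rd, rs2, rs1, rs3: rd = (rs2 != 0) ? rs1 : rs3.
// The condition value rs2 never flows into rd, so only the two selected
// operands matter for sign-extension tracking.
uint64_t ccmov(uint64_t Rs2, uint64_t Rs1, uint64_t Rs3) {
  return Rs2 != 0 ? Rs1 : Rs3;
}

int main() {
  assert(ccmov(1, 10, 20) == 10);
  assert(ccmov(0, 10, 20) == 20);
  return 0;
}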
unsigned B = 1, E = 3, D = 1; @@ -556,6 +558,7 @@ static bool isSignExtendedW(Register SrcReg, const RISCVSubtarget &ST, D = 2; break; case RISCV::PseudoCCMOVGPR: + case RISCV::PseudoCCMOVGPRNoX0: B = 4; E = 6; break; diff --git llvm/lib/Target/RISCV/RISCVSystemOperands.td llvm/lib/Target/RISCV/RISCVSystemOperands.td index 4c86103db99d..cabcb9eda06b 100644 --- llvm/lib/Target/RISCV/RISCVSystemOperands.td +++ llvm/lib/Target/RISCV/RISCVSystemOperands.td @@ -19,6 +19,9 @@ include "llvm/TableGen/SearchableTable.td" class SysReg<string name, bits<12> op> { string Name = name; + // Custom vendor CSRs have a "<vendor>." prefix. Convert these to "<vendor>_" + // before passing it to the SysRegEncodings GenericEnum below. + string EnumName = !subst(".", "_", name); bits<12> Encoding = op; // FIXME: add these additional fields when needed. // Privilege Access: Read and Write = 0, 1, 2; Read-Only = 3. @@ -50,7 +53,7 @@ def SysRegsList : GenericTable { def SysRegEncodings : GenericEnum { let FilterClass = "SysReg"; - let NameField = "Name"; + let NameField = "EnumName"; let ValueField = "Encoding"; } diff --git llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index 9b364391f0fa..042530b9cd2b 100644 --- llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -48,6 +48,14 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> { /// actual target hardware. unsigned getEstimatedVLFor(VectorType *Ty); + /// This function calculates the costs for one or more RVV opcodes based + /// on the vtype and the cost kind. + /// \param Opcodes A list of opcodes of the RVV instruction to evaluate. + /// \param VT The MVT of vtype associated with the RVV instructions. + /// For widening/narrowing instructions where the result and source types + /// differ, it is important to check the spec to determine whether the vtype + /// refers to the result or source type. + /// \param CostKind The type of cost to compute. InstructionCost getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT, TTI::TargetCostKind CostKind); diff --git llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp index 6c4d1b48b6ff..0960245b8362 100644 --- llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp +++ llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp @@ -62,25 +62,6 @@ private: DenseMap<const MachineInstr *, std::optional<MachineOperand>> DemandedVLs; }; -} // end anonymous namespace - -char RISCVVLOptimizer::ID = 0; -INITIALIZE_PASS_BEGIN(RISCVVLOptimizer, DEBUG_TYPE, PASS_NAME, false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) -INITIALIZE_PASS_END(RISCVVLOptimizer, DEBUG_TYPE, PASS_NAME, false, false) - -FunctionPass *llvm::createRISCVVLOptimizerPass() { - return new RISCVVLOptimizer(); -} - -/// Return true if R is a physical or virtual vector register, false otherwise. -static bool isVectorRegClass(Register R, const MachineRegisterInfo *MRI) { - if (R.isPhysical()) - return RISCV::VRRegClass.contains(R); - const TargetRegisterClass *RC = MRI->getRegClass(R); - return RISCVRI::isVRegClass(RC->TSFlags); -} - /// Represents the EMUL and EEW of a MachineOperand. struct OperandInfo { // Represent as 1,2,4,8, ... and fractional indicator. 
This is because @@ -121,6 +102,25 @@ struct OperandInfo { } }; +} // end anonymous namespace + +char RISCVVLOptimizer::ID = 0; +INITIALIZE_PASS_BEGIN(RISCVVLOptimizer, DEBUG_TYPE, PASS_NAME, false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) +INITIALIZE_PASS_END(RISCVVLOptimizer, DEBUG_TYPE, PASS_NAME, false, false) + +FunctionPass *llvm::createRISCVVLOptimizerPass() { + return new RISCVVLOptimizer(); +} + +/// Return true if R is a physical or virtual vector register, false otherwise. +static bool isVectorRegClass(Register R, const MachineRegisterInfo *MRI) { + if (R.isPhysical()) + return RISCV::VRRegClass.contains(R); + const TargetRegisterClass *RC = MRI->getRegClass(R); + return RISCVRI::isVRegClass(RC->TSFlags); +} + LLVM_ATTRIBUTE_UNUSED static raw_ostream &operator<<(raw_ostream &OS, const OperandInfo &OI) { OI.print(OS); @@ -137,8 +137,6 @@ static raw_ostream &operator<<(raw_ostream &OS, return OS; } -namespace llvm { -namespace RISCVVType { /// Return EMUL = (EEW / SEW) * LMUL where EEW comes from Log2EEW and LMUL and /// SEW are from the TSFlags of MI. static std::pair<unsigned, bool> @@ -165,8 +163,6 @@ getEMULEqualsEEWDivSEWTimesLMUL(unsigned Log2EEW, const MachineInstr &MI) { Denom = MILMULIsFractional ? Denom * MILMUL / GCD : Denom / GCD; return std::make_pair(Num > Denom ? Num : Denom, Denom > Num); } -} // end namespace RISCVVType -} // end namespace llvm /// Dest has EEW=SEW. Source EEW=SEW/Factor (i.e. F2 => EEW/2). /// SEW comes from TSFlags of MI. @@ -770,8 +766,7 @@ getOperandInfo(const MachineOperand &MO, const MachineRegisterInfo *MRI) { }; // All others have EMUL=EEW/SEW*LMUL - return OperandInfo(RISCVVType::getEMULEqualsEEWDivSEWTimesLMUL(*Log2EEW, MI), - *Log2EEW); + return OperandInfo(getEMULEqualsEEWDivSEWTimesLMUL(*Log2EEW, MI), *Log2EEW); } /// Return true if this optimization should consider MI for VL reduction. This @@ -1188,6 +1183,25 @@ RISCVVLOptimizer::getMinimumVLForUser(MachineOperand &UserOp) { return std::nullopt; } + unsigned VLOpNum = RISCVII::getVLOpNum(Desc); + const MachineOperand &VLOp = UserMI.getOperand(VLOpNum); + // Looking for an immediate or a register VL that isn't X0. + assert((!VLOp.isReg() || VLOp.getReg() != RISCV::X0) && + "Did not expect X0 VL"); + + // If the user is a passthru it will read the elements past VL, so + // abort if any of the elements past VL are demanded. + if (UserOp.isTied()) { + assert(UserOp.getOperandNo() == UserMI.getNumExplicitDefs() && + RISCVII::isFirstDefTiedToFirstUse(UserMI.getDesc())); + auto DemandedVL = DemandedVLs[&UserMI]; + if (!DemandedVL || !RISCV::isVLKnownLE(*DemandedVL, VLOp)) { + LLVM_DEBUG(dbgs() << " Abort because user is passthru in " + "instruction with demanded tail\n"); + return std::nullopt; + } + } + // Instructions like reductions may use a vector register as a scalar // register. In this case, we should treat it as only reading the first lane. if (isVectorOpUsedAsScalarOp(UserOp)) { @@ -1200,12 +1214,6 @@ RISCVVLOptimizer::getMinimumVLForUser(MachineOperand &UserOp) { return MachineOperand::CreateImm(1); } - unsigned VLOpNum = RISCVII::getVLOpNum(Desc); - const MachineOperand &VLOp = UserMI.getOperand(VLOpNum); - // Looking for an immediate or a register VL that isn't X0. - assert((!VLOp.isReg() || VLOp.getReg() != RISCV::X0) && - "Did not expect X0 VL"); - // If we know the demanded VL of UserMI, then we can reduce the VL it // requires. 
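getEMULEqualsEEWDivSEWTimesLMUL, which the hunk above moves out of the llvm::RISCVVType namespace, evaluates EMUL = (EEW / SEW) * LMUL as a reduced fraction so that fractional LMULs stay exact. A standalone rendering of that arithmetic, assuming power-of-two inputs as RVV guarantees (this is a simplified model, not the pass's exact code):

#include <cstdio>
#include <numeric>
#include <utility>

// EMUL = (EEW / SEW) * LMUL, kept as numerator/denominator and reduced with
// gcd, mirroring the pass's fractional bookkeeping.
// Returns {value, isFractional}.
std::pair<unsigned, bool> computeEMUL(unsigned EEW, unsigned SEW,
                                      unsigned LMul, bool LMulFractional) {
  unsigned Num = EEW * (LMulFractional ? 1 : LMul);
  unsigned Denom = SEW * (LMulFractional ? LMul : 1);
  unsigned G = std::gcd(Num, Denom);
  Num /= G;
  Denom /= G;
  return {Num > Denom ? Num / Denom : Denom / Num, Denom > Num};
}

int main() {
  // EEW=16, SEW=32, LMUL=1 -> EMUL = mf2 (one half, fractional).
  auto [V, Frac] = computeEMUL(16, 32, 1, /*LMulFractional=*/false);
  std::printf("EMUL = %s%u\n", Frac ? "mf" : "m", V);
}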
if (auto DemandedVL = DemandedVLs[&UserMI]) { @@ -1227,12 +1235,6 @@ std::optional<MachineOperand> RISCVVLOptimizer::checkUsers(MachineInstr &MI) { return std::nullopt; } - // If used as a passthru, elements past VL will be read. - if (UserOp.isTied()) { - LLVM_DEBUG(dbgs() << " Abort because user used as tied operand\n"); - return std::nullopt; - } - auto VLOp = getMinimumVLForUser(UserOp); if (!VLOp) return std::nullopt; @@ -1332,6 +1334,7 @@ bool RISCVVLOptimizer::tryReduceVL(MachineInstr &MI) { } bool RISCVVLOptimizer::runOnMachineFunction(MachineFunction &MF) { + assert(DemandedVLs.size() == 0); if (skipFunction(MF.getFunction())) return false; @@ -1370,5 +1373,6 @@ bool RISCVVLOptimizer::runOnMachineFunction(MachineFunction &MF) { } } + DemandedVLs.clear(); return MadeChange; } diff --git llvm/lib/Target/Sparc/SparcInstrAliases.td llvm/lib/Target/Sparc/SparcInstrAliases.td index 673a2db59b09..906f51bb8d10 100644 --- llvm/lib/Target/Sparc/SparcInstrAliases.td +++ llvm/lib/Target/Sparc/SparcInstrAliases.td @@ -605,6 +605,8 @@ def : InstAlias<"unimp", (UNIMP 0), 0>; // interchangeable with `unimp` all the time. def : MnemonicAlias<"illtrap", "unimp">; +def : MnemonicAlias<"setuw", "set">, Requires<[HasV9]>; + def : MnemonicAlias<"iflush", "flush">; def : MnemonicAlias<"stub", "stb">; diff --git llvm/lib/Target/SystemZ/SystemZPostRewrite.cpp llvm/lib/Target/SystemZ/SystemZPostRewrite.cpp index e15f9027cc20..cf3073f0f209 100644 --- llvm/lib/Target/SystemZ/SystemZPostRewrite.cpp +++ llvm/lib/Target/SystemZ/SystemZPostRewrite.cpp @@ -107,6 +107,18 @@ void SystemZPostRewrite::selectSELRMux(MachineBasicBlock &MBB, bool Src1IsHigh = SystemZ::isHighReg(Src1Reg); bool Src2IsHigh = SystemZ::isHighReg(Src2Reg); + // In rare cases both sources are the same register (after + // machine-cse). This must be handled as it may lead to wrong-code (after + // machine-cp) if the kill flag on Src1 isn't cleared (with + // expandCondMove()). + if (Src1Reg == Src2Reg) { + BuildMI(*MBBI->getParent(), MBBI, MBBI->getDebugLoc(), + TII->get(SystemZ::COPY), DestReg) + .addReg(MBBI->getOperand(1).getReg(), getRegState(MBBI->getOperand(1))); + MBBI->eraseFromParent(); + return; + } + // If sources and destination aren't all high or all low, we may be able to // simplify the operation by moving one of the sources to the destination // first. But only if this doesn't clobber the other source. 
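The SystemZ fix above handles select micro-ops whose two sources were CSE'd into one register: select(cc, x, x) is just x, so emitting a plain COPY both simplifies the code and sidesteps the stale kill-flag problem the comment describes. The degenerate case in miniature:

#include <cassert>
#include <cstdint>

// A SELRMux-like select: dest = cc ? src1 : src2. When src1 and src2 are
// the same register the condition is irrelevant, and the safe expansion is
// an unconditional copy (the patch's COPY).
uint32_t selectMux(bool CC, uint32_t Src1, uint32_t Src2) {
  return CC ? Src1 : Src2;
}

int main() {
  uint32_t X = 42;
  // Same source on both sides: result is X for either condition value.
  assert(selectMux(true, X, X) == selectMux(false, X, X));
  return 0;
}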
diff --git llvm/lib/Target/WebAssembly/WebAssemblyISD.def llvm/lib/Target/WebAssembly/WebAssemblyISD.def index 1cf0d13df1ff..378ef2c8f250 100644 --- llvm/lib/Target/WebAssembly/WebAssemblyISD.def +++ llvm/lib/Target/WebAssembly/WebAssemblyISD.def @@ -26,6 +26,7 @@ HANDLE_NODETYPE(Wrapper) HANDLE_NODETYPE(WrapperREL) HANDLE_NODETYPE(BR_IF) HANDLE_NODETYPE(BR_TABLE) +HANDLE_NODETYPE(DOT) HANDLE_NODETYPE(SHUFFLE) HANDLE_NODETYPE(SWIZZLE) HANDLE_NODETYPE(VEC_SHL) diff --git llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 02db1b142a22..fedad25c775e 100644 --- llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -29,6 +29,7 @@ #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/DiagnosticPrinter.h" #include "llvm/IR/Function.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsWebAssembly.h" #include "llvm/Support/ErrorHandling.h" @@ -177,6 +178,10 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( // SIMD-specific configuration if (Subtarget->hasSIMD128()) { + + // Combine partial.reduce.add before legalization gets confused. + setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); + // Combine vector mask reductions into alltrue/anytrue setTargetDAGCombine(ISD::SETCC); @@ -406,6 +411,35 @@ MVT WebAssemblyTargetLowering::getPointerMemTy(const DataLayout &DL, return TargetLowering::getPointerMemTy(DL, AS); } +bool WebAssemblyTargetLowering::shouldExpandPartialReductionIntrinsic( + const IntrinsicInst *I) const { + if (I->getIntrinsicID() != Intrinsic::experimental_vector_partial_reduce_add) + return true; + + EVT VT = EVT::getEVT(I->getType()); + auto Op1 = I->getOperand(1); + + if (auto *InputInst = dyn_cast<Instruction>(Op1)) { + if (InstructionOpcodeToISD(InputInst->getOpcode()) != ISD::MUL) + return true; + + if (isa<Instruction>(InputInst->getOperand(0)) && + isa<Instruction>(InputInst->getOperand(1))) { + // dot only supports signed inputs, but we also support lowering unsigned. + if (cast<Instruction>(InputInst->getOperand(0))->getOpcode() != + cast<Instruction>(InputInst->getOperand(1))->getOpcode()) + return true; + + EVT Op1VT = EVT::getEVT(Op1->getType()); + if (Op1VT.getVectorElementType() == VT.getVectorElementType() && + ((VT.getVectorElementCount() * 2 == Op1VT.getVectorElementCount()) || + (VT.getVectorElementCount() * 4 == Op1VT.getVectorElementCount()))) + return false; + } + } + return true; +} + TargetLowering::AtomicExpansionKind WebAssemblyTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { // We have wasm instructions for these @@ -2030,6 +2064,94 @@ SDValue WebAssemblyTargetLowering::LowerVASTART(SDValue Op, MachinePointerInfo(SV)); } +// Try to lower partial.reduce.add to a dot, or fall back to a sequence with +// extmul and adds.
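shouldExpandPartialReductionIntrinsic above returns false (that is, "keep it, we will lower it ourselves") only for a mul of two same-signedness extends whose lane count is 2x or 4x that of the accumulator. A sketch of just that predicate over plain data, with hypothetical names:

// Conditions under which the WebAssembly lowering keeps
// partial.reduce.add(acc, mul(ext(a), ext(b))) instead of expanding it:
// matching extend kinds and a 2x or 4x input-to-accumulator lane ratio.
struct PartialReduceInput {
  bool IsMul;
  bool LHSSigned, RHSSigned; // signedness of the two extends
  unsigned InputLanes, AccumLanes;
};

bool canLowerPartialReduce(const PartialReduceInput &In) {
  if (!In.IsMul)
    return false;
  if (In.LHSSigned != In.RHSSigned) // dot needs one signedness throughout
    return false;
  return In.InputLanes == 2 * In.AccumLanes ||
         In.InputLanes == 4 * In.AccumLanes;
}

int main() {
  // v16i8 inputs accumulating into v4i32: 4x ratio, lowerable.
  return canLowerPartialReduce({true, true, true, 16, 4}) ? 0 : 1;
}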
+SDValue performLowerPartialReduction(SDNode *N, SelectionDAG &DAG) { + assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN); + if (N->getConstantOperandVal(0) != + Intrinsic::experimental_vector_partial_reduce_add) + return SDValue(); + + assert(N->getValueType(0) == MVT::v4i32 && "can only support v4i32"); + SDLoc DL(N); + SDValue Mul = N->getOperand(2); + assert(Mul->getOpcode() == ISD::MUL && "expected mul input"); + + SDValue ExtendLHS = Mul->getOperand(0); + SDValue ExtendRHS = Mul->getOperand(1); + assert((ISD::isExtOpcode(ExtendLHS.getOpcode()) && + ISD::isExtOpcode(ExtendRHS.getOpcode())) && + "expected widening mul"); + assert(ExtendLHS.getOpcode() == ExtendRHS.getOpcode() && + "expected mul to use the same extend for both operands"); + + SDValue ExtendInLHS = ExtendLHS->getOperand(0); + SDValue ExtendInRHS = ExtendRHS->getOperand(0); + bool IsSigned = ExtendLHS->getOpcode() == ISD::SIGN_EXTEND; + + if (ExtendInLHS->getValueType(0) == MVT::v8i16) { + if (IsSigned) { + // i32x4.dot_i16x8_s + SDValue Dot = DAG.getNode(WebAssemblyISD::DOT, DL, MVT::v4i32, + ExtendInLHS, ExtendInRHS); + return DAG.getNode(ISD::ADD, DL, MVT::v4i32, N->getOperand(1), Dot); + } + + unsigned LowOpc = WebAssemblyISD::EXTEND_LOW_U; + unsigned HighOpc = WebAssemblyISD::EXTEND_HIGH_U; + + // (add (add (extmul_low_sx lhs, rhs), (extmul_high_sx lhs, rhs))) + SDValue LowLHS = DAG.getNode(LowOpc, DL, MVT::v4i32, ExtendInLHS); + SDValue LowRHS = DAG.getNode(LowOpc, DL, MVT::v4i32, ExtendInRHS); + SDValue HighLHS = DAG.getNode(HighOpc, DL, MVT::v4i32, ExtendInLHS); + SDValue HighRHS = DAG.getNode(HighOpc, DL, MVT::v4i32, ExtendInRHS); + + SDValue MulLow = DAG.getNode(ISD::MUL, DL, MVT::v4i32, LowLHS, LowRHS); + SDValue MulHigh = DAG.getNode(ISD::MUL, DL, MVT::v4i32, HighLHS, HighRHS); + SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::v4i32, MulLow, MulHigh); + return DAG.getNode(ISD::ADD, DL, MVT::v4i32, N->getOperand(1), Add); + } else { + assert(ExtendInLHS->getValueType(0) == MVT::v16i8 && + "expected v16i8 input types"); + // Lower to a wider tree, using twice the operations compared to above. 
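performLowerPartialReduction leans on i32x4.dot_i16x8_s, which multiplies adjacent i16 lane pairs and sums each pair into an i32 lane; the unsigned path above rebuilds the same sums from extend_low/extend_high multiplies. A scalar reference for the dot semantics:

#include <cassert>
#include <cstdint>

// Reference semantics of i32x4.dot_i16x8_s: output lane i is the sum of
// products of input lanes 2i and 2i+1, with signed 16->32 extension.
void dotI16x8S(const int16_t A[8], const int16_t B[8], int32_t Out[4]) {
  for (int I = 0; I < 4; ++I)
    Out[I] = (int32_t)A[2 * I] * B[2 * I] +
             (int32_t)A[2 * I + 1] * B[2 * I + 1];
}

int main() {
  int16_t A[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  int16_t B[8] = {1, 1, 1, 1, 1, 1, 1, 1};
  int32_t Out[4];
  dotI16x8S(A, B, Out);
  assert(Out[0] == 3 && Out[1] == 7 && Out[2] == 11 && Out[3] == 15);
  return 0;
}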
+ if (IsSigned) { + // Use two dots + unsigned LowOpc = WebAssemblyISD::EXTEND_LOW_S; + unsigned HighOpc = WebAssemblyISD::EXTEND_HIGH_S; + SDValue LowLHS = DAG.getNode(LowOpc, DL, MVT::v8i16, ExtendInLHS); + SDValue LowRHS = DAG.getNode(LowOpc, DL, MVT::v8i16, ExtendInRHS); + SDValue HighLHS = DAG.getNode(HighOpc, DL, MVT::v8i16, ExtendInLHS); + SDValue HighRHS = DAG.getNode(HighOpc, DL, MVT::v8i16, ExtendInRHS); + SDValue DotLHS = + DAG.getNode(WebAssemblyISD::DOT, DL, MVT::v4i32, LowLHS, LowRHS); + SDValue DotRHS = + DAG.getNode(WebAssemblyISD::DOT, DL, MVT::v4i32, HighLHS, HighRHS); + SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::v4i32, DotLHS, DotRHS); + return DAG.getNode(ISD::ADD, DL, MVT::v4i32, N->getOperand(1), Add); + } + + unsigned LowOpc = WebAssemblyISD::EXTEND_LOW_U; + unsigned HighOpc = WebAssemblyISD::EXTEND_HIGH_U; + SDValue LowLHS = DAG.getNode(LowOpc, DL, MVT::v8i16, ExtendInLHS); + SDValue LowRHS = DAG.getNode(LowOpc, DL, MVT::v8i16, ExtendInRHS); + SDValue HighLHS = DAG.getNode(HighOpc, DL, MVT::v8i16, ExtendInLHS); + SDValue HighRHS = DAG.getNode(HighOpc, DL, MVT::v8i16, ExtendInRHS); + + SDValue MulLow = DAG.getNode(ISD::MUL, DL, MVT::v8i16, LowLHS, LowRHS); + SDValue MulHigh = DAG.getNode(ISD::MUL, DL, MVT::v8i16, HighLHS, HighRHS); + + SDValue LowLow = DAG.getNode(LowOpc, DL, MVT::v4i32, MulLow); + SDValue LowHigh = DAG.getNode(LowOpc, DL, MVT::v4i32, MulHigh); + SDValue HighLow = DAG.getNode(HighOpc, DL, MVT::v4i32, MulLow); + SDValue HighHigh = DAG.getNode(HighOpc, DL, MVT::v4i32, MulHigh); + + SDValue AddLow = DAG.getNode(ISD::ADD, DL, MVT::v4i32, LowLow, HighLow); + SDValue AddHigh = DAG.getNode(ISD::ADD, DL, MVT::v4i32, LowHigh, HighHigh); + SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::v4i32, AddLow, AddHigh); + return DAG.getNode(ISD::ADD, DL, MVT::v4i32, N->getOperand(1), Add); + } +} + SDValue WebAssemblyTargetLowering::LowerIntrinsic(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); @@ -3126,5 +3248,7 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N, return performVectorTruncZeroCombine(N, DCI); case ISD::TRUNCATE: return performTruncateCombine(N, DCI); + case ISD::INTRINSIC_WO_CHAIN: + return performLowerPartialReduction(N, DCI.DAG); } } diff --git llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h index d9ced1a1a527..90d31e38a707 100644 --- llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h +++ llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h @@ -45,6 +45,8 @@ private: /// right decision when generating code for different targets. 
const WebAssemblySubtarget *Subtarget; + bool + shouldExpandPartialReductionIntrinsic(const IntrinsicInst *I) const override; AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override; bool shouldScalarizeBinop(SDValue VecOp) const override; FastISel *createFastISel(FunctionLoweringInfo &FuncInfo, diff --git llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index 2c0543842a82..14acc623ce24 100644 --- llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -1147,11 +1147,15 @@ def : Pat<(wasm_shr_u } // Widening dot product: i32x4.dot_i16x8_s +def dot_t : SDTypeProfile<1, 2, [SDTCisVT<0, v4i32>, SDTCisVT<1, v8i16>, SDTCisVT<2, v8i16>]>; +def wasm_dot : SDNode<"WebAssemblyISD::DOT", dot_t>; let isCommutable = 1 in defm DOT : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs), (outs), (ins), [(set V128:$dst, (int_wasm_dot V128:$lhs, V128:$rhs))], "i32x4.dot_i16x8_s\t$dst, $lhs, $rhs", "i32x4.dot_i16x8_s", 186>; +def : Pat<(wasm_dot V128:$lhs, V128:$rhs), + (DOT $lhs, $rhs)>; // Extending multiplication: extmul_{low,high}_P, extmul_high def extend_t : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>; diff --git llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp index 3d678e538416..e4dc38686a44 100644 --- llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp +++ llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp @@ -13,6 +13,8 @@ //===----------------------------------------------------------------------===// #include "WebAssemblyTargetTransformInfo.h" + +#include "llvm/CodeGen/CostTable.h" using namespace llvm; #define DEBUG_TYPE "wasmtti" @@ -51,8 +53,7 @@ TypeSize WebAssemblyTTIImpl::getRegisterBitWidth( InstructionCost WebAssemblyTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, - ArrayRef<const Value *> Args, - const Instruction *CxtI) { + ArrayRef<const Value *> Args, const Instruction *CxtI) { InstructionCost Cost = BasicTTIImplBase<WebAssemblyTTIImpl>::getArithmeticInstrCost( @@ -78,6 +79,109 @@ InstructionCost WebAssemblyTTIImpl::getArithmeticInstrCost( return Cost; } +InstructionCost WebAssemblyTTIImpl::getCastInstrCost( + unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, + TTI::TargetCostKind CostKind, const Instruction *I) { + int ISD = TLI->InstructionOpcodeToISD(Opcode); + auto SrcTy = TLI->getValueType(DL, Src); + auto DstTy = TLI->getValueType(DL, Dst); + + if (!SrcTy.isSimple() || !DstTy.isSimple()) { + return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); + } + + if (!ST->hasSIMD128()) { + return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); + } + + auto DstVT = DstTy.getSimpleVT(); + auto SrcVT = SrcTy.getSimpleVT(); + + if (I && I->hasOneUser()) { + auto *SingleUser = cast<Instruction>(*I->user_begin()); + int UserISD = TLI->InstructionOpcodeToISD(SingleUser->getOpcode()); + + // extmul_low support + if (UserISD == ISD::MUL && + (ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND)) { + // Free low extensions. + if ((SrcVT == MVT::v8i8 && DstVT == MVT::v8i16) || + (SrcVT == MVT::v4i16 && DstVT == MVT::v4i32) || + (SrcVT == MVT::v2i32 && DstVT == MVT::v2i64)) { + return 0; + } + // Will require an additional extlow operation for the intermediate + // i16/i32 value. 
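The cast-cost hunk above treats an extend that feeds a multiply as free when its width step matches an extmul_low/extmul_high form, and charges one extra operation when an intermediate extend_low is still needed. A hedged standalone model of that heuristic (the real code consults the MVT pairs shown in the surrounding diff):

// Model of the extend-feeding-mul cost rule: one widening step folds into
// extmul_low/_high (cost 0); a double step needs one extend_low first.
unsigned extendForMulCost(unsigned SrcBits, unsigned DstBits) {
  if (DstBits == 2 * SrcBits)
    return 0; // folds into extmul_low/_high
  if (DstBits == 4 * SrcBits)
    return 1; // one extra extend_low for the intermediate type
  return ~0u; // anything else falls back to the generic cost
}

int main() {
  // v8i8 -> v8i16 feeding a mul: free. v4i8 -> v4i32: one extra op.
  return (extendForMulCost(8, 16) == 0 && extendForMulCost(8, 32) == 1) ? 0 : 1;
}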
+ if ((SrcVT == MVT::v4i8 && DstVT == MVT::v4i32) || + (SrcVT == MVT::v2i16 && DstVT == MVT::v2i64)) { + return 1; + } + } + } + + // extend_low + static constexpr TypeConversionCostTblEntry ConversionTbl[] = { + {ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1}, + {ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1}, + {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1}, + {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1}, + {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1}, + {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1}, + {ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2}, + {ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2}, + {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2}, + {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2}, + }; + + if (const auto *Entry = + ConvertCostTableLookup(ConversionTbl, ISD, DstVT, SrcVT)) { + return Entry->Cost; + } + + return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); +} + +InstructionCost WebAssemblyTTIImpl::getMemoryOpCost( + unsigned Opcode, Type *Ty, MaybeAlign Alignment, unsigned AddressSpace, + TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo, + const Instruction *I) { + if (!ST->hasSIMD128() || !isa<FixedVectorType>(Ty)) { + return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace, + CostKind); + } + + int ISD = TLI->InstructionOpcodeToISD(Opcode); + if (ISD != ISD::LOAD) { + return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace, + CostKind); + } + + EVT VT = TLI->getValueType(DL, Ty, true); + // Type legalization can't handle structs + if (VT == MVT::Other) + return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace, + CostKind); + + auto LT = getTypeLegalizationCost(Ty); + if (!LT.first.isValid()) + return InstructionCost::getInvalid(); + + // 128-bit loads are a single instruction. 32-bit and 64-bit vector loads can + // be lowered to load32_zero and load64_zero respectively. Assume SIMD loads + // are twice as expensive as scalar. + unsigned width = VT.getSizeInBits(); + switch (width) { + default: + break; + case 32: + case 64: + case 128: + return 2; + } + + return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace, CostKind); +} + InstructionCost WebAssemblyTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, @@ -92,6 +196,53 @@ WebAssemblyTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, return Cost; } +InstructionCost WebAssemblyTTIImpl::getPartialReductionCost( + unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, + ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, + TTI::PartialReductionExtendKind OpBExtend, + std::optional<unsigned> BinOp) const { + InstructionCost Invalid = InstructionCost::getInvalid(); + if (!VF.isFixed() || !ST->hasSIMD128()) + return Invalid; + + InstructionCost Cost(TTI::TCC_Basic); + + // Possible options: + // - i16x8.extadd_pairwise_i8x16_sx + // - i32x4.extadd_pairwise_i16x8_sx + // - i32x4.dot_i16x8_s + // Only try to support dot, for now. + + if (Opcode != Instruction::Add) + return Invalid; + + if (!BinOp || *BinOp != Instruction::Mul) + return Invalid; + + if (InputTypeA != InputTypeB) + return Invalid; + + if (OpAExtend != OpBExtend) + return Invalid; + + EVT InputEVT = EVT::getEVT(InputTypeA); + EVT AccumEVT = EVT::getEVT(AccumType); + + // TODO: Add i64 accumulator. + if (AccumEVT != MVT::i32) + return Invalid; + + // Signed inputs can lower to dot + if (InputEVT == MVT::i16 && VF.getFixedValue() == 8) + return OpAExtend == TTI::PR_SignExtend ? Cost : Cost * 2; + + // Double the size of the lowered sequence. 
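getPartialReductionCost prices the dot-based lowering sketched earlier: one instruction for the signed v8i16 case, doubled for unsigned inputs (the extmul fallback) and doubled again for v16i8 inputs (the wider tree the comment above and the branch that follows implement). A compact restatement under those same factors, names illustrative:

#include <optional>

// Cost of lowering partial.reduce.add on wasm SIMD, per the hunk above:
// base cost 1 for the signed i16x8 dot; x2 for unsigned; x2 again for i8x16.
std::optional<unsigned> partialReduceCost(unsigned InputBits, unsigned Lanes,
                                          bool Signed) {
  if (InputBits == 16 && Lanes == 8)
    return Signed ? 1u : 2u;
  if (InputBits == 8 && Lanes == 16)
    return Signed ? 2u : 4u;
  return std::nullopt; // unsupported shape: invalid cost
}

int main() {
  return (partialReduceCost(8, 16, /*Signed=*/false) == 4u) ? 0 : 1;
}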
+ if (InputEVT == MVT::i8 && VF.getFixedValue() == 16) + return OpAExtend == TTI::PR_SignExtend ? Cost * 2 : Cost * 4; + + return Invalid; +} + TTI::ReductionShuffle WebAssemblyTTIImpl::getPreferredExpandedReductionShuffle( const IntrinsicInst *II) const { diff --git llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h index 9691120b2e53..4d498b154c52 100644 --- llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h +++ llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h @@ -64,11 +64,26 @@ public: TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info = {TTI::OK_AnyValue, TTI::OP_None}, ArrayRef<const Value *> Args = {}, const Instruction *CxtI = nullptr); + + InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, + TTI::CastContextHint CCH, + TTI::TargetCostKind CostKind, + const Instruction *I = nullptr); + InstructionCost getMemoryOpCost( + unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, + TTI::TargetCostKind CostKind, + TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None}, + const Instruction *I = nullptr); using BaseT::getVectorInstrCost; InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1); - + InstructionCost + getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, + Type *AccumType, ElementCount VF, + TTI::PartialReductionExtendKind OpAExtend, + TTI::PartialReductionExtendKind OpBExtend, + std::optional<unsigned> BinOp = std::nullopt) const; TTI::ReductionShuffle getPreferredExpandedReductionShuffle(const IntrinsicInst *II) const; diff --git llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp index bab7fe9d25e4..88b5ec8cd004 100644 --- llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp +++ llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp @@ -99,6 +99,10 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI, .widenScalarToNextPow2(0, /*Min=*/8) .clampScalar(0, s8, sMaxScalar); + getActionDefinitionsBuilder(G_LROUND).libcall(); + + getActionDefinitionsBuilder(G_LLROUND).libcall(); + // merge/unmerge for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; diff --git llvm/lib/Target/X86/X86FrameLowering.cpp llvm/lib/Target/X86/X86FrameLowering.cpp index f8ed75f189a7..a15db039a5ed 100644 --- llvm/lib/Target/X86/X86FrameLowering.cpp +++ llvm/lib/Target/X86/X86FrameLowering.cpp @@ -2578,14 +2578,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, --MBBI; } - // Windows unwinder will not invoke function's exception handler if IP is - // either in prologue or in epilogue. This behavior causes a problem when a - // call immediately precedes an epilogue, because the return address points - // into the epilogue. To cope with that, we insert an epilogue marker here, - // then replace it with a 'nop' if it ends up immediately after a CALL in the - // final emitted code. if (NeedsWin64CFI && MF.hasWinCFI()) - BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_Epilogue)); + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_BeginEpilogue)); if (!HasFP && NeedsDwarfCFI) { MBBI = FirstCSPop; @@ -2630,6 +2624,9 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, // Emit tilerelease for AMX kernel. 
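The X86 epilogue is now bracketed by two markers: SEH_BeginEpilogue, emitted above where the old SEH_Epilogue was, and SEH_EndEpilogue, emitted after the terminator in the hunk that follows. The nop-after-call workaround moves with the begin marker into MC lowering: if the last real instruction before the epilogue is a call, a nop keeps the return address out of the region the Windows unwinder refuses to dispatch handlers from. A sketch over a toy instruction list (the strings are stand-ins, not real opcodes):

#include <string>
#include <vector>

// If the last real instruction before the epilogue marker is a call, insert
// a nop so the call's return address does not point into the epilogue.
void fixupEpilogueMarker(std::vector<std::string> &MBB, size_t MarkerIdx) {
  for (size_t I = MarkerIdx; I-- > 0;) {
    if (MBB[I] == "meta") // skip meta instructions, as the real scan does
      continue;
    if (MBB[I] == "call")
      MBB.insert(MBB.begin() + MarkerIdx, "nop");
    break;
  }
}

int main() {
  std::vector<std::string> MBB = {"mov", "call", "seh_begin_epilogue", "ret"};
  fixupEpilogueMarker(MBB, 2);
  return MBB[2] == "nop" ? 0 : 1;
}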
if (X86FI->getAMXProgModel() == AMXProgModelEnum::ManagedRA) BuildMI(MBB, Terminator, DL, TII.get(X86::TILERELEASE)); + + if (NeedsWin64CFI && MF.hasWinCFI()) + BuildMI(MBB, Terminator, DL, TII.get(X86::SEH_EndEpilogue)); } StackOffset X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, diff --git llvm/lib/Target/X86/X86InstrCompiler.td llvm/lib/Target/X86/X86InstrCompiler.td index 9bda3fd7d951..9687ae29f1c7 100644 --- llvm/lib/Target/X86/X86InstrCompiler.td +++ llvm/lib/Target/X86/X86InstrCompiler.td @@ -262,8 +262,10 @@ let isPseudo = 1, isMeta = 1, isNotDuplicable = 1, SchedRW = [WriteSystem] in { // Epilog instructions: let isPseudo = 1, isMeta = 1, SchedRW = [WriteSystem] in { - def SEH_Epilogue : I<0, Pseudo, (outs), (ins), - "#SEH_Epilogue", []>; + def SEH_BeginEpilogue : I<0, Pseudo, (outs), (ins), + "#SEH_BeginEpilogue", []>; + def SEH_EndEpilogue : I<0, Pseudo, (outs), (ins), + "#SEH_EndEpilogue", []>; } //===----------------------------------------------------------------------===// diff --git llvm/lib/Target/X86/X86MCInstLower.cpp llvm/lib/Target/X86/X86MCInstLower.cpp index 645a9baeba65..0f8fbf5be1c9 100644 --- llvm/lib/Target/X86/X86MCInstLower.cpp +++ llvm/lib/Target/X86/X86MCInstLower.cpp @@ -1779,6 +1779,14 @@ void X86AsmPrinter::EmitSEHInstruction(const MachineInstr *MI) { OutStreamer->emitWinCFIEndProlog(); break; + case X86::SEH_BeginEpilogue: + OutStreamer->emitWinCFIBeginEpilogue(); + break; + + case X86::SEH_EndEpilogue: + OutStreamer->emitWinCFIEndEpilogue(); + break; + default: llvm_unreachable("expected SEH_ instruction"); } @@ -2420,11 +2428,17 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) { case X86::SEH_SetFrame: case X86::SEH_PushFrame: case X86::SEH_EndPrologue: + case X86::SEH_EndEpilogue: EmitSEHInstruction(MI); return; - case X86::SEH_Epilogue: { + case X86::SEH_BeginEpilogue: { assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?"); + // Windows unwinder will not invoke function's exception handler if IP is + // either in prologue or in epilogue. This behavior causes a problem when a + // call immediately precedes an epilogue, because the return address points + // into the epilogue. To cope with that, we insert a 'nop' if it ends up + // immediately after a CALL in the final emitted code. MachineBasicBlock::const_iterator MBBI(MI); // Check if preceded by a call and emit nop if so. for (MBBI = PrevCrossBBInst(MBBI); @@ -2439,6 +2453,8 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) { break; } } + + EmitSEHInstruction(MI); return; } case X86::UBSAN_UD1: diff --git llvm/lib/Target/Xtensa/AsmParser/XtensaAsmParser.cpp llvm/lib/Target/Xtensa/AsmParser/XtensaAsmParser.cpp index 731f9535ca25..0c9258ecd65e 100644 --- llvm/lib/Target/Xtensa/AsmParser/XtensaAsmParser.cpp +++ llvm/lib/Target/Xtensa/AsmParser/XtensaAsmParser.cpp @@ -73,6 +73,7 @@ class XtensaAsmParser : public MCTargetAsmParser { SMLoc &EndLoc) override { return ParseStatus::NoMatch; } + ParseStatus parsePCRelTarget(OperandVector &Operands); bool parseLiteralDirective(SMLoc L); @@ -89,6 +90,10 @@ public: : MCTargetAsmParser(Options, STI, MII) { setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); } + + bool hasWindowed() const { + return getSTI().getFeatureBits()[Xtensa::FeatureWindowed]; + }; }; // Return true if Expr is in the range [MinValue, MaxValue]. 
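The Xtensa hunks that follow add windowed-ABI style immediates: imm8n_7 encodes [-8, 7] into 4 bits of two's complement (negatives gain 16), and imm64n_4n packs the multiples of 4 in [-64, -4] into 6 bits, with decoding restoring the high sign bits. A hedged standalone roundtrip model of both encodings, matching the encoder/decoder pairs shown below:

#include <cassert>
#include <cstdint>

// Model of the imm8n_7 operand: 4-bit two's complement, range [-8, 7].
uint32_t encodeImm8n_7(int32_t V) { return V < 0 ? V + 16 : V; }
int32_t decodeImm8n_7(uint32_t Enc) {
  return Enc > 7 ? (int32_t)Enc - 16 : (int32_t)Enc;
}

// Model of the imm64n_4n operand: multiples of 4 in [-64, -4], kept in the
// low 6 bits; decoding ors the high sign bits (~0x3f) back in.
uint32_t encodeImm64n_4n(int32_t V) { return (uint32_t)V & 0x3f; }
int32_t decodeImm64n_4n(uint32_t Enc) { return (int32_t)(~0x3fu | Enc); }

int main() {
  for (int32_t V = -8; V <= 7; ++V)
    assert(decodeImm8n_7(encodeImm8n_7(V)) == V);
  for (int32_t V = -64; V <= -4; V += 4)
    assert(decodeImm64n_4n(encodeImm64n_4n(V)) == V);
  return 0;
}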
@@ -181,6 +186,11 @@ public: ((cast<MCConstantExpr>(getImm())->getValue() & 0x3) == 0); } + bool isentry_imm12() const { + return isImm(0, 32760) && + ((cast<MCConstantExpr>(getImm())->getValue() % 8) == 0); + } + bool isUimm4() const { return isImm(0, 15); } bool isUimm5() const { return isImm(0, 31); } @@ -198,6 +208,11 @@ public: bool isImm32n_95() const { return isImm(-32, 95); } + bool isImm64n_4n() const { + return isImm(-64, -4) && + ((cast<MCConstantExpr>(getImm())->getValue() & 0x3) == 0); + } + bool isB4const() const { if (Kind != Immediate) return false; @@ -491,6 +506,12 @@ bool XtensaAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, case Match_InvalidImm32n_95: return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected immediate in range [-32, 95]"); + case Match_InvalidImm64n_4n: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected immediate in range [-64, -4]"); + case Match_InvalidImm8n_7: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected immediate in range [-8, 7]"); case Match_InvalidShimm1_31: return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected immediate in range [1, 31]"); @@ -515,6 +536,10 @@ bool XtensaAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected immediate in range [0, 60], first 2 bits " "should be zero"); + case Match_Invalidentry_imm12: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected immediate in range [0, 32760], first 3 bits " + "should be zero"); } report_fatal_error("Unknown match type detected!"); @@ -601,6 +626,10 @@ ParseStatus XtensaAsmParser::parseRegister(OperandVector &Operands, getLexer().UnLex(Buf[0]); return ParseStatus::NoMatch; } + + if (!Xtensa::checkRegister(RegNo, getSTI().getFeatureBits())) + return ParseStatus::NoMatch; + if (HadParens) Operands.push_back(XtensaOperand::createToken("(", FirstS)); SMLoc S = getLoc(); @@ -702,7 +731,7 @@ bool XtensaAsmParser::ParseInstructionWithSR(ParseInstructionInfo &Info, if (RegNo == 0) RegNo = MatchRegisterAltName(RegName); - if (RegNo == 0) + if (!Xtensa::checkRegister(RegNo, getSTI().getFeatureBits())) return Error(NameLoc, "invalid register name"); // Parse operand diff --git llvm/lib/Target/Xtensa/Disassembler/XtensaDisassembler.cpp llvm/lib/Target/Xtensa/Disassembler/XtensaDisassembler.cpp index c11c4b7038bd..7ad8a87ed599 100644 --- llvm/lib/Target/Xtensa/Disassembler/XtensaDisassembler.cpp +++ llvm/lib/Target/Xtensa/Disassembler/XtensaDisassembler.cpp @@ -57,7 +57,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeXtensaDisassembler() { createXtensaDisassembler); } -static const unsigned ARDecoderTable[] = { +const unsigned ARDecoderTable[] = { Xtensa::A0, Xtensa::SP, Xtensa::A2, Xtensa::A3, Xtensa::A4, Xtensa::A5, Xtensa::A6, Xtensa::A7, Xtensa::A8, Xtensa::A9, Xtensa::A10, Xtensa::A11, Xtensa::A12, Xtensa::A13, Xtensa::A14, Xtensa::A15}; @@ -73,17 +73,23 @@ static DecodeStatus DecodeARRegisterClass(MCInst &Inst, uint64_t RegNo, return MCDisassembler::Success; } -static const unsigned SRDecoderTable[] = {Xtensa::SAR, 3}; +const unsigned SRDecoderTable[] = { + Xtensa::SAR, 3, Xtensa::WINDOWBASE, 72, Xtensa::WINDOWSTART, 73}; static DecodeStatus DecodeSRRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 255) return MCDisassembler::Fail; for (unsigned i = 0; i < std::size(SRDecoderTable); i += 2) { if (SRDecoderTable[i + 1] == 
RegNo) {
      unsigned Reg = SRDecoderTable[i];
+
+      if (!Xtensa::checkRegister(Reg,
+                                 Decoder->getSubtargetInfo().getFeatureBits()))
+        return MCDisassembler::Fail;
+
      Inst.addOperand(MCOperand::createReg(Reg));
      return MCDisassembler::Success;
    }
  }
@@ -210,6 +216,29 @@ static DecodeStatus decodeImm32n_95Operand(MCInst &Inst, uint64_t Imm,
  return MCDisassembler::Success;
}

+static DecodeStatus decodeImm8n_7Operand(MCInst &Inst, uint64_t Imm,
+                                         int64_t Address, const void *Decoder) {
+  assert(isUInt<4>(Imm) && "Invalid immediate");
+  Inst.addOperand(MCOperand::createImm(Imm > 7 ? Imm - 16 : Imm));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeImm64n_4nOperand(MCInst &Inst, uint64_t Imm,
+                                           int64_t Address,
+                                           const void *Decoder) {
+  assert(isUInt<6>(Imm) && ((Imm & 0x3) == 0) && "Invalid immediate");
+  Inst.addOperand(MCOperand::createImm((~0x3f) | (Imm)));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeEntry_Imm12OpValue(MCInst &Inst, uint64_t Imm,
+                                             int64_t Address,
+                                             const void *Decoder) {
+  assert(isUInt<15>(Imm) && ((Imm & 0x7) == 0) && "Invalid immediate");
+  Inst.addOperand(MCOperand::createImm(Imm));
+  return MCDisassembler::Success;
+}
+
 static DecodeStatus decodeShimm1_31Operand(MCInst &Inst, uint64_t Imm,
                                            int64_t Address,
                                            const void *Decoder) {
diff --git llvm/lib/Target/Xtensa/MCTargetDesc/XtensaInstPrinter.cpp llvm/lib/Target/Xtensa/MCTargetDesc/XtensaInstPrinter.cpp
index df8a0854f06f..868c7f6c0b9c 100644
--- llvm/lib/Target/Xtensa/MCTargetDesc/XtensaInstPrinter.cpp
+++ llvm/lib/Target/Xtensa/MCTargetDesc/XtensaInstPrinter.cpp
@@ -264,6 +264,28 @@ void XtensaInstPrinter::printImm32n_95_AsmOperand(const MCInst *MI, int OpNum,
   printOperand(MI, OpNum, O);
 }

+void XtensaInstPrinter::printImm8n_7_AsmOperand(const MCInst *MI, int OpNum,
+                                                raw_ostream &O) {
+  if (MI->getOperand(OpNum).isImm()) {
+    int64_t Value = MI->getOperand(OpNum).getImm();
+    assert((Value >= -8 && Value <= 7) &&
+           "Invalid argument, value must be in range [-8, 7]");
+    O << Value;
+  } else
+    printOperand(MI, OpNum, O);
+}
+
+void XtensaInstPrinter::printImm64n_4n_AsmOperand(const MCInst *MI, int OpNum,
+                                                  raw_ostream &O) {
+  if (MI->getOperand(OpNum).isImm()) {
+    int64_t Value = MI->getOperand(OpNum).getImm();
+    assert((Value >= -64 && Value <= -4) && ((Value & 0x3) == 0) &&
+           "Invalid argument, value must be a multiple of 4 in range "
+           "[-64, -4]");
+    O << Value;
+  } else
+    printOperand(MI, OpNum, O);
+}
+
 void XtensaInstPrinter::printOffset8m8_AsmOperand(const MCInst *MI, int OpNum,
                                                   raw_ostream &O) {
   if (MI->getOperand(OpNum).isImm()) {
@@ -309,6 +331,18 @@ void XtensaInstPrinter::printOffset4m32_AsmOperand(const MCInst *MI, int OpNum,
   printOperand(MI, OpNum, O);
 }

+void XtensaInstPrinter::printEntry_Imm12_AsmOperand(const MCInst *MI, int OpNum,
+                                                    raw_ostream &O) {
+  if (MI->getOperand(OpNum).isImm()) {
+    int64_t Value = MI->getOperand(OpNum).getImm();
+    assert((Value >= 0 && Value <= 32760) && ((Value % 8) == 0) &&
+           "Invalid argument, value must be a multiple of 8 in range "
+           "[0, 32760]");
+    O << Value;
+  } else
+    printOperand(MI, OpNum, O);
+}
+
 void XtensaInstPrinter::printB4const_AsmOperand(const MCInst *MI, int OpNum,
                                                 raw_ostream &O) {
   if (MI->getOperand(OpNum).isImm()) {
diff --git llvm/lib/Target/Xtensa/MCTargetDesc/XtensaInstPrinter.h llvm/lib/Target/Xtensa/MCTargetDesc/XtensaInstPrinter.h
index e5bc67869e10..630b4dd60108 100644
--- llvm/lib/Target/Xtensa/MCTargetDesc/XtensaInstPrinter.h
+++ llvm/lib/Target/Xtensa/MCTargetDesc/XtensaInstPrinter.h
@@ -60,10 +60,13 @@ private:
  void printImm1_16_AsmOperand(const
MCInst *MI, int OpNum, raw_ostream &O); void printImm1n_15_AsmOperand(const MCInst *MI, int OpNum, raw_ostream &O); void printImm32n_95_AsmOperand(const MCInst *MI, int OpNum, raw_ostream &O); + void printImm8n_7_AsmOperand(const MCInst *MI, int OpNum, raw_ostream &O); + void printImm64n_4n_AsmOperand(const MCInst *MI, int OpNum, raw_ostream &O); void printOffset8m8_AsmOperand(const MCInst *MI, int OpNum, raw_ostream &O); void printOffset8m16_AsmOperand(const MCInst *MI, int OpNum, raw_ostream &O); void printOffset8m32_AsmOperand(const MCInst *MI, int OpNum, raw_ostream &O); void printOffset4m32_AsmOperand(const MCInst *MI, int OpNum, raw_ostream &O); + void printEntry_Imm12_AsmOperand(const MCInst *MI, int OpNum, raw_ostream &O); void printB4const_AsmOperand(const MCInst *MI, int OpNum, raw_ostream &O); void printB4constu_AsmOperand(const MCInst *MI, int OpNum, raw_ostream &O); }; diff --git llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCCodeEmitter.cpp llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCCodeEmitter.cpp index 51d4b8a9cc5f..e6cdd3d0020f 100644 --- llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCCodeEmitter.cpp +++ llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCCodeEmitter.cpp @@ -111,6 +111,18 @@ private: SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; + uint32_t getImm8n_7OpValue(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + uint32_t getImm64n_4nOpValue(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + uint32_t getEntry_Imm12OpValue(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + uint32_t getShimm1_31OpValue(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; @@ -405,6 +417,46 @@ XtensaMCCodeEmitter::getImm32n_95OpValue(const MCInst &MI, unsigned OpNo, return Res; } +uint32_t +XtensaMCCodeEmitter::getImm8n_7OpValue(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpNo); + int32_t Res = static_cast<int32_t>(MO.getImm()); + + assert(((Res >= -8) && (Res <= 7)) && "Unexpected operand value!"); + + if (Res < 0) + return Res + 16; + + return Res; +} + +uint32_t +XtensaMCCodeEmitter::getImm64n_4nOpValue(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpNo); + int32_t Res = static_cast<int32_t>(MO.getImm()); + + assert(((Res >= -64) && (Res <= -4) && ((Res & 0x3) == 0)) && + "Unexpected operand value!"); + + return Res & 0x3f; +} + +uint32_t +XtensaMCCodeEmitter::getEntry_Imm12OpValue(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpNo); + uint32_t res = static_cast<uint32_t>(MO.getImm()); + + assert(((res & 0x7) == 0) && "Unexpected operand value!"); + + return res; +} + uint32_t XtensaMCCodeEmitter::getB4constOpValue(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, diff --git llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp index fc23c2356825..37dee072e5b3 100644 --- llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp +++ llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp @@ -74,6 +74,19 @@ bool Xtensa::isValidAddrOffsetForOpcode(unsigned Opcode, int64_t 
Offset) { return isValidAddrOffset(Scale, Offset); } +// Verify Special Register +bool Xtensa::checkRegister(MCRegister RegNo, const FeatureBitset &FeatureBits) { + switch (RegNo) { + case Xtensa::WINDOWBASE: + case Xtensa::WINDOWSTART: + return FeatureBits[Xtensa::FeatureWindowed]; + case Xtensa::NoRegister: + return false; + } + + return true; +} + static MCAsmInfo *createXtensaMCAsmInfo(const MCRegisterInfo &MRI, const Triple &TT, const MCTargetOptions &Options) { diff --git llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.h llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.h index 6be54867d84a..649073b01f5c 100644 --- llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.h +++ llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.h @@ -19,12 +19,14 @@ namespace llvm { +class FeatureBitset; class MCAsmBackend; class MCCodeEmitter; class MCContext; class MCInstrInfo; class MCObjectTargetWriter; class MCObjectWriter; +class MCRegister; class MCRegisterInfo; class MCSubtargetInfo; class MCTargetOptions; @@ -52,6 +54,9 @@ bool isValidAddrOffset(int Scale, int64_t OffsetVal); // Check address offset for load/store instructions. bool isValidAddrOffsetForOpcode(unsigned Opcode, int64_t Offset); + +// Verify if it's correct to use a special register. +bool checkRegister(MCRegister RegNo, const FeatureBitset &FeatureBits); } // namespace Xtensa } // end namespace llvm diff --git llvm/lib/Target/Xtensa/Xtensa.td llvm/lib/Target/Xtensa/Xtensa.td index 460a15e808b3..2c4bacbe8282 100644 --- llvm/lib/Target/Xtensa/Xtensa.td +++ llvm/lib/Target/Xtensa/Xtensa.td @@ -17,10 +17,9 @@ include "llvm/Target/Target.td" //===----------------------------------------------------------------------===// // Subtarget Features. //===----------------------------------------------------------------------===// -def FeatureDensity : SubtargetFeature<"density", "HasDensity", "true", - "Enable Density instructions">; -def HasDensity : Predicate<"Subtarget->hasDensity()">, - AssemblerPredicate<(all_of FeatureDensity)>; + +include "XtensaFeatures.td" + //===----------------------------------------------------------------------===// // Xtensa supported processors. //===----------------------------------------------------------------------===// diff --git llvm/lib/Target/Xtensa/XtensaFeatures.td llvm/lib/Target/Xtensa/XtensaFeatures.td new file mode 100644 index 000000000000..6f24a674ae0c --- /dev/null +++ llvm/lib/Target/Xtensa/XtensaFeatures.td @@ -0,0 +1,14 @@ +//===----------------------------------------------------------------------===// +// Xtensa subtarget features. +//===----------------------------------------------------------------------===// + +// Xtensa ISA extensions (Xtensa Options). 
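+// (Editor's note) Each option below is modelled as a pair: a SubtargetFeature
+// def, which creates the -mattr flag and the corresponding subtarget bool, and
+// a Predicate/AssemblerPredicate def, which gates instruction selection and
+// assembly matching on that bool. "HasFoo"/"FeatureFoo" in this description
+// are placeholder names; the real pairs follow.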
+def FeatureDensity : SubtargetFeature<"density", "HasDensity", "true", + "Enable Density instructions">; +def HasDensity : Predicate<"Subtarget->hasDensity()">, + AssemblerPredicate<(all_of FeatureDensity)>; + +def FeatureWindowed : SubtargetFeature<"windowed", "HasWindowed", "true", + "Enable Xtensa Windowed Register option">; +def HasWindowed : Predicate<"Subtarget->hasWindowed()">, + AssemblerPredicate<(all_of FeatureWindowed)>; diff --git llvm/lib/Target/Xtensa/XtensaInstrInfo.td llvm/lib/Target/Xtensa/XtensaInstrInfo.td index 699d0d6cf804..5ef795a0e528 100644 --- llvm/lib/Target/Xtensa/XtensaInstrInfo.td +++ llvm/lib/Target/Xtensa/XtensaInstrInfo.td @@ -678,3 +678,104 @@ let isReturn = 1, isTerminator = 1, let t = 0; } } + +//===----------------------------------------------------------------------===// +// Windowed instructions +//===----------------------------------------------------------------------===// + +def ENTRY : BRI12_Inst<0x06, 0x3, 0x0, (outs), (ins AR:$s, entry_imm12:$imm), + "entry\t$s, $imm", []>, Requires<[HasWindowed]> { + bits<15> imm; + + let imm12{11-0} = imm{14-3}; + let Defs = [SP]; +} + +let isCall = 1, Defs = [A0] in { + foreach i = {1,2,3} in { + defvar I = !mul(4, i); + + def CALL#I# : CALL_Inst<0x05, (outs), (ins pcrel32call:$offset), + "call"#I#"\t$offset", []>, Requires<[HasWindowed]> { + let n = i; + } + + def CALLX#I# : CALLX_Inst<0x00, 0x00, 0x00, (outs), (ins AR:$s), + "callx"#I#"\t$s", []>, Requires<[HasWindowed]> { + let m = 0x3; + let n = i; + let r = 0; + } + } +} + +def MOVSP : RRR_Inst<0x00, 0x00, 0x00, (outs AR:$t), (ins AR:$s), + "movsp\t$t, $s", []>, Requires<[HasWindowed]> { + let r = 0x01; +} + +let isReturn = 1, isTerminator = 1, + isBarrier = 1, Uses = [A0] in { + def RETW_N : RRRN_Inst<0x0D, (outs), (ins), + "retw.n", []>, Requires<[HasWindowed, HasDensity]> { + let r = 0x0F; + let s = 0; + let t = 1; + } + + def RETW : CALLX_Inst<0x00, 0x00, 0x00, (outs), (ins), + "retw", []>, Requires<[HasWindowed]> { + let m = 0x2; + let n = 0x1; + let s = 0; + let r = 0; + } +} + +def : InstAlias<"_retw", (RETW)>; +def : InstAlias<"_retw.n", (RETW_N)>; + +def S32E : RRI4_Inst<0x00, 0x09, (outs), (ins AR:$t, AR:$s, imm64n_4n:$imm), + "s32e\t$t, $s, $imm", []>, Requires<[HasWindowed]> { + bits<6> imm; + + let r = imm{5-2}; + let imm4 = 0x4; + let mayStore = 1; +} + +def L32E : RRI4_Inst<0x00, 0x09, (outs), (ins AR:$t, AR:$s, imm64n_4n:$imm), + "l32e\t$t, $s, $imm", []>, Requires<[HasWindowed]> { + bits<6> imm; + + let r = imm{5-2}; + let imm4 = 0x0; + let mayLoad = 1; +} + +def RFWU : RRR_Inst<0x00, 0x00, 0x00, (outs), (ins), + "rfwu", []>, Requires<[HasWindowed]> { + bits<4> imm; + + let r = 0x3; + let s = 0x5; + let t = 0x0; +} + +def RFWO : RRR_Inst<0x00, 0x00, 0x00, (outs), (ins), + "rfwo", []>, Requires<[HasWindowed]> { + bits<4> imm; + + let r = 0x3; + let s = 0x4; + let t = 0x0; +} + +def ROTW : RRR_Inst<0x00, 0x00, 0x04, (outs), (ins imm8n_7:$imm), + "rotw\t$imm", []>, Requires<[HasWindowed]> { + bits<4> imm; + + let r = 0x8; + let s = 0x0; + let t = imm{3-0}; +} diff --git llvm/lib/Target/Xtensa/XtensaOperands.td llvm/lib/Target/Xtensa/XtensaOperands.td index aa72fa0a56a6..402e05a5c3dd 100644 --- llvm/lib/Target/Xtensa/XtensaOperands.td +++ llvm/lib/Target/Xtensa/XtensaOperands.td @@ -37,6 +37,20 @@ def imm8_sh8 : Immediate<i32, [{ return Imm >= -32768 && Imm <= 32512 && ((Imm & let DecoderMethod = "decodeImm8_sh8Operand"; } +// imm8n_7 predicate - Immediate in the range [-8,7] +def Imm8n_7_AsmOperand: ImmAsmOperand<"Imm8n_7">; +def imm8n_7: 
Immediate<i32, [{ return Imm >= -8 && Imm <= 7; }], "Imm8n_7_AsmOperand"> { + let EncoderMethod = "getImm8n_7OpValue"; + let DecoderMethod = "decodeImm8n_7Operand"; +} + +// imm64n_4n predicate - Immediate in the range [-64,-4] +def Imm64n_4n_AsmOperand: ImmAsmOperand<"Imm64n_4n">; +def imm64n_4n: Immediate<i32, [{ return Imm >= -64 && Imm <= -4; }], "Imm64n_4n_AsmOperand"> { + let EncoderMethod = "getImm64n_4nOpValue"; + let DecoderMethod = "decodeImm64n_4nOperand"; +} + // imm12 predicate - Immediate in the range [-2048,2047] def Imm12_AsmOperand : ImmAsmOperand<"Imm12">; def imm12 : Immediate<i32, [{ return Imm >= -2048 && Imm <= 2047; }], "Imm12_AsmOperand"> { @@ -117,6 +131,13 @@ def offset4m32 : Immediate<i32, [{ return Imm >= 0 && Imm <= 60 && (Imm & 0x3 == 0); }], "Offset4m32_AsmOperand">; +// entry_imm12 predicate - Immediate in the range [0,32760], ENTRY parameter +def Entry_Imm12_AsmOperand: ImmAsmOperand<"entry_imm12">; +def entry_imm12: Immediate<i32, [{ return Imm >= 0 && Imm <= 32760 && (Imm % 8 == 0); }], "Entry_Imm12_AsmOperand"> { + let EncoderMethod = "getEntry_Imm12OpValue"; + let DecoderMethod = "decodeEntry_Imm12OpValue"; +} + // b4const predicate - Branch Immediate 4-bit signed operand def B4const_AsmOperand: ImmAsmOperand<"B4const">; def b4const: Immediate<i32, diff --git llvm/lib/Target/Xtensa/XtensaRegisterInfo.td llvm/lib/Target/Xtensa/XtensaRegisterInfo.td index 5c07386b060c..09087edc8671 100644 --- llvm/lib/Target/Xtensa/XtensaRegisterInfo.td +++ llvm/lib/Target/Xtensa/XtensaRegisterInfo.td @@ -75,4 +75,8 @@ class SRReg<bits<8> num, string n, list<string> alt = []> : XtensaReg<n> { // Shift Amount Register def SAR : SRReg<3, "sar", ["SAR","3"]>; -def SR : RegisterClass<"Xtensa", [i32], 32, (add SAR)>; +// Windowed Register Option registers +def WINDOWBASE : SRReg<72, "windowbase", ["WINDOWBASE", "72"]>; +def WINDOWSTART : SRReg<73, "windowstart", ["WINDOWSTART", "73"]>; + +def SR : RegisterClass<"Xtensa", [i32], 32, (add SAR, WINDOWBASE, WINDOWSTART)>; diff --git llvm/lib/Target/Xtensa/XtensaSubtarget.h llvm/lib/Target/Xtensa/XtensaSubtarget.h index 948dcbc5278e..dddc0f7ef605 100644 --- llvm/lib/Target/Xtensa/XtensaSubtarget.h +++ llvm/lib/Target/Xtensa/XtensaSubtarget.h @@ -36,9 +36,12 @@ private: SelectionDAGTargetInfo TSInfo; XtensaFrameLowering FrameLowering; - // Enabled Xtensa Density extension + // Enabled Xtensa Density Option bool HasDensity; + // Enabled Xtensa Windowed Register Option + bool HasWindowed; + XtensaSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS); public: @@ -64,6 +67,8 @@ public: bool hasDensity() const { return HasDensity; } + bool hasWindowed() const { return HasWindowed; } + // Automatically generated by tblgen. 
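  // (Editor's note) ParseSubtargetFeatures is emitted by TableGen from
  // XtensaFeatures.td; it is what flips HasDensity/HasWindowed when the
  // matching feature bits, e.g. -mattr=+density,+windowed, are enabled.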
void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); }; diff --git llvm/lib/TargetParser/Host.cpp llvm/lib/TargetParser/Host.cpp index fa57ae183bb8..d6a16143fe9e 100644 --- llvm/lib/TargetParser/Host.cpp +++ llvm/lib/TargetParser/Host.cpp @@ -493,6 +493,7 @@ StringRef sys::detail::getHostCPUNameForRISCV(StringRef ProcCpuinfoContent) { } return StringSwitch<const char *>(UArch) + .Case("eswin,eic770x", "sifive-p550") .Case("sifive,u74-mc", "sifive-u74") .Case("sifive,bullet0", "sifive-u74") .Default(""); diff --git llvm/lib/Transforms/Coroutines/SpillUtils.cpp llvm/lib/Transforms/Coroutines/SpillUtils.cpp index 573a2ea35deb..5062ee97a665 100644 --- llvm/lib/Transforms/Coroutines/SpillUtils.cpp +++ llvm/lib/Transforms/Coroutines/SpillUtils.cpp @@ -226,9 +226,8 @@ struct AllocaUseVisitor : PtrUseVisitor<AllocaUseVisitor> { if (auto *S = dyn_cast<StoreInst>(U)) if (S->getPointerOperand() == I) continue; - if (auto *II = dyn_cast<IntrinsicInst>(U)) - if (II->isLifetimeStartOrEnd()) - continue; + if (isa<LifetimeIntrinsic>(U)) + continue; // BitCastInst creats aliases of the memory location being stored // into. if (auto *BI = dyn_cast<BitCastInst>(U)) { diff --git llvm/lib/Transforms/IPO/FunctionAttrs.cpp llvm/lib/Transforms/IPO/FunctionAttrs.cpp index f2419e42862f..cf56f67e4de3 100644 --- llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -1296,16 +1296,6 @@ static void addArgumentAttrs(const SCCNodeSet &SCCNodes, continue; } - bool SCCCaptured = false; - for (ArgumentGraphNode *Node : ArgumentSCC) { - if (Node->Uses.empty() && !Node->Definition->hasNoCaptureAttr()) { - SCCCaptured = true; - break; - } - } - if (SCCCaptured) - continue; - SmallPtrSet<Argument *, 8> ArgumentSCCNodes; // Fill ArgumentSCCNodes with the elements of the ArgumentSCC. Used for // quickly looking up whether a given Argument is in this ArgumentSCC. @@ -1313,6 +1303,7 @@ static void addArgumentAttrs(const SCCNodeSet &SCCNodes, ArgumentSCCNodes.insert(I->Definition); } + bool SCCCaptured = false; for (ArgumentGraphNode *N : ArgumentSCC) { for (ArgumentGraphNode *Use : N->Uses) { Argument *A = Use->Definition; diff --git llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 5a4791870ac7..83534059bfb6 100644 --- llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -37,11 +37,10 @@ using namespace PatternMatch; // How many times is a select replaced by one of its operands? STATISTIC(NumSel, "Number of select opts"); - /// Compute Result = In1+In2, returning true if the result overflowed for this /// type. -static bool addWithOverflow(APInt &Result, const APInt &In1, - const APInt &In2, bool IsSigned = false) { +static bool addWithOverflow(APInt &Result, const APInt &In1, const APInt &In2, + bool IsSigned = false) { bool Overflow; if (IsSigned) Result = In1.sadd_ov(In2, Overflow); @@ -53,8 +52,8 @@ static bool addWithOverflow(APInt &Result, const APInt &In1, /// Compute Result = In1-In2, returning true if the result overflowed for this /// type. 
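// (Editor's note) These wrappers sit on APInt's checked arithmetic; an
// illustrative use of the underlying API:
//   APInt A(8, 100), B(8, 100);
//   bool Ov;
//   APInt S = A.sadd_ov(B, Ov); // i8: 100 + 100 wraps to -56, Ov == true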
-static bool subWithOverflow(APInt &Result, const APInt &In1, - const APInt &In2, bool IsSigned = false) { +static bool subWithOverflow(APInt &Result, const APInt &In1, const APInt &In2, + bool IsSigned = false) { bool Overflow; if (IsSigned) Result = In1.ssub_ov(In2, Overflow); @@ -810,15 +809,15 @@ Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS, if (GEPLHS->getNumOperands() == GEPRHS->getNumOperands() && GEPLHS->getSourceElementType() == GEPRHS->getSourceElementType()) { // If the GEPs only differ by one index, compare it. - unsigned NumDifferences = 0; // Keep track of # differences. - unsigned DiffOperand = 0; // The operand that differs. + unsigned NumDifferences = 0; // Keep track of # differences. + unsigned DiffOperand = 0; // The operand that differs. for (unsigned i = 1, e = GEPRHS->getNumOperands(); i != e; ++i) if (GEPLHS->getOperand(i) != GEPRHS->getOperand(i)) { Type *LHSType = GEPLHS->getOperand(i)->getType(); Type *RHSType = GEPRHS->getOperand(i)->getType(); // FIXME: Better support for vector of pointers. if (LHSType->getPrimitiveSizeInBits() != - RHSType->getPrimitiveSizeInBits() || + RHSType->getPrimitiveSizeInBits() || (GEPLHS->getType()->isVectorTy() && (!LHSType->isVectorTy() || !RHSType->isVectorTy()))) { // Irreconcilable differences. @@ -826,15 +825,18 @@ Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS, break; } - if (NumDifferences++) break; + if (NumDifferences++) + break; DiffOperand = i; } - if (NumDifferences == 0) // SAME GEP? - return replaceInstUsesWith(I, // No comparison is needed here. - ConstantInt::get(I.getType(), ICmpInst::isTrueWhenEqual(Cond))); - - else if (NumDifferences == 1 && CanFold(NW)) { + if (NumDifferences == 0) // SAME GEP? + return replaceInstUsesWith( + I, // No comparison is needed here. + ConstantInt::get(I.getType(), ICmpInst::isTrueWhenEqual(Cond))); + // If two GEPs only differ by an index, compare them. + // Note that nowrap flags are always needed when comparing two indices. + else if (NumDifferences == 1 && NW != GEPNoWrapFlags::none()) { Value *LHSV = GEPLHS->getOperand(DiffOperand); Value *RHSV = GEPRHS->getOperand(DiffOperand); return NewICmp(NW, LHSV, RHSV); @@ -910,8 +912,8 @@ bool InstCombinerImpl::foldAllocaCmp(AllocaInst *Alloca) { case 2: { // The alloca is only used in one icmp operand. Assume that the // equality is false. 
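        // (Editor's note) Illustrative IR for this case: with a non-escaping
        //   %a = alloca i32
        //   %c = icmp eq ptr %a, %p   --> folded to false
        //   %c = icmp ne ptr %a, %p   --> folded to true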
- auto *Res = ConstantInt::get( - ICmp->getType(), ICmp->getPredicate() == ICmpInst::ICMP_NE); + auto *Res = ConstantInt::get(ICmp->getType(), + ICmp->getPredicate() == ICmpInst::ICMP_NE); replaceInstUsesWith(*ICmp, Res); eraseInstFromFunction(*ICmp); Changed = true; @@ -942,8 +944,8 @@ Instruction *InstCombinerImpl::foldICmpAddOpConst(Value *X, const APInt &C, // (X+2) <u X --> X >u (MAXUINT-2) --> X > 253 // (X+MAXUINT) <u X --> X >u (MAXUINT-MAXUINT) --> X != 0 if (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_ULE) { - Constant *R = ConstantInt::get(X->getType(), - APInt::getMaxValue(C.getBitWidth()) - C); + Constant *R = + ConstantInt::get(X->getType(), APInt::getMaxValue(C.getBitWidth()) - C); return new ICmpInst(ICmpInst::ICMP_UGT, X, R); } @@ -1738,8 +1740,8 @@ Instruction *InstCombinerImpl::foldICmpAndShift(ICmpInst &Cmp, } else { Value *NewAnd = Builder.CreateAnd( Shift->getOperand(0), ConstantInt::get(And->getType(), NewAndCst)); - return new ICmpInst(Cmp.getPredicate(), - NewAnd, ConstantInt::get(And->getType(), NewCmpCst)); + return new ICmpInst(Cmp.getPredicate(), NewAnd, + ConstantInt::get(And->getType(), NewCmpCst)); } } @@ -1882,17 +1884,23 @@ Instruction *InstCombinerImpl::foldICmpAndConstConst(ICmpInst &Cmp, // llvm.is.fpclass(X, fcInf|fcNan) // (icmp ne (and (bitcast X to int), ExponentMask), ExponentMask) --> // llvm.is.fpclass(X, ~(fcInf|fcNan)) + // (icmp eq (and (bitcast X to int), ExponentMask), 0) --> + // llvm.is.fpclass(X, fcSubnormal|fcZero) + // (icmp ne (and (bitcast X to int), ExponentMask), 0) --> + // llvm.is.fpclass(X, ~(fcSubnormal|fcZero)) Value *V; if (!Cmp.getParent()->getParent()->hasFnAttribute( Attribute::NoImplicitFloat) && Cmp.isEquality() && match(X, m_OneUse(m_ElementWiseBitCast(m_Value(V))))) { Type *FPType = V->getType()->getScalarType(); - if (FPType->isIEEELikeFPTy() && C1 == *C2) { + if (FPType->isIEEELikeFPTy() && (C1.isZero() || C1 == *C2)) { APInt ExponentMask = APFloat::getInf(FPType->getFltSemantics()).bitcastToAPInt(); - if (C1 == ExponentMask) { - unsigned Mask = FPClassTest::fcNan | FPClassTest::fcInf; + if (*C2 == ExponentMask) { + unsigned Mask = C1.isZero() + ? FPClassTest::fcZero | FPClassTest::fcSubnormal + : FPClassTest::fcNan | FPClassTest::fcInf; if (isICMP_NE) Mask = ~Mask & fcAllFlags; return replaceInstUsesWith(Cmp, Builder.createIsFPClass(V, Mask)); @@ -2012,13 +2020,12 @@ Instruction *InstCombinerImpl::foldICmpAndConstant(ICmpInst &Cmp, Value *A; const APInt *Addend, *Msk; if (match(And, m_And(m_OneUse(m_Add(m_Value(A), m_APInt(Addend))), - m_APInt(Msk))) && - Msk->isMask() && C.ule(*Msk)) { + m_LowBitMask(Msk))) && + C.ule(*Msk)) { APInt NewComperand = (C - *Addend) & *Msk; - Value* MaskA = Builder.CreateAnd(A, ConstantInt::get(A->getType(), *Msk)); - return new ICmpInst( - Pred, MaskA, - Constant::getIntegerValue(MaskA->getType(), NewComperand)); + Value *MaskA = Builder.CreateAnd(A, ConstantInt::get(A->getType(), *Msk)); + return new ICmpInst(Pred, MaskA, + ConstantInt::get(MaskA->getType(), NewComperand)); } } @@ -2319,7 +2326,7 @@ static Instruction *foldICmpShlLHSC(ICmpInst &Cmp, Instruction *Shl, // (1 << Y) < 1 -> Y == 31 // (1 << Y) < C -> Y == 31 if C is negative and not signed min. // Exclude signed min by subtracting 1 and lower the upper bound to 0. 
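      // (Editor's note) The (C - 1).sle(0) guard below covers all the cases
      // above at once: C == 1 gives 0, a negative C != SIGNED_MIN stays
      // negative, and C == SIGNED_MIN wraps around to SIGNED_MAX and is
      // correctly rejected.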
- if (Pred == ICmpInst::ICMP_SLT && (C-1).sle(0)) + if (Pred == ICmpInst::ICMP_SLT && (C - 1).sle(0)) return new ICmpInst(ICmpInst::ICMP_EQ, Y, BitWidthMinusOne); } @@ -2804,7 +2811,7 @@ Instruction *InstCombinerImpl::foldICmpDivConstant(ICmpInst &Cmp, // (X s/ Y) == SMIN --> (X == SMIN) && (Y == 1) // (X s/ Y) != SMIN --> (X != SMIN) || (Y != 1) if (Cmp.isEquality() && Div->hasOneUse() && C.isSignBitSet() && - (!DivIsSigned || C.isMinSignedValue())) { + (!DivIsSigned || C.isMinSignedValue())) { Value *XBig = Builder.CreateICmp(Pred, X, ConstantInt::get(Ty, C)); Value *YOne = Builder.CreateICmp(Pred, Y, ConstantInt::get(Ty, 1)); auto Logic = Pred == ICmpInst::ICMP_EQ ? Instruction::And : Instruction::Or; @@ -3357,14 +3364,14 @@ Instruction *InstCombinerImpl::foldICmpSelectConstant(ICmpInst &Cmp, // the entire original Cmp can be simplified to a false. Value *Cond = Builder.getFalse(); if (TrueWhenLessThan) - Cond = Builder.CreateOr(Cond, Builder.CreateICmp(ICmpInst::ICMP_SLT, - OrigLHS, OrigRHS)); + Cond = Builder.CreateOr( + Cond, Builder.CreateICmp(ICmpInst::ICMP_SLT, OrigLHS, OrigRHS)); if (TrueWhenEqual) - Cond = Builder.CreateOr(Cond, Builder.CreateICmp(ICmpInst::ICMP_EQ, - OrigLHS, OrigRHS)); + Cond = Builder.CreateOr( + Cond, Builder.CreateICmp(ICmpInst::ICMP_EQ, OrigLHS, OrigRHS)); if (TrueWhenGreaterThan) - Cond = Builder.CreateOr(Cond, Builder.CreateICmp(ICmpInst::ICMP_SGT, - OrigLHS, OrigRHS)); + Cond = Builder.CreateOr( + Cond, Builder.CreateICmp(ICmpInst::ICMP_SGT, OrigLHS, OrigRHS)); return replaceInstUsesWith(Cmp, Cond); } @@ -3806,8 +3813,8 @@ Instruction *InstCombinerImpl::foldICmpEqIntrinsicWithConstant( APInt Mask1 = IsTrailing ? APInt::getLowBitsSet(BitWidth, Num + 1) : APInt::getHighBitsSet(BitWidth, Num + 1); APInt Mask2 = IsTrailing - ? APInt::getOneBitSet(BitWidth, Num) - : APInt::getOneBitSet(BitWidth, BitWidth - Num - 1); + ? APInt::getOneBitSet(BitWidth, Num) + : APInt::getOneBitSet(BitWidth, BitWidth - Num - 1); return new ICmpInst(Pred, Builder.CreateAnd(II->getArgOperand(0), Mask1), ConstantInt::get(Ty, Mask2)); } @@ -4867,18 +4874,17 @@ Value *InstCombinerImpl::foldMultiplicationOverflowCheck(ICmpInst &I) { return nullptr; // Wrong predicate. 
} } else // Look for: ((x * y) / x) !=/== y - if (I.isEquality() && - match(&I, - m_c_ICmp(Pred, m_Value(Y), - m_CombineAnd( - m_OneUse(m_IDiv(m_CombineAnd(m_c_Mul(m_Deferred(Y), - m_Value(X)), - m_Instruction(Mul)), - m_Deferred(X))), - m_Instruction(Div))))) { - NeedNegation = Pred == ICmpInst::Predicate::ICMP_EQ; - } else - return nullptr; + if (I.isEquality() && + match(&I, m_c_ICmp(Pred, m_Value(Y), + m_CombineAnd(m_OneUse(m_IDiv( + m_CombineAnd(m_c_Mul(m_Deferred(Y), + m_Value(X)), + m_Instruction(Mul)), + m_Deferred(X))), + m_Instruction(Div))))) { + NeedNegation = Pred == ICmpInst::Predicate::ICMP_EQ; + } else + return nullptr; BuilderTy::InsertPointGuard Guard(Builder); // If the pattern included (x * y), we'll want to insert new instructions @@ -5377,7 +5383,7 @@ Instruction *InstCombinerImpl::foldICmpBinOp(ICmpInst &I, ConstantExpr::getNeg(RHSC)); } - if (Instruction * R = foldICmpXorXX(I, Q, *this)) + if (Instruction *R = foldICmpXorXX(I, Q, *this)) return R; if (Instruction *R = foldICmpOrXX(I, Q, *this)) return R; @@ -5412,7 +5418,7 @@ Instruction *InstCombinerImpl::foldICmpBinOp(ICmpInst &I, if (ICmpInst::isEquality(Pred)) { // If X != Y, fold (X *nw Z) eq/ne (Y *nw Z) -> Z eq/ne 0 if (((Op0HasNSW && Op1HasNSW) || (Op0HasNUW && Op1HasNUW)) && - isKnownNonEqual(X, Y, DL, &AC, &I, &DT)) + isKnownNonEqual(X, Y, SQ)) return new ICmpInst(Pred, Z, Constant::getNullValue(Z->getType())); KnownBits ZKnown = computeKnownBits(Z, 0, &I); @@ -5914,8 +5920,8 @@ Instruction *InstCombinerImpl::foldICmpEquality(ICmpInst &I) { if (ShAmt < TypeBits && ShAmt != 0) { Value *Xor = Builder.CreateXor(A, B, I.getName() + ".unshifted"); APInt AndVal = APInt::getLowBitsSet(TypeBits, TypeBits - ShAmt); - Value *And = Builder.CreateAnd(Xor, Builder.getInt(AndVal), - I.getName() + ".mask"); + Value *And = + Builder.CreateAnd(Xor, Builder.getInt(AndVal), I.getName() + ".mask"); return new ICmpInst(Pred, And, Constant::getNullValue(Cst1->getType())); } } @@ -5947,10 +5953,10 @@ Instruction *InstCombinerImpl::foldICmpEquality(ICmpInst &I) { if (Instruction *ICmp = foldICmpIntrinsicWithIntrinsic(I, Builder)) return ICmp; - // Match icmp eq (trunc (lshr A, BW), (ashr (trunc A), BW-1)), which checks the - // top BW/2 + 1 bits are all the same. Create "A >=s INT_MIN && A <=s INT_MAX", - // which we generate as "icmp ult (add A, 2^(BW-1)), 2^BW" to skip a few steps - // of instcombine. + // Match icmp eq (trunc (lshr A, BW), (ashr (trunc A), BW-1)), which checks + // the top BW/2 + 1 bits are all the same. Create "A >=s INT_MIN && A <=s + // INT_MAX", which we generate as "icmp ult (add A, 2^(BW-1)), 2^BW" to skip a + // few steps of instcombine. 
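  // (Editor's note) Worked instance with an i16 %A truncated to i8: the
  // pattern asks whether the top 9 bits of %A agree, i.e. %A is in
  // [-128, 127], and 'icmp ult (add %A, 128), 256' tests exactly that range.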
unsigned BitWidth = Op0->getType()->getScalarSizeInBits(); if (match(Op0, m_AShr(m_Trunc(m_Value(A)), m_SpecificInt(BitWidth - 1))) && match(Op1, m_Trunc(m_LShr(m_Specific(A), m_SpecificInt(BitWidth)))) && @@ -6084,10 +6090,10 @@ Instruction *InstCombinerImpl::foldICmpWithZextOrSext(ICmpInst &ICmp) { bool IsZext1 = isa<ZExtInst>(ICmp.getOperand(1)); if (IsZext0 != IsZext1) { - // If X and Y and both i1 - // (icmp eq/ne (zext X) (sext Y)) - // eq -> (icmp eq (or X, Y), 0) - // ne -> (icmp ne (or X, Y), 0) + // If X and Y and both i1 + // (icmp eq/ne (zext X) (sext Y)) + // eq -> (icmp eq (or X, Y), 0) + // ne -> (icmp ne (or X, Y), 0) if (ICmp.isEquality() && X->getType()->isIntOrIntVectorTy(1) && Y->getType()->isIntOrIntVectorTy(1)) return new ICmpInst(ICmp.getPredicate(), Builder.CreateOr(X, Y), @@ -6247,16 +6253,17 @@ Instruction *InstCombinerImpl::foldICmpWithCastOp(ICmpInst &ICmp) { return foldICmpWithZextOrSext(ICmp); } -static bool isNeutralValue(Instruction::BinaryOps BinaryOp, Value *RHS, bool IsSigned) { +static bool isNeutralValue(Instruction::BinaryOps BinaryOp, Value *RHS, + bool IsSigned) { switch (BinaryOp) { - default: - llvm_unreachable("Unsupported binary op"); - case Instruction::Add: - case Instruction::Sub: - return match(RHS, m_Zero()); - case Instruction::Mul: - return !(RHS->getType()->isIntOrIntVectorTy(1) && IsSigned) && - match(RHS, m_One()); + default: + llvm_unreachable("Unsupported binary op"); + case Instruction::Add: + case Instruction::Sub: + return match(RHS, m_Zero()); + case Instruction::Mul: + return !(RHS->getType()->isIntOrIntVectorTy(1) && IsSigned) && + match(RHS, m_One()); } } @@ -6265,23 +6272,23 @@ InstCombinerImpl::computeOverflow(Instruction::BinaryOps BinaryOp, bool IsSigned, Value *LHS, Value *RHS, Instruction *CxtI) const { switch (BinaryOp) { - default: - llvm_unreachable("Unsupported binary op"); - case Instruction::Add: - if (IsSigned) - return computeOverflowForSignedAdd(LHS, RHS, CxtI); - else - return computeOverflowForUnsignedAdd(LHS, RHS, CxtI); - case Instruction::Sub: - if (IsSigned) - return computeOverflowForSignedSub(LHS, RHS, CxtI); - else - return computeOverflowForUnsignedSub(LHS, RHS, CxtI); - case Instruction::Mul: - if (IsSigned) - return computeOverflowForSignedMul(LHS, RHS, CxtI); - else - return computeOverflowForUnsignedMul(LHS, RHS, CxtI); + default: + llvm_unreachable("Unsupported binary op"); + case Instruction::Add: + if (IsSigned) + return computeOverflowForSignedAdd(LHS, RHS, CxtI); + else + return computeOverflowForUnsignedAdd(LHS, RHS, CxtI); + case Instruction::Sub: + if (IsSigned) + return computeOverflowForSignedSub(LHS, RHS, CxtI); + else + return computeOverflowForUnsignedSub(LHS, RHS, CxtI); + case Instruction::Mul: + if (IsSigned) + return computeOverflowForSignedMul(LHS, RHS, CxtI); + else + return computeOverflowForUnsignedMul(LHS, RHS, CxtI); } } @@ -6310,25 +6317,25 @@ bool InstCombinerImpl::OptimizeOverflowCheck(Instruction::BinaryOps BinaryOp, } switch (computeOverflow(BinaryOp, IsSigned, LHS, RHS, &OrigI)) { - case OverflowResult::MayOverflow: - return false; - case OverflowResult::AlwaysOverflowsLow: - case OverflowResult::AlwaysOverflowsHigh: - Result = Builder.CreateBinOp(BinaryOp, LHS, RHS); - Result->takeName(&OrigI); - Overflow = ConstantInt::getTrue(OverflowTy); - return true; - case OverflowResult::NeverOverflows: - Result = Builder.CreateBinOp(BinaryOp, LHS, RHS); - Result->takeName(&OrigI); - Overflow = ConstantInt::getFalse(OverflowTy); - if (auto *Inst = dyn_cast<Instruction>(Result)) { 
- if (IsSigned) - Inst->setHasNoSignedWrap(); - else - Inst->setHasNoUnsignedWrap(); - } - return true; + case OverflowResult::MayOverflow: + return false; + case OverflowResult::AlwaysOverflowsLow: + case OverflowResult::AlwaysOverflowsHigh: + Result = Builder.CreateBinOp(BinaryOp, LHS, RHS); + Result->takeName(&OrigI); + Overflow = ConstantInt::getTrue(OverflowTy); + return true; + case OverflowResult::NeverOverflows: + Result = Builder.CreateBinOp(BinaryOp, LHS, RHS); + Result->takeName(&OrigI); + Overflow = ConstantInt::getFalse(OverflowTy); + if (auto *Inst = dyn_cast<Instruction>(Result)) { + if (IsSigned) + Inst->setHasNoSignedWrap(); + else + Inst->setHasNoUnsignedWrap(); + } + return true; } llvm_unreachable("Unexpected overflow result"); @@ -6993,21 +7000,21 @@ static Instruction *canonicalizeICmpBool(ICmpInst &I, // Cases not handled by InstSimplify are always 'not' of Op0. if (match(B, m_Zero())) { switch (I.getPredicate()) { - case CmpInst::ICMP_EQ: // A == 0 -> !A - case CmpInst::ICMP_ULE: // A <=u 0 -> !A - case CmpInst::ICMP_SGE: // A >=s 0 -> !A - return BinaryOperator::CreateNot(A); - default: - llvm_unreachable("ICmp i1 X, C not simplified as expected."); + case CmpInst::ICMP_EQ: // A == 0 -> !A + case CmpInst::ICMP_ULE: // A <=u 0 -> !A + case CmpInst::ICMP_SGE: // A >=s 0 -> !A + return BinaryOperator::CreateNot(A); + default: + llvm_unreachable("ICmp i1 X, C not simplified as expected."); } } else if (match(B, m_One())) { switch (I.getPredicate()) { - case CmpInst::ICMP_NE: // A != 1 -> !A - case CmpInst::ICMP_ULT: // A <u 1 -> !A - case CmpInst::ICMP_SGT: // A >s -1 -> !A - return BinaryOperator::CreateNot(A); - default: - llvm_unreachable("ICmp i1 X, C not simplified as expected."); + case CmpInst::ICMP_NE: // A != 1 -> !A + case CmpInst::ICMP_ULT: // A <u 1 -> !A + case CmpInst::ICMP_SGT: // A >s -1 -> !A + return BinaryOperator::CreateNot(A); + default: + llvm_unreachable("ICmp i1 X, C not simplified as expected."); } } @@ -7193,8 +7200,8 @@ static Instruction *foldICmpOfUAddOv(ICmpInst &I) { // extract(uadd.with.overflow(A, 1), 0) == 0 // extract(uadd.with.overflow(A, -1), 0) != -1 UAddOv = cast<ExtractValueInst>(Op0)->getAggregateOperand(); - else if (match(Op1, UAddOvResultPat) && - Pred == ICmpInst::ICMP_UGT && (Op0 == A || Op0 == B)) + else if (match(Op1, UAddOvResultPat) && Pred == ICmpInst::ICMP_UGT && + (Op0 == A || Op0 == B)) // A > extract(uadd.with.overflow(A, B), 0) UAddOv = cast<ExtractValueInst>(Op1)->getAggregateOperand(); else @@ -7410,8 +7417,7 @@ Instruction *InstCombinerImpl::visitICmpInst(ICmpInst &I) { if (Value *V = dyn_castNegVal(SelectTrue)) { if (V == SelectFalse) return CmpInst::Create(Instruction::ICmp, I.getPredicate(), V, Op1); - } - else if (Value *V = dyn_castNegVal(SelectFalse)) { + } else if (Value *V = dyn_castNegVal(SelectFalse)) { if (V == SelectTrue) return CmpInst::Create(Instruction::ICmp, I.getPredicate(), V, Op1); } @@ -7552,7 +7558,8 @@ Instruction *InstCombinerImpl::visitICmpInst(ICmpInst &I) { // Try to optimize equality comparisons against alloca-based pointers. if (Op0->getType()->isPointerTy() && I.isEquality()) { - assert(Op1->getType()->isPointerTy() && "Comparing pointer with non-pointer?"); + assert(Op1->getType()->isPointerTy() && + "Comparing pointer with non-pointer?"); if (auto *Alloca = dyn_cast<AllocaInst>(getUnderlyingObject(Op0))) if (foldAllocaCmp(Alloca)) return nullptr; @@ -7722,7 +7729,8 @@ Instruction *InstCombinerImpl::foldFCmpIntToFPConst(FCmpInst &I, // Get the width of the mantissa. 
We don't want to hack on conversions that // might lose information from the integer, e.g. "i64 -> float" int MantissaWidth = LHSI->getType()->getFPMantissaWidth(); - if (MantissaWidth == -1) return nullptr; // Unknown. + if (MantissaWidth == -1) + return nullptr; // Unknown. Type *IntTy = LHSI->getOperand(0)->getType(); unsigned IntWidth = IntTy->getScalarSizeInBits(); @@ -7784,7 +7792,8 @@ Instruction *InstCombinerImpl::foldFCmpIntToFPConst(FCmpInst &I, ICmpInst::Predicate Pred; switch (I.getPredicate()) { - default: llvm_unreachable("Unexpected predicate!"); + default: + llvm_unreachable("Unexpected predicate!"); case FCmpInst::FCMP_UEQ: case FCmpInst::FCMP_OEQ: Pred = ICmpInst::ICMP_EQ; @@ -7826,7 +7835,7 @@ Instruction *InstCombinerImpl::foldFCmpIntToFPConst(FCmpInst &I, SMax.convertFromAPInt(APInt::getSignedMaxValue(IntWidth), true, APFloat::rmNearestTiesToEven); if (SMax < *RHS) { // smax < 13123.0 - if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_SLT || + if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SLE) return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType())); return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType())); @@ -7838,7 +7847,7 @@ Instruction *InstCombinerImpl::foldFCmpIntToFPConst(FCmpInst &I, UMax.convertFromAPInt(APInt::getMaxValue(IntWidth), false, APFloat::rmNearestTiesToEven); if (UMax < *RHS) { // umax < 13123.0 - if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_ULT || + if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_ULE) return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType())); return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType())); @@ -7882,10 +7891,11 @@ Instruction *InstCombinerImpl::foldFCmpIntToFPConst(FCmpInst &I, // the compare predicate and sometimes the value. RHSC is rounded towards // zero at this point. switch (Pred) { - default: llvm_unreachable("Unexpected integer comparison!"); - case ICmpInst::ICMP_NE: // (float)int != 4.4 --> true + default: + llvm_unreachable("Unexpected integer comparison!"); + case ICmpInst::ICMP_NE: // (float)int != 4.4 --> true return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType())); - case ICmpInst::ICMP_EQ: // (float)int == 4.4 --> false + case ICmpInst::ICMP_EQ: // (float)int == 4.4 --> false return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType())); case ICmpInst::ICMP_ULE: // (float)int <= 4.4 --> int <= 4 @@ -8313,20 +8323,21 @@ Instruction *InstCombinerImpl::visitFCmpInst(FCmpInst &I) { assert(OpType == Op1->getType() && "fcmp with different-typed operands?"); if (Op0 == Op1) { switch (Pred) { - default: break; - case FCmpInst::FCMP_UNO: // True if unordered: isnan(X) | isnan(Y) - case FCmpInst::FCMP_ULT: // True if unordered or less than - case FCmpInst::FCMP_UGT: // True if unordered or greater than - case FCmpInst::FCMP_UNE: // True if unordered or not equal + default: + break; + case FCmpInst::FCMP_UNO: // True if unordered: isnan(X) | isnan(Y) + case FCmpInst::FCMP_ULT: // True if unordered or less than + case FCmpInst::FCMP_UGT: // True if unordered or greater than + case FCmpInst::FCMP_UNE: // True if unordered or not equal // Canonicalize these to be 'fcmp uno %X, 0.0'. 
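      // (Editor's note) Example: 'fcmp ult %X, %X' can never satisfy its
      // ordered/less-than part, so it is true exactly when %X is NaN, which is
      // the same as 'fcmp uno %X, 0.0' (0.0 is never NaN).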
I.setPredicate(FCmpInst::FCMP_UNO); I.setOperand(1, Constant::getNullValue(OpType)); return &I; - case FCmpInst::FCMP_ORD: // True if ordered (no nans) - case FCmpInst::FCMP_OEQ: // True if ordered and equal - case FCmpInst::FCMP_OGE: // True if ordered and greater than or equal - case FCmpInst::FCMP_OLE: // True if ordered and less than or equal + case FCmpInst::FCMP_ORD: // True if ordered (no nans) + case FCmpInst::FCMP_OEQ: // True if ordered and equal + case FCmpInst::FCMP_OGE: // True if ordered and greater than or equal + case FCmpInst::FCMP_OLE: // True if ordered and less than or equal // Canonicalize these to be 'fcmp ord %X, 0.0'. I.setPredicate(FCmpInst::FCMP_ORD); I.setOperand(1, Constant::getNullValue(OpType)); @@ -8484,7 +8495,7 @@ Instruction *InstCombinerImpl::visitFCmpInst(FCmpInst &I) { cast<LoadInst>(LHSI), GEP, GV, I)) return Res; break; - } + } } if (Instruction *R = foldFabsWithFcmpZero(I, *this)) diff --git llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index e2af4d4c5636..382078e85a17 100644 --- llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -1771,26 +1771,38 @@ static Value *foldSelectInstWithICmpConst(SelectInst &SI, ICmpInst *ICI, return Builder.CreateBinaryIntrinsic(Intrinsic::smin, V, TVal); } - BinaryOperator *BO; + // Fold icmp(X) ? f(X) : C to f(X) when f(X) is guaranteed to be equal to C + // for all X in the exact range of the inverse predicate. + Instruction *Op; const APInt *C; CmpInst::Predicate CPred; - if (match(&SI, m_Select(m_Specific(ICI), m_APInt(C), m_BinOp(BO)))) + if (match(&SI, m_Select(m_Specific(ICI), m_APInt(C), m_Instruction(Op)))) CPred = ICI->getPredicate(); - else if (match(&SI, m_Select(m_Specific(ICI), m_BinOp(BO), m_APInt(C)))) + else if (match(&SI, m_Select(m_Specific(ICI), m_Instruction(Op), m_APInt(C)))) CPred = ICI->getInversePredicate(); else return nullptr; - const APInt *BinOpC; - if (!match(BO, m_BinOp(m_Specific(V), m_APInt(BinOpC)))) - return nullptr; - - ConstantRange R = ConstantRange::makeExactICmpRegion(CPred, *CmpC) - .binaryOp(BO->getOpcode(), *BinOpC); - if (R == *C) { - BO->dropPoisonGeneratingFlags(); - return BO; + ConstantRange InvDomCR = ConstantRange::makeExactICmpRegion(CPred, *CmpC); + const APInt *OpC; + if (match(Op, m_BinOp(m_Specific(V), m_APInt(OpC)))) { + ConstantRange R = InvDomCR.binaryOp( + static_cast<Instruction::BinaryOps>(Op->getOpcode()), *OpC); + if (R == *C) { + Op->dropPoisonGeneratingFlags(); + return Op; + } + } + if (auto *MMI = dyn_cast<MinMaxIntrinsic>(Op); + MMI && MMI->getLHS() == V && match(MMI->getRHS(), m_APInt(OpC))) { + ConstantRange R = ConstantRange::intrinsic(MMI->getIntrinsicID(), + {InvDomCR, ConstantRange(*OpC)}); + if (R == *C) { + MMI->dropPoisonGeneratingAnnotations(); + return MMI; + } } + return nullptr; } @@ -2852,10 +2864,10 @@ static Instruction *foldSelectWithFCmpToFabs(SelectInst &SI, if (!match(TrueVal, m_FNeg(m_Specific(X)))) return nullptr; - // Forward-propagate nnan and ninf from the fneg to the select. + // Forward-propagate nnan and ninf from the fcmp to the select. // If all inputs are not those values, then the select is not either. // Note: nsz is defined differently, so it may not be correct to propagate. 
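  // (Editor's note on the fix below) The flags must come from the fcmp, not
  // the fneg: nnan on the fcmp makes the whole select poison when X is NaN,
  // so marking the select nnan is sound. nnan on the fneg only constrains the
  // arm that is actually selected, so the select could still yield NaN through
  // the other arm.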
- FastMathFlags FMF = cast<FPMathOperator>(TrueVal)->getFastMathFlags(); + FastMathFlags FMF = cast<FPMathOperator>(CondVal)->getFastMathFlags(); if (FMF.noNaNs() && !SI.hasNoNaNs()) { SI.setHasNoNaNs(true); ChangedFMF = true; @@ -2864,6 +2876,13 @@ static Instruction *foldSelectWithFCmpToFabs(SelectInst &SI, SI.setHasNoInfs(true); ChangedFMF = true; } + // Forward-propagate nnan from the fneg to the select. + // The nnan flag can be propagated iff fneg is selected when X is NaN. + if (!SI.hasNoNaNs() && cast<FPMathOperator>(TrueVal)->hasNoNaNs() && + (Swap ? FCmpInst::isOrdered(Pred) : FCmpInst::isUnordered(Pred))) { + SI.setHasNoNaNs(true); + ChangedFMF = true; + } // With nsz, when 'Swap' is false: // fold (X < +/-0.0) ? -X : X or (X <= +/-0.0) ? -X : X to fabs(X) diff --git llvm/lib/Transforms/InstCombine/InstructionCombining.cpp llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index a64c188575e6..5621511570b5 100644 --- llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -33,6 +33,7 @@ //===----------------------------------------------------------------------===// #include "InstCombineInternal.h" +#include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" @@ -4069,6 +4070,49 @@ InstCombinerImpl::foldExtractOfOverflowIntrinsic(ExtractValueInst &EV) { return nullptr; } +static Value *foldFrexpOfSelect(ExtractValueInst &EV, IntrinsicInst *FrexpCall, + SelectInst *SelectInst, + InstCombiner::BuilderTy &Builder) { + // Helper to fold frexp of select to select of frexp. + + if (!SelectInst->hasOneUse() || !FrexpCall->hasOneUse()) + return nullptr; + Value *Cond = SelectInst->getCondition(); + Value *TrueVal = SelectInst->getTrueValue(); + Value *FalseVal = SelectInst->getFalseValue(); + + const APFloat *ConstVal = nullptr; + Value *VarOp = nullptr; + bool ConstIsTrue = false; + + if (match(TrueVal, m_APFloat(ConstVal))) { + VarOp = FalseVal; + ConstIsTrue = true; + } else if (match(FalseVal, m_APFloat(ConstVal))) { + VarOp = TrueVal; + ConstIsTrue = false; + } else { + return nullptr; + } + + Builder.SetInsertPoint(&EV); + + CallInst *NewFrexp = + Builder.CreateCall(FrexpCall->getCalledFunction(), {VarOp}, "frexp"); + NewFrexp->copyIRFlags(FrexpCall); + + Value *NewEV = Builder.CreateExtractValue(NewFrexp, 0, "mantissa"); + + int Exp; + APFloat Mantissa = frexp(*ConstVal, Exp, APFloat::rmNearestTiesToEven); + + Constant *ConstantMantissa = ConstantFP::get(TrueVal->getType(), Mantissa); + + Value *NewSel = Builder.CreateSelectFMF( + Cond, ConstIsTrue ? ConstantMantissa : NewEV, + ConstIsTrue ? 
NewEV : ConstantMantissa, SelectInst, "select.frexp"); + return NewSel; +} Instruction *InstCombinerImpl::visitExtractValueInst(ExtractValueInst &EV) { Value *Agg = EV.getAggregateOperand(); @@ -4079,6 +4123,15 @@ Instruction *InstCombinerImpl::visitExtractValueInst(ExtractValueInst &EV) { SQ.getWithInstruction(&EV))) return replaceInstUsesWith(EV, V); + Value *Cond, *TrueVal, *FalseVal; + if (match(&EV, m_ExtractValue<0>(m_Intrinsic<Intrinsic::frexp>(m_Select( + m_Value(Cond), m_Value(TrueVal), m_Value(FalseVal)))))) { + auto *SelInst = + cast<SelectInst>(cast<IntrinsicInst>(Agg)->getArgOperand(0)); + if (Value *Result = + foldFrexpOfSelect(EV, cast<IntrinsicInst>(Agg), SelInst, Builder)) + return replaceInstUsesWith(EV, Result); + } if (InsertValueInst *IV = dyn_cast<InsertValueInst>(Agg)) { // We're extracting from an insertvalue instruction, compare the indices const unsigned *exti, *exte, *insi, *inse; diff --git llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index 75a19357ea1b..645c10275269 100644 --- llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -1518,8 +1518,7 @@ bool HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo, AI->replaceUsesWithIf(Replacement, [AICast, AILong](const Use &U) { auto *User = U.getUser(); - return User != AILong && User != AICast && - !memtag::isLifetimeIntrinsic(User); + return User != AILong && User != AICast && !isa<LifetimeIntrinsic>(User); }); memtag::annotateDebugRecords(Info, retagMask(N)); diff --git llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp index 94101f9663a8..8c28a8e76446 100644 --- llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp +++ llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp @@ -497,13 +497,8 @@ void collectMemAccessInfo( if (CallInst *CI = dyn_cast<CallInst>(&Inst)) maybeMarkSanitizerLibraryCallNoBuiltin(CI, &TLI); - if (isa<MemIntrinsic>(Inst)) { + if (isa<MemIntrinsic, LifetimeIntrinsic>(Inst)) MemTypeResetInsts.push_back(&Inst); - } else if (auto *II = dyn_cast<IntrinsicInst>(&Inst)) { - if (II->getIntrinsicID() == Intrinsic::lifetime_start || - II->getIntrinsicID() == Intrinsic::lifetime_end) - MemTypeResetInsts.push_back(&Inst); - } } else if (isa<AllocaInst>(Inst)) { MemTypeResetInsts.push_back(&Inst); } @@ -819,11 +814,7 @@ bool TypeSanitizer::instrumentMemInst(Value *V, Instruction *ShadowBase, NeedsMemMove = isa<MemMoveInst>(MTI); } } - } else if (auto *II = dyn_cast<IntrinsicInst>(I)) { - if (II->getIntrinsicID() != Intrinsic::lifetime_start && - II->getIntrinsicID() != Intrinsic::lifetime_end) - return false; - + } else if (auto *II = dyn_cast<LifetimeIntrinsic>(I)) { Size = II->getArgOperand(0); Dest = II->getArgOperand(1); } else if (auto *AI = dyn_cast<AllocaInst>(I)) { diff --git llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp index 8407726a69c0..311d3b1cfc0a 100644 --- llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp +++ llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp @@ -64,7 +64,7 @@ namespace { class ObjCARCContract { bool Changed; - bool CFGChanged; + bool CFGChanged = false; AAResults *AA; DominatorTree *DT; ProvenanceAnalysis PA; diff --git llvm/lib/Transforms/Scalar/ConstraintElimination.cpp llvm/lib/Transforms/Scalar/ConstraintElimination.cpp index e0861fbedc56..6dd26910f684 100644 --- llvm/lib/Transforms/Scalar/ConstraintElimination.cpp 
+++ llvm/lib/Transforms/Scalar/ConstraintElimination.cpp @@ -40,7 +40,6 @@ #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/ValueMapper.h" -#include <cmath> #include <optional> #include <string> @@ -654,7 +653,7 @@ ConstraintInfo::getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1, bool IsEq = false; bool IsNe = false; - // Try to convert Pred to one of ULE/SLT/SLE/SLT. + // Try to convert Pred to one of ULE/ULT/SLE/SLT. switch (Pred) { case CmpInst::ICMP_UGT: case CmpInst::ICMP_UGE: diff --git llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index 13f3de07c3c4..05b4f176bfc3 100644 --- llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -2210,7 +2210,9 @@ struct DSEState { Instruction *UpperInst = UpperDef->getMemoryInst(); auto IsRedundantStore = [&]() { - if (DefInst->isIdenticalTo(UpperInst)) + // We don't care about differences in call attributes here. + if (DefInst->isIdenticalToWhenDefined(UpperInst, + /*IntersectAttrs=*/true)) return true; if (auto *MemSetI = dyn_cast<MemSetInst>(UpperInst)) { if (auto *SI = dyn_cast<StoreInst>(DefInst)) { diff --git llvm/lib/Transforms/Scalar/GVN.cpp llvm/lib/Transforms/Scalar/GVN.cpp index 21eb7f741d7c..3f306bb52c12 100644 --- llvm/lib/Transforms/Scalar/GVN.cpp +++ llvm/lib/Transforms/Scalar/GVN.cpp @@ -1096,7 +1096,7 @@ Value *AvailableValue::MaterializeAdjustedValue(LoadInst *Load, if (isSimpleValue()) { Res = getSimpleValue(); if (Res->getType() != LoadTy) { - Res = getValueForLoad(Res, Offset, LoadTy, InsertPt, DL); + Res = getValueForLoad(Res, Offset, LoadTy, InsertPt, Load->getFunction()); LLVM_DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset << " " << *getSimpleValue() << '\n' @@ -1109,7 +1109,8 @@ Value *AvailableValue::MaterializeAdjustedValue(LoadInst *Load, Res = CoercedLoad; combineMetadataForCSE(CoercedLoad, Load, false); } else { - Res = getValueForLoad(CoercedLoad, Offset, LoadTy, InsertPt, DL); + Res = getValueForLoad(CoercedLoad, Offset, LoadTy, InsertPt, + Load->getFunction()); // We are adding a new user for this load, for which the original // metadata may not hold. Additionally, the new load may have a different // size and type, so their metadata cannot be combined in any @@ -1291,7 +1292,8 @@ GVNPass::AnalyzeLoadAvailability(LoadInst *Load, MemDepResult DepInfo, // If MD reported clobber, check it was nested. if (DepInfo.isClobber() && - canCoerceMustAliasedValueToLoad(DepLoad, LoadType, DL)) { + canCoerceMustAliasedValueToLoad(DepLoad, LoadType, + DepLoad->getFunction())) { const auto ClobberOff = MD->getClobberOffset(DepLoad); // GVN has no deal with a negative offset. Offset = (ClobberOff == std::nullopt || *ClobberOff < 0) @@ -1343,7 +1345,7 @@ GVNPass::AnalyzeLoadAvailability(LoadInst *Load, MemDepResult DepInfo, // different types if we have to. If the stored value is convertable to // the loaded value, we can reuse it. if (!canCoerceMustAliasedValueToLoad(S->getValueOperand(), Load->getType(), - DL)) + S->getFunction())) return std::nullopt; // Can't forward from non-atomic to atomic without violating memory model. @@ -1357,7 +1359,8 @@ GVNPass::AnalyzeLoadAvailability(LoadInst *Load, MemDepResult DepInfo, // If the types mismatch and we can't handle it, reject reuse of the load. // If the stored value is larger or equal to the loaded value, we can reuse // it. 
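    // (Editor's note, an inference not stated in the patch itself)
    // canCoerceMustAliasedValueToLoad now receives the containing Function
    // rather than a bare DataLayout; the DataLayout is still reachable through
    // it, and the extra context presumably lets the legality check consult
    // per-function information such as target features or attributes.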
- if (!canCoerceMustAliasedValueToLoad(LD, Load->getType(), DL)) + if (!canCoerceMustAliasedValueToLoad(LD, Load->getType(), + LD->getFunction())) return std::nullopt; // Can't forward from non-atomic to atomic without violating memory model. diff --git llvm/lib/Transforms/Scalar/IndVarSimplify.cpp llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index 8a3e0bc3eb97..9619dfdbf412 100644 --- llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -1262,14 +1262,14 @@ static std::optional<Value *> createReplacement(ICmpInst *ICmp, const Loop *L, BasicBlock *ExitingBB, const SCEV *MaxIter, bool Inverted, bool SkipLastIter, ScalarEvolution *SE, SCEVExpander &Rewriter) { - ICmpInst::Predicate Pred = ICmp->getPredicate(); + CmpPredicate Pred = ICmp->getCmpPredicate(); Value *LHS = ICmp->getOperand(0); Value *RHS = ICmp->getOperand(1); // 'LHS pred RHS' should now mean that we stay in loop. auto *BI = cast<BranchInst>(ExitingBB->getTerminator()); if (Inverted) - Pred = CmpInst::getInversePredicate(Pred); + Pred = ICmpInst::getInverseCmpPredicate(Pred); const SCEV *LHSS = SE->getSCEVAtScope(LHS, L); const SCEV *RHSS = SE->getSCEVAtScope(RHS, L); @@ -1460,7 +1460,6 @@ bool IndVarSimplify::canonicalizeExitCondition(Loop *L) { if (!match(LHS, m_ZExt(m_Value(LHSOp))) || !ICmp->isSigned()) continue; - const DataLayout &DL = ExitingBB->getDataLayout(); const unsigned InnerBitWidth = DL.getTypeSizeInBits(LHSOp->getType()); const unsigned OuterBitWidth = DL.getTypeSizeInBits(RHS->getType()); auto FullCR = ConstantRange::getFull(InnerBitWidth); @@ -1536,8 +1535,6 @@ bool IndVarSimplify::canonicalizeExitCondition(Loop *L) { DeadInsts.push_back(LHS); }; - - const DataLayout &DL = ExitingBB->getDataLayout(); const unsigned InnerBitWidth = DL.getTypeSizeInBits(LHSOp->getType()); const unsigned OuterBitWidth = DL.getTypeSizeInBits(RHS->getType()); auto FullCR = ConstantRange::getFull(InnerBitWidth); diff --git llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index a80a85f38e74..87b27beb01a0 100644 --- llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -968,9 +968,8 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad, append_range(srcUseList, U->users()); continue; } - if (const auto *IT = dyn_cast<IntrinsicInst>(U)) - if (IT->isLifetimeStartOrEnd()) - continue; + if (isa<LifetimeIntrinsic>(U)) + continue; if (U != C && U != cpyLoad) { LLVM_DEBUG(dbgs() << "Call slot: Source accessed by " << *U << "\n"); diff --git llvm/lib/Transforms/Scalar/NewGVN.cpp llvm/lib/Transforms/Scalar/NewGVN.cpp index d8b0bd173454..168f1d689d31 100644 --- llvm/lib/Transforms/Scalar/NewGVN.cpp +++ llvm/lib/Transforms/Scalar/NewGVN.cpp @@ -3056,13 +3056,8 @@ std::pair<unsigned, unsigned> NewGVN::assignDFSNumbers(BasicBlock *B, void NewGVN::updateProcessedCount(const Value *V) { #ifndef NDEBUG - if (ProcessedCount.count(V) == 0) { - ProcessedCount.insert({V, 1}); - } else { - ++ProcessedCount[V]; - assert(ProcessedCount[V] < 100 && - "Seem to have processed the same Value a lot"); - } + assert(++ProcessedCount[V] < 100 && + "Seem to have processed the same Value a lot"); #endif } diff --git llvm/lib/Transforms/Utils/CodeExtractor.cpp llvm/lib/Transforms/Utils/CodeExtractor.cpp index 03536e6404c7..7277603b3ec2 100644 --- llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -535,13 +535,10 @@ void CodeExtractor::findAllocas(const 
CodeExtractorAnalysisCache &CEAC, Instruction *Bitcast = cast<Instruction>(U); for (User *BU : Bitcast->users()) { - IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(BU); + auto *IntrInst = dyn_cast<LifetimeIntrinsic>(BU); if (!IntrInst) continue; - if (!IntrInst->isLifetimeStartOrEnd()) - continue; - if (definedInRegion(Blocks, IntrInst)) continue; @@ -1083,8 +1080,8 @@ static void eraseLifetimeMarkersOnInputs(const SetVector<BasicBlock *> &Blocks, SetVector<Value *> &LifetimesStart) { for (BasicBlock *BB : Blocks) { for (Instruction &I : llvm::make_early_inc_range(*BB)) { - auto *II = dyn_cast<IntrinsicInst>(&I); - if (!II || !II->isLifetimeStartOrEnd()) + auto *II = dyn_cast<LifetimeIntrinsic>(&I); + if (!II) continue; // Get the memory operand of the lifetime marker. If the underlying diff --git llvm/lib/Transforms/Utils/InlineFunction.cpp llvm/lib/Transforms/Utils/InlineFunction.cpp index adc40da07d96..b92d8b16daad 100644 --- llvm/lib/Transforms/Utils/InlineFunction.cpp +++ llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -1777,9 +1777,8 @@ static Value *HandleByValArgument(Type *ByValType, Value *Arg, // Check whether this Value is used by a lifetime intrinsic. static bool isUsedByLifetimeMarker(Value *V) { for (User *U : V->users()) - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) - if (II->isLifetimeStartOrEnd()) - return true; + if (isa<LifetimeIntrinsic>(U)) + return true; return false; } diff --git llvm/lib/Transforms/Utils/Local.cpp llvm/lib/Transforms/Utils/Local.cpp index d5cf62e52cca..2c6328300738 100644 --- llvm/lib/Transforms/Utils/Local.cpp +++ llvm/lib/Transforms/Utils/Local.cpp @@ -497,10 +497,7 @@ bool llvm::wouldInstructionBeTriviallyDead(const Instruction *I, // are lifetime intrinsics then the intrinsics are dead. if (isa<AllocaInst>(Arg) || isa<GlobalValue>(Arg) || isa<Argument>(Arg)) return llvm::all_of(Arg->uses(), [](Use &Use) { - if (IntrinsicInst *IntrinsicUse = - dyn_cast<IntrinsicInst>(Use.getUser())) - return IntrinsicUse->isLifetimeStartOrEnd(); - return false; + return isa<LifetimeIntrinsic>(Use.getUser()); }); return false; } diff --git llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp index cccb9dae17df..de84a76ede7f 100644 --- llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp +++ llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp @@ -154,9 +154,7 @@ void StackInfoBuilder::visit(OptimizationRemarkEmitter &ORE, } return; } - auto *II = dyn_cast<IntrinsicInst>(&Inst); - if (II && (II->getIntrinsicID() == Intrinsic::lifetime_start || - II->getIntrinsicID() == Intrinsic::lifetime_end)) { + if (auto *II = dyn_cast<LifetimeIntrinsic>(&Inst)) { AllocaInst *AI = findAllocaForValue(II->getArgOperand(1)); if (!AI) { Info.UnrecognizedLifetimes.push_back(&Inst); @@ -261,11 +259,6 @@ void alignAndPadAlloca(memtag::AllocaInfo &Info, llvm::Align Alignment) { Info.AI = NewAI; } -bool isLifetimeIntrinsic(Value *V) { - auto *II = dyn_cast<IntrinsicInst>(V); - return II && II->isLifetimeStartOrEnd(); -} - Value *readRegister(IRBuilder<> &IRB, StringRef Name) { Module *M = IRB.GetInsertBlock()->getParent()->getParent(); MDNode *MD = diff --git llvm/lib/Transforms/Utils/SimplifyCFG.cpp llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 12dd49da279b..27b7ec4629a2 100644 --- llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -2187,20 +2187,6 @@ bool SimplifyCFGOpt::hoistSuccIdenticalTerminatorToSwitchOrIf( return Changed; } -// Check lifetime markers. 
diff --git llvm/lib/Transforms/Utils/SimplifyCFG.cpp llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 12dd49da279b..27b7ec4629a2 100644
--- llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -2187,20 +2187,6 @@ bool SimplifyCFGOpt::hoistSuccIdenticalTerminatorToSwitchOrIf(
   return Changed;
 }
 
-// Check lifetime markers.
-static bool isLifeTimeMarker(const Instruction *I) {
-  if (auto II = dyn_cast<IntrinsicInst>(I)) {
-    switch (II->getIntrinsicID()) {
-    default:
-      break;
-    case Intrinsic::lifetime_start:
-    case Intrinsic::lifetime_end:
-      return true;
-    }
-  }
-  return false;
-}
-
 // TODO: Refine this. This should avoid cases like turning constant memcpy sizes
 // into variables.
 static bool replacingOperandWithVariableIsCheap(const Instruction *I,
@@ -2321,7 +2307,7 @@ static bool canSinkInstructions(
       // backend may handle such lifetimes incorrectly as well (#104776).
       // Don't sink lifetimes if it would introduce a phi on the pointer
       // argument.
-      if (isLifeTimeMarker(I0) && OI == 1 &&
+      if (isa<LifetimeIntrinsic>(I0) && OI == 1 &&
           any_of(Insts, [](const Instruction *I) {
             return isa<AllocaInst>(I->getOperand(1)->stripPointerCasts());
           }))
@@ -8175,8 +8161,8 @@ static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I, bool PtrValu
   if (C->isNullValue() || isa<UndefValue>(C)) {
     // Only look at the first use we can handle, avoid hurting compile time with
    // long uselists
-    auto FindUse = llvm::find_if(I->users(), [](auto *U) {
-      auto *Use = cast<Instruction>(U);
+    auto FindUse = llvm::find_if(I->uses(), [](auto &U) {
+      auto *Use = cast<Instruction>(U.getUser());
       // Change this list when we want to add new instructions.
       switch (Use->getOpcode()) {
       default:
@@ -8199,26 +8185,28 @@ static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I, bool PtrValu
         return true;
       }
     });
-    if (FindUse == I->user_end())
+    if (FindUse == I->use_end())
       return false;
-    auto *Use = cast<Instruction>(*FindUse);
-    // Bail out if Use is not in the same BB as I or Use == I or Use comes
-    // before I in the block. The latter two can be the case if Use is a
+    auto &Use = *FindUse;
+    auto *User = cast<Instruction>(Use.getUser());
+    // Bail out if User is not in the same BB as I or User == I or User comes
+    // before I in the block. The latter two can be the case if User is a
     // PHI node.
-    if (Use->getParent() != I->getParent() || Use == I || Use->comesBefore(I))
+    if (User->getParent() != I->getParent() || User == I ||
+        User->comesBefore(I))
       return false;
 
     // Now make sure that there are no instructions in between that can alter
     // control flow (eg. calls)
     auto InstrRange =
-        make_range(std::next(I->getIterator()), Use->getIterator());
+        make_range(std::next(I->getIterator()), User->getIterator());
     if (any_of(InstrRange, [](Instruction &I) {
           return !isGuaranteedToTransferExecutionToSuccessor(&I);
         }))
       return false;
 
     // Look through GEPs. A load from a GEP derived from NULL is still undefined
-    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Use))
+    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(User))
       if (GEP->getPointerOperand() == I) {
         // The current base address is null, there are four cases to consider:
         // getelementptr (TY, null, 0) -> null
@@ -8235,7 +8223,7 @@ static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I, bool PtrValu
     }
 
     // Look through return.
-    if (ReturnInst *Ret = dyn_cast<ReturnInst>(Use)) {
+    if (ReturnInst *Ret = dyn_cast<ReturnInst>(User)) {
       bool HasNoUndefAttr =
           Ret->getFunction()->hasRetAttribute(Attribute::NoUndef);
       // Return undefined to a noundef return value is undefined.
@@ -8249,56 +8237,45 @@ static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I, bool PtrValu
     }
 
     // Load from null is undefined.
-    if (LoadInst *LI = dyn_cast<LoadInst>(Use))
+    if (LoadInst *LI = dyn_cast<LoadInst>(User))
       if (!LI->isVolatile())
         return !NullPointerIsDefined(LI->getFunction(),
                                      LI->getPointerAddressSpace());
 
     // Store to null is undefined.
-    if (StoreInst *SI = dyn_cast<StoreInst>(Use))
+    if (StoreInst *SI = dyn_cast<StoreInst>(User))
       if (!SI->isVolatile())
         return (!NullPointerIsDefined(SI->getFunction(),
                                       SI->getPointerAddressSpace())) &&
                SI->getPointerOperand() == I;
 
     // llvm.assume(false/undef) always triggers immediate UB.
-    if (auto *Assume = dyn_cast<AssumeInst>(Use)) {
+    if (auto *Assume = dyn_cast<AssumeInst>(User)) {
       // Ignore assume operand bundles.
       if (I == Assume->getArgOperand(0))
         return true;
     }
 
-    if (auto *CB = dyn_cast<CallBase>(Use)) {
+    if (auto *CB = dyn_cast<CallBase>(User)) {
       if (C->isNullValue() && NullPointerIsDefined(CB->getFunction()))
         return false;
       // A call to null is undefined.
       if (CB->getCalledOperand() == I)
         return true;
 
-      if (C->isNullValue()) {
-        for (const llvm::Use &Arg : CB->args())
-          if (Arg == I) {
-            unsigned ArgIdx = CB->getArgOperandNo(&Arg);
-            if (CB->isPassingUndefUB(ArgIdx) &&
-                CB->paramHasAttr(ArgIdx, Attribute::NonNull)) {
-              // Passing null to a nonnnull+noundef argument is undefined.
-              return !PtrValueMayBeModified;
-            }
-          }
-      } else if (isa<UndefValue>(C)) {
+      if (CB->isArgOperand(&Use)) {
+        unsigned ArgIdx = CB->getArgOperandNo(&Use);
+        // Passing null to a nonnull+noundef argument is undefined.
+        if (C->isNullValue() && CB->isPassingUndefUB(ArgIdx) &&
+            CB->paramHasAttr(ArgIdx, Attribute::NonNull))
+          return !PtrValueMayBeModified;
         // Passing undef to a noundef argument is undefined.
-        for (const llvm::Use &Arg : CB->args())
-          if (Arg == I) {
-            unsigned ArgIdx = CB->getArgOperandNo(&Arg);
-            if (CB->isPassingUndefUB(ArgIdx)) {
-              // Passing undef to a noundef argument is undefined.
-              return true;
-            }
-          }
+        if (isa<UndefValue>(C) && CB->isPassingUndefUB(ArgIdx))
+          return true;
       }
     }
 
     // Div/Rem by zero is immediate UB
-    if (match(Use, m_BinOp(m_Value(), m_Specific(I))) && Use->isIntDivRem())
+    if (match(User, m_BinOp(m_Value(), m_Specific(I))) && User->isIntDivRem())
       return true;
   }
   return false;
diff --git llvm/lib/Transforms/Utils/SimplifyIndVar.cpp llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
index f05d32d980e5..e41a1adadfcc 100644
--- llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
+++ llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -205,12 +205,12 @@ bool SimplifyIndvar::makeIVComparisonInvariant(ICmpInst *ICmp,
   if (!Preheader)
     return false;
   unsigned IVOperIdx = 0;
-  ICmpInst::Predicate Pred = ICmp->getPredicate();
+  CmpPredicate Pred = ICmp->getCmpPredicate();
   if (IVOperand != ICmp->getOperand(0)) {
     // Swapped
     assert(IVOperand == ICmp->getOperand(1) && "Can't find IVOperand");
     IVOperIdx = 1;
-    Pred = ICmpInst::getSwappedPredicate(Pred);
+    Pred = ICmpInst::getSwappedCmpPredicate(Pred);
   }
 
   // Get the SCEVs for the ICmp operands (in the specific context of the
@@ -249,13 +249,13 @@ bool SimplifyIndvar::makeIVComparisonInvariant(ICmpInst *ICmp,
 void SimplifyIndvar::eliminateIVComparison(ICmpInst *ICmp,
                                            Instruction *IVOperand) {
   unsigned IVOperIdx = 0;
-  ICmpInst::Predicate Pred = ICmp->getPredicate();
+  CmpPredicate Pred = ICmp->getCmpPredicate();
   ICmpInst::Predicate OriginalPred = Pred;
   if (IVOperand != ICmp->getOperand(0)) {
     // Swapped
     assert(IVOperand == ICmp->getOperand(1) && "Can't find IVOperand");
     IVOperIdx = 1;
-    Pred = ICmpInst::getSwappedPredicate(Pred);
+    Pred = ICmpInst::getSwappedCmpPredicate(Pred);
   }
 
   // Get the SCEVs for the ICmp operands (in the specific context of the
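The `passingValueIsAlwaysUndefined` rewrite above switches from iterating `users()` to iterating `uses()`, so the code can ask the call which operand slot the use occupies instead of rescanning the argument list for a matching value. A minimal sketch of that idiom, not part of the patch (the helper name is hypothetical), assuming the LLVM headers:

    // Illustrative sketch only; assumes LLVM headers are available.
    #include "llvm/IR/InstrTypes.h"
    #include "llvm/IR/Value.h"
    #include <optional>

    using namespace llvm;

    // Given a value V passed to call CB, find the argument index it occupies.
    // Iterating uses() hands us the Use itself, so getArgOperandNo(&U) is a
    // direct lookup; with users() we would only have the CallBase and would
    // need to scan its arguments for V (ambiguous if V is passed twice).
    static std::optional<unsigned> findArgIndex(Value *V, CallBase *CB) {
      for (Use &U : V->uses())
        if (U.getUser() == CB && CB->isArgOperand(&U))
          return CB->getArgOperandNo(&U);
      return std::nullopt;
    }
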
diff --git llvm/lib/Transforms/Utils/VNCoercion.cpp llvm/lib/Transforms/Utils/VNCoercion.cpp
index 7a61ab741663..c1bce01239dc 100644
--- llvm/lib/Transforms/Utils/VNCoercion.cpp
+++ llvm/lib/Transforms/Utils/VNCoercion.cpp
@@ -15,30 +15,42 @@ static bool isFirstClassAggregateOrScalableType(Type *Ty) {
 
 /// Return true if coerceAvailableValueToLoadType will succeed.
 bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
-                                     const DataLayout &DL) {
+                                     Function *F) {
   Type *StoredTy = StoredVal->getType();
-
   if (StoredTy == LoadTy)
     return true;
 
+  const DataLayout &DL = F->getDataLayout();
+  TypeSize MinStoreSize = DL.getTypeSizeInBits(StoredTy);
+  TypeSize LoadSize = DL.getTypeSizeInBits(LoadTy);
   if (isa<ScalableVectorType>(StoredTy) && isa<ScalableVectorType>(LoadTy) &&
-      DL.getTypeSizeInBits(StoredTy) == DL.getTypeSizeInBits(LoadTy))
+      MinStoreSize == LoadSize)
     return true;
 
-  // If the loaded/stored value is a first class array/struct, or scalable type,
-  // don't try to transform them. We need to be able to bitcast to integer.
-  if (isFirstClassAggregateOrScalableType(LoadTy) ||
-      isFirstClassAggregateOrScalableType(StoredTy))
+  // If the loaded/stored value is a first class array/struct, don't try to
+  // transform them. We need to be able to bitcast to integer. For scalable
+  // vectors forwarded to fixed-sized vectors @llvm.vector.extract is used.
+  if (isa<ScalableVectorType>(StoredTy) && isa<FixedVectorType>(LoadTy)) {
+    if (StoredTy->getScalarType() != LoadTy->getScalarType())
+      return false;
+
+    // If it is known at compile-time that the VScale is larger than one,
+    // use that information to allow for wider loads.
+    const auto &Attrs = F->getAttributes().getFnAttrs();
+    unsigned MinVScale = Attrs.getVScaleRangeMin();
+    MinStoreSize =
+        TypeSize::getFixed(MinStoreSize.getKnownMinValue() * MinVScale);
+  } else if (isFirstClassAggregateOrScalableType(LoadTy) ||
+             isFirstClassAggregateOrScalableType(StoredTy)) {
     return false;
-
-  uint64_t StoreSize = DL.getTypeSizeInBits(StoredTy).getFixedValue();
+  }
 
   // The store size must be byte-aligned to support future type casts.
-  if (llvm::alignTo(StoreSize, 8) != StoreSize)
+  if (llvm::alignTo(MinStoreSize, 8) != MinStoreSize)
     return false;
 
   // The store has to be at least as big as the load.
-  if (StoreSize < DL.getTypeSizeInBits(LoadTy).getFixedValue())
+  if (!TypeSize::isKnownGE(MinStoreSize, LoadSize))
     return false;
 
   bool StoredNI = DL.isNonIntegralPointerType(StoredTy->getScalarType());
@@ -57,11 +69,10 @@ bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
     return false;
   }
 
-
   // The implementation below uses inttoptr for vectors of unequal size; we
   // can't allow this for non integral pointers. We could teach it to extract
   // exact subvectors if desired.
-  if (StoredNI && StoreSize != DL.getTypeSizeInBits(LoadTy).getFixedValue())
+  if (StoredNI && (StoredTy->isScalableTy() || MinStoreSize != LoadSize))
     return false;
 
   if (StoredTy->isTargetExtTy() || LoadTy->isTargetExtTy())
@@ -77,16 +88,24 @@ bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
 ///
 /// If we can't do it, return null.
 Value *coerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy,
-                                      IRBuilderBase &Helper,
-                                      const DataLayout &DL) {
-  assert(canCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, DL) &&
+                                      IRBuilderBase &Helper, Function *F) {
+  assert(canCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, F) &&
         "precondition violation - materialization can't fail");
+  const DataLayout &DL = F->getDataLayout();
   if (auto *C = dyn_cast<Constant>(StoredVal))
     StoredVal = ConstantFoldConstant(C, DL);
 
   // If this is already the right type, just return it.
   Type *StoredValTy = StoredVal->getType();
 
+  // If this is a scalable vector forwarded to a fixed vector load, create
+  // a @llvm.vector.extract instead of bitcasts.
+  if (isa<ScalableVectorType>(StoredVal->getType()) &&
+      isa<FixedVectorType>(LoadedTy)) {
+    return Helper.CreateIntrinsic(LoadedTy, Intrinsic::vector_extract,
+                                  {StoredVal, Helper.getInt64(0)});
+  }
+
   TypeSize StoredValSize = DL.getTypeSizeInBits(StoredValTy);
   TypeSize LoadedValSize = DL.getTypeSizeInBits(LoadedTy);
 
@@ -220,7 +239,7 @@ int analyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
   if (isFirstClassAggregateOrScalableType(StoredVal->getType()))
     return -1;
 
-  if (!canCoerceMustAliasedValueToLoad(StoredVal, LoadTy, DL))
+  if (!canCoerceMustAliasedValueToLoad(StoredVal, LoadTy, DepSI->getFunction()))
     return -1;
 
   Value *StorePtr = DepSI->getPointerOperand();
@@ -235,11 +254,11 @@ int analyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
 /// the other load can feed into the second load.
 int analyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr, LoadInst *DepLI,
                                   const DataLayout &DL) {
-  // Cannot handle reading from store of first-class aggregate yet.
-  if (DepLI->getType()->isStructTy() || DepLI->getType()->isArrayTy())
+  // Cannot handle reading from store of first-class aggregate or scalable type.
+  if (isFirstClassAggregateOrScalableType(DepLI->getType()))
     return -1;
 
-  if (!canCoerceMustAliasedValueToLoad(DepLI, LoadTy, DL))
+  if (!canCoerceMustAliasedValueToLoad(DepLI, LoadTy, DepLI->getFunction()))
     return -1;
 
   Value *DepPtr = DepLI->getPointerOperand();
@@ -315,6 +334,16 @@ static Value *getStoreValueForLoadHelper(Value *SrcVal, unsigned Offset,
     return SrcVal;
   }
 
+  // For the case of a scalable vector being forwarded to a fixed-sized load,
+  // only equal element types are allowed and a @llvm.vector.extract will be
+  // used instead of bitcasts.
+  if (isa<ScalableVectorType>(SrcVal->getType()) &&
+      isa<FixedVectorType>(LoadTy)) {
+    assert(Offset == 0 &&
+           SrcVal->getType()->getScalarType() == LoadTy->getScalarType());
+    return SrcVal;
+  }
+
   uint64_t StoreSize =
       (DL.getTypeSizeInBits(SrcVal->getType()).getFixedValue() + 7) / 8;
   uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy).getFixedValue() + 7) / 8;
@@ -344,20 +373,24 @@ static Value *getStoreValueForLoadHelper(Value *SrcVal, unsigned Offset,
 }
 
 Value *getValueForLoad(Value *SrcVal, unsigned Offset, Type *LoadTy,
-                       Instruction *InsertPt, const DataLayout &DL) {
+                       Instruction *InsertPt, Function *F) {
+  const DataLayout &DL = F->getDataLayout();
 #ifndef NDEBUG
-  TypeSize SrcValSize = DL.getTypeStoreSize(SrcVal->getType());
+  TypeSize MinSrcValSize = DL.getTypeStoreSize(SrcVal->getType());
   TypeSize LoadSize = DL.getTypeStoreSize(LoadTy);
-  assert(SrcValSize.isScalable() == LoadSize.isScalable());
-  assert((SrcValSize.isScalable() || Offset + LoadSize <= SrcValSize) &&
+  if (MinSrcValSize.isScalable() && !LoadSize.isScalable())
+    MinSrcValSize =
+        TypeSize::getFixed(MinSrcValSize.getKnownMinValue() *
+                           F->getAttributes().getFnAttrs().getVScaleRangeMin());
+  assert((MinSrcValSize.isScalable() || Offset + LoadSize <= MinSrcValSize) &&
          "Expected Offset + LoadSize <= SrcValSize");
-  assert(
-      (!SrcValSize.isScalable() || (Offset == 0 && LoadSize == SrcValSize)) &&
-      "Expected scalable type sizes to match");
+  assert((!MinSrcValSize.isScalable() ||
+          (Offset == 0 && TypeSize::isKnownLE(LoadSize, MinSrcValSize))) &&
+         "Expected offset of zero and LoadSize <= SrcValSize");
 #endif
   IRBuilder<> Builder(InsertPt);
   SrcVal = getStoreValueForLoadHelper(SrcVal, Offset, LoadTy, Builder, DL);
-  return coerceAvailableValueToLoadType(SrcVal, LoadTy, Builder, DL);
+  return coerceAvailableValueToLoadType(SrcVal, LoadTy, Builder, F);
 }
 
 Constant *getConstantValueForLoad(Constant *SrcVal, unsigned Offset,
@@ -408,7 +441,8 @@ Value *getMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
       ++NumBytesSet;
     }
 
-    return coerceAvailableValueToLoadType(Val, LoadTy, Builder, DL);
+    return coerceAvailableValueToLoadType(Val, LoadTy, Builder,
+                                          InsertPt->getFunction());
   }
 
   // Otherwise, this is a memcpy/memmove from a constant global.
diff --git llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index bc44ec11edb7..ed3e45dd2c6c 100644
--- llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -40,6 +40,7 @@ class OptimizationRemarkEmitter;
 class TargetTransformInfo;
 class TargetLibraryInfo;
 class VPRecipeBuilder;
+struct VFRange;
 
 /// VPlan-based builder utility analogous to IRBuilder.
 class VPBuilder {
@@ -140,6 +141,9 @@ public:
     InsertPt = IP->getIterator();
   }
 
+  /// Insert \p R at the current insertion point.
+  void insert(VPRecipeBase *R) { BB->insert(R, InsertPt); }
+
   /// Create an N-ary operation with \p Opcode, \p Operands and set \p Inst as
   /// its underlying Instruction.
   VPInstruction *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
diff --git llvm/lib/Transforms/Vectorize/LoopVectorize.cpp llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 5375d2be9c87..660a6ef57457 100644
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -59,6 +59,7 @@
 #include "VPlan.h"
 #include "VPlanAnalysis.h"
 #include "VPlanHCFGBuilder.h"
+#include "VPlanHelpers.h"
 #include "VPlanPatternMatch.h"
 #include "VPlanTransforms.h"
 #include "VPlanUtils.h"
@@ -988,7 +989,10 @@ public:
                                InterleavedAccessInfo &IAI)
       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
-        Hints(Hints), InterleaveInfo(IAI), CostKind(TTI::TCK_RecipThroughput) {}
+        Hints(Hints), InterleaveInfo(IAI), CostKind(TTI::TCK_RecipThroughput) {
+    if (TTI.supportsScalableVectors() || ForceTargetSupportsScalableVectors)
+      initializeVScaleForTuning();
+  }
 
   /// \return An upper bound for the vectorization factors (both fixed and
   /// scalable). If the factors are 0, vectorization and interleaving should be
@@ -1564,9 +1568,34 @@ public:
   /// trivially hoistable.
   bool shouldConsiderInvariant(Value *Op);
 
+  /// Return the value of vscale used for tuning the cost model.
+  std::optional<unsigned> getVScaleForTuning() const { return VScaleForTuning; }
+
 private:
   unsigned NumPredStores = 0;
 
+  /// Used to store the value of vscale used for tuning the cost model. It is
+  /// initialized during object construction.
+  std::optional<unsigned> VScaleForTuning;
+
+  /// Initializes the value of vscale used for tuning the cost model. If
+  /// vscale_range.min == vscale_range.max then return vscale_range.max, else
+  /// return the value returned by the corresponding TTI method.
+  void initializeVScaleForTuning() {
+    const Function *Fn = TheLoop->getHeader()->getParent();
+    if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
+      auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
+      auto Min = Attr.getVScaleRangeMin();
+      auto Max = Attr.getVScaleRangeMax();
+      if (Max && Min == Max) {
+        VScaleForTuning = Max;
+        return;
+      }
+    }
+
+    VScaleForTuning = TTI.getVScaleForTuning();
+  }
+
   /// \return An upper bound for the vectorization factors for both
   /// fixed and scalable vectorization, where the minimum-known number of
   /// elements is a power-of-2 larger than zero. If scalable vectorization is
@@ -4241,33 +4270,15 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
   return MaxVF;
 }
 
-/// Convenience function that returns the value of vscale_range iff
-/// vscale_range.min == vscale_range.max or otherwise returns the value
-/// returned by the corresponding TTI method.
-static std::optional<unsigned>
-getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {
-  const Function *Fn = L->getHeader()->getParent();
-  if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
-    auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
-    auto Min = Attr.getVScaleRangeMin();
-    auto Max = Attr.getVScaleRangeMax();
-    if (Max && Min == Max)
-      return Max;
-  }
-
-  return TTI.getVScaleForTuning();
-}
-
 /// This function attempts to return a value that represents the vectorization
 /// factor at runtime. For fixed-width VFs we know this precisely at compile
 /// time, but for scalable VFs we calculate it based on an estimate of the
 /// vscale value.
-static unsigned getEstimatedRuntimeVF(const Loop *L,
-                                      const TargetTransformInfo &TTI,
-                                      ElementCount VF) {
+static unsigned getEstimatedRuntimeVF(ElementCount VF,
+                                      std::optional<unsigned> VScale) {
   unsigned EstimatedVF = VF.getKnownMinValue();
   if (VF.isScalable())
-    if (std::optional<unsigned> VScale = getVScaleForTuning(L, TTI))
+    if (VScale)
       EstimatedVF *= *VScale;
   assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
   return EstimatedVF;
@@ -4282,7 +4293,7 @@ bool LoopVectorizationPlanner::isMoreProfitable(
   // Improve estimate for the vector width if it is scalable.
   unsigned EstimatedWidthA = A.Width.getKnownMinValue();
   unsigned EstimatedWidthB = B.Width.getKnownMinValue();
-  if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) {
+  if (std::optional<unsigned> VScale = CM.getVScaleForTuning()) {
     if (A.Width.isScalable())
       EstimatedWidthA *= *VScale;
     if (B.Width.isScalable())
@@ -4575,13 +4586,13 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
 
     InstructionCost C = CM.expectedCost(VF);
     VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
-    unsigned Width = getEstimatedRuntimeVF(OrigLoop, TTI, Candidate.Width);
+    unsigned Width =
+        getEstimatedRuntimeVF(Candidate.Width, CM.getVScaleForTuning());
     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
                       << " costs: " << (Candidate.Cost / Width));
     if (VF.isScalable())
       LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
-                        << getVScaleForTuning(OrigLoop, TTI).value_or(1)
-                        << ")");
+                        << CM.getVScaleForTuning().value_or(1) << ")");
     LLVM_DEBUG(dbgs() << ".\n");
 
     if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
@@ -4670,7 +4681,8 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
   unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
                                 ? EpilogueVectorizationMinVF
                                 : TTI.getEpilogueVectorizationMinVF();
-  return getEstimatedRuntimeVF(TheLoop, TTI, VF * Multiplier) >= MinVFThreshold;
+  return getEstimatedRuntimeVF(VF * Multiplier, VScaleForTuning) >=
+         MinVFThreshold;
 }
 
 VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
@@ -4722,8 +4734,8 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
   // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
   // the main loop handles 8 lanes per iteration. We could still benefit from
   // vectorizing the epilogue loop with VF=4.
-  ElementCount EstimatedRuntimeVF =
-      ElementCount::getFixed(getEstimatedRuntimeVF(OrigLoop, TTI, MainLoopVF));
+  ElementCount EstimatedRuntimeVF = ElementCount::getFixed(
+      getEstimatedRuntimeVF(MainLoopVF, CM.getVScaleForTuning()));
 
   ScalarEvolution &SE = *PSE.getSE();
   Type *TCType = Legal->getWidestInductionType();
@@ -4969,7 +4981,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
     MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
   }
 
-  unsigned EstimatedVF = getEstimatedRuntimeVF(TheLoop, TTI, VF);
+  unsigned EstimatedVF = getEstimatedRuntimeVF(VF, VScaleForTuning);
 
   unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
   if (KnownTC > 0) {
     // At least one iteration must be scalar when this constraint holds. So the
@@ -6625,8 +6637,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
     // fold away. We can generalize this for all operations using the notion
     // of neutral elements. (TODO)
     if (I->getOpcode() == Instruction::Mul &&
-        (PSE.getSCEV(I->getOperand(0))->isOne() ||
-         PSE.getSCEV(I->getOperand(1))->isOne()))
+        ((TheLoop->isLoopInvariant(I->getOperand(0)) &&
+          PSE.getSCEV(I->getOperand(0))->isOne()) ||
+         (TheLoop->isLoopInvariant(I->getOperand(1)) &&
+          PSE.getSCEV(I->getOperand(1))->isOne())))
       return 0;
 
     // Detect reduction patterns
@@ -7398,7 +7412,7 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
   // Now compute and add the VPlan-based cost.
   Cost += Plan.cost(VF, CostCtx);
 #ifndef NDEBUG
-  unsigned EstimatedWidth = getEstimatedRuntimeVF(OrigLoop, CM.TTI, VF);
+  unsigned EstimatedWidth = getEstimatedRuntimeVF(VF, CM.getVScaleForTuning());
   LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
                     << " (Estimated cost per lane: ");
   if (Cost.isValid()) {
@@ -8309,7 +8323,7 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
                                             : GEPNoWrapFlags::none(),
           I->getDebugLoc());
     }
-    Builder.getInsertBlock()->appendRecipe(VectorPtr);
+    Builder.insert(VectorPtr);
     Ptr = VectorPtr;
   }
   if (LoadInst *Load = dyn_cast<LoadInst>(I))
@@ -8576,6 +8590,8 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
     // to replace operands with constants.
     ScalarEvolution &SE = *PSE.getSE();
     auto GetConstantViaSCEV = [this, &SE](VPValue *Op) {
+      if (!Op->isLiveIn())
+        return Op;
       Value *V = Op->getUnderlyingValue();
       if (isa<Constant>(V) || !SE.isSCEVable(V->getType()))
         return Op;
@@ -8628,8 +8644,9 @@ void VPRecipeBuilder::fixHeaderPhis() {
   }
 }
 
-VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I,
-                                                      VFRange &Range) {
+VPReplicateRecipe *
+VPRecipeBuilder::handleReplication(Instruction *I, ArrayRef<VPValue *> Operands,
+                                   VFRange &Range) {
   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
       Range);
@@ -8685,8 +8702,8 @@ VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I,
   assert((Range.Start.isScalar() || !IsUniform || !IsPredicated ||
           (Range.Start.isScalable() && isa<IntrinsicInst>(I))) &&
          "Should not predicate a uniform recipe");
-  auto *Recipe = new VPReplicateRecipe(I, mapToVPValues(I->operands()),
-                                       IsUniform, BlockInMask);
+  auto *Recipe = new VPReplicateRecipe(
+      I, make_range(Operands.begin(), Operands.end()), IsUniform, BlockInMask);
   return Recipe;
 }
 
@@ -9349,7 +9366,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
         if (!Legal->isInvariantStoreOfReduction(SI))
           continue;
         auto *Recipe = new VPReplicateRecipe(
-            SI, RecipeBuilder.mapToVPValues(Instr->operands()),
+            SI, make_range(Operands.begin(), Operands.end()),
            true /* IsUniform */);
         Recipe->insertBefore(*MiddleVPBB, MBIP);
         continue;
@@ -9358,7 +9375,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
       VPRecipeBase *Recipe =
          RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB);
       if (!Recipe)
-        Recipe = RecipeBuilder.handleReplication(Instr, Range);
+        Recipe = RecipeBuilder.handleReplication(Instr, Operands, Range);
 
       RecipeBuilder.setRecipe(Instr, Recipe);
       if (isa<VPHeaderPHIRecipe>(Recipe)) {
@@ -9407,14 +9424,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
 
   if (auto *UncountableExitingBlock =
           Legal->getUncountableEarlyExitingBlock()) {
-    if (!VPlanTransforms::handleUncountableEarlyExit(
-            *Plan, *PSE.getSE(), OrigLoop, UncountableExitingBlock,
-            RecipeBuilder)) {
-      reportVectorizationFailure(
-          "Some exit values in loop with uncountable exit not supported yet",
-          "UncountableEarlyExitLoopsUnsupportedExitValue", ORE, OrigLoop);
-      return nullptr;
-    }
+    VPlanTransforms::runPass(VPlanTransforms::handleUncountableEarlyExit, *Plan,
+                             *PSE.getSE(), OrigLoop, UncountableExitingBlock,
+                             RecipeBuilder);
   }
   DenseMap<VPValue *, VPValue *> IVEndValues;
   addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues);
@@ -10066,9 +10078,9 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
 
 static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
                                        VectorizationFactor &VF, Loop *L,
-                                       const TargetTransformInfo &TTI,
                                        PredicatedScalarEvolution &PSE,
-                                       ScalarEpilogueLowering SEL) {
+                                       ScalarEpilogueLowering SEL,
+                                       std::optional<unsigned> VScale) {
   InstructionCost CheckCost = Checks.getCost();
   if (!CheckCost.isValid())
     return false;
@@ -10118,7 +10130,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
   // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
   // the computations are performed on doubles, not integers and the result
   // is rounded up, hence we get an upper estimate of the TC.
-  unsigned IntVF = getEstimatedRuntimeVF(L, TTI, VF.Width);
+  unsigned IntVF = getEstimatedRuntimeVF(VF.Width, VScale);
   uint64_t RtC = *CheckCost.getValue();
   uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue();
   uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
@@ -10555,7 +10567,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
       bool ForceVectorization =
          Hints.getForce() == LoopVectorizeHints::FK_Enabled;
       if (!ForceVectorization &&
-          !areRuntimeChecksProfitable(Checks, VF, L, *TTI, PSE, SEL)) {
+          !areRuntimeChecksProfitable(Checks, VF, L, PSE, SEL,
+                                      CM.getVScaleForTuning())) {
         ORE->emit([&]() {
           return OptimizationRemarkAnalysisAliasing(
               DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
diff --git llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 5c02bc7bfa90..539c9227af7e 100644
--- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1395,7 +1395,7 @@ public:
 
   /// \returns the cost incurred by unwanted spills and fills, caused by
   /// holding live values over call sites.
-  InstructionCost getSpillCost() const;
+  InstructionCost getSpillCost();
 
   /// \returns the vectorization cost of the subtree that starts at \p VL.
   /// A negative number means that this is profitable.
@@ -2958,7 +2958,7 @@ public:
   }
 
   /// Check if the value is vectorized in the tree.
-  bool isVectorized(Value *V) const {
+  bool isVectorized(const Value *V) const {
     assert(V && "V cannot be nullptr.");
     return ScalarToTreeEntries.contains(V);
   }
@@ -10354,7 +10354,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
       SameNodesEstimated = false;
       if (!E2 && InVectors.size() == 1) {
         unsigned VF = E1.getVectorFactor();
-        if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
+        if (Value *V1 = dyn_cast<Value *>(InVectors.front())) {
           VF = std::max(VF,
                         cast<FixedVectorType>(V1->getType())->getNumElements());
         } else {
@@ -10370,7 +10370,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
       auto P = InVectors.front();
       Cost += createShuffle(&E1, E2, Mask);
       unsigned VF = Mask.size();
-      if (Value *V1 = P.dyn_cast<Value *>()) {
+      if (Value *V1 = dyn_cast<Value *>(P)) {
         VF = std::max(VF, getNumElements(V1->getType()));
       } else {
@@ -12160,16 +12160,15 @@ bool BoUpSLP::isTreeNotExtendable() const {
   return Res;
 }
 
-InstructionCost BoUpSLP::getSpillCost() const {
+InstructionCost BoUpSLP::getSpillCost() {
   // Walk from the bottom of the tree to the top, tracking which values are
   // live. When we see a call instruction that is not part of our tree,
   // query TTI to see if there is a cost to keeping values live over it
   // (for example, if spills and fills are required).
-  unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
   InstructionCost Cost = 0;
 
-  SmallPtrSet<Instruction *, 4> LiveValues;
-  Instruction *PrevInst = nullptr;
+  SmallPtrSet<const TreeEntry *, 4> LiveEntries;
+  const TreeEntry *Prev = nullptr;
 
   // The entries in VectorizableTree are not necessarily ordered by their
   // position in basic blocks. Collect them and order them by dominance so later
@@ -12177,101 +12176,100 @@
   // different basic blocks, we only scan to the beginning of the block, so
   // their order does not matter, as long as all instructions in a basic block
   // are grouped together. Using dominance ensures a deterministic order.
-  SmallVector<Instruction *, 16> OrderedScalars;
+  SmallVector<TreeEntry *, 16> OrderedEntries;
   for (const auto &TEPtr : VectorizableTree) {
-    if (TEPtr->State != TreeEntry::Vectorize)
+    if (TEPtr->isGather())
       continue;
-    Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
-    if (!Inst)
-      continue;
-    OrderedScalars.push_back(Inst);
-  }
-  llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) {
-    auto *NodeA = DT->getNode(A->getParent());
-    auto *NodeB = DT->getNode(B->getParent());
+    OrderedEntries.push_back(TEPtr.get());
+  }
+  llvm::stable_sort(OrderedEntries, [&](const TreeEntry *TA,
+                                        const TreeEntry *TB) {
+    Instruction &A = getLastInstructionInBundle(TA);
+    Instruction &B = getLastInstructionInBundle(TB);
+    auto *NodeA = DT->getNode(A.getParent());
+    auto *NodeB = DT->getNode(B.getParent());
     assert(NodeA && "Should only process reachable instructions");
     assert(NodeB && "Should only process reachable instructions");
     assert((NodeA == NodeB) ==
               (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
     if (NodeA != NodeB)
      return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
-    return B->comesBefore(A);
+    return B.comesBefore(&A);
   });
 
-  for (Instruction *Inst : OrderedScalars) {
-    if (!PrevInst) {
-      PrevInst = Inst;
+  for (const TreeEntry *TE : OrderedEntries) {
+    if (!Prev) {
+      Prev = TE;
       continue;
     }
 
     // Update LiveValues.
-    LiveValues.erase(PrevInst);
-    for (auto &J : PrevInst->operands()) {
-      if (isa<Instruction>(&*J) && isVectorized(&*J))
-        LiveValues.insert(cast<Instruction>(&*J));
+    LiveEntries.erase(Prev);
+    for (unsigned I : seq<unsigned>(Prev->getNumOperands())) {
+      const TreeEntry *Op = getVectorizedOperand(Prev, I);
+      if (!Op)
+        continue;
+      assert(!Op->isGather() && "Expected vectorized operand.");
+      LiveEntries.insert(Op);
     }
 
     LLVM_DEBUG({
-      dbgs() << "SLP: #LV: " << LiveValues.size();
-      for (auto *X : LiveValues)
-        dbgs() << " " << X->getName();
+      dbgs() << "SLP: #LV: " << LiveEntries.size();
+      for (auto *X : LiveEntries)
+        X->dump();
       dbgs() << ", Looking at ";
-      Inst->dump();
+      TE->dump();
     });
 
     // Now find the sequence of instructions between PrevInst and Inst.
     unsigned NumCalls = 0;
-    BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
-                                 PrevInstIt =
-                                     PrevInst->getIterator().getReverse();
+    const Instruction *PrevInst = &getLastInstructionInBundle(Prev);
+    BasicBlock::const_reverse_iterator
+        InstIt = ++getLastInstructionInBundle(TE).getIterator().getReverse(),
+        PrevInstIt = PrevInst->getIterator().getReverse();
     while (InstIt != PrevInstIt) {
       if (PrevInstIt == PrevInst->getParent()->rend()) {
-        PrevInstIt = Inst->getParent()->rbegin();
+        PrevInstIt = getLastInstructionInBundle(TE).getParent()->rbegin();
         continue;
       }
 
-      auto NoCallIntrinsic = [this](Instruction *I) {
-        if (auto *II = dyn_cast<IntrinsicInst>(I)) {
-          if (II->isAssumeLikeIntrinsic())
-            return true;
-          FastMathFlags FMF;
-          SmallVector<Type *, 4> Tys;
-          for (auto &ArgOp : II->args())
-            Tys.push_back(ArgOp->getType());
-          if (auto *FPMO = dyn_cast<FPMathOperator>(II))
-            FMF = FPMO->getFastMathFlags();
-          IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys,
-                                      FMF);
-          InstructionCost IntrCost =
-              TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
-          InstructionCost CallCost = TTI->getCallInstrCost(
-              nullptr, II->getType(), Tys, TTI::TCK_RecipThroughput);
-          if (IntrCost < CallCost)
-            return true;
-        }
-        return false;
+      auto NoCallIntrinsic = [this](const Instruction *I) {
+        const auto *II = dyn_cast<IntrinsicInst>(I);
+        if (!II)
+          return false;
+        if (II->isAssumeLikeIntrinsic())
+          return true;
+        IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
+        InstructionCost IntrCost =
+            TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
+        InstructionCost CallCost =
+            TTI->getCallInstrCost(nullptr, II->getType(), ICA.getArgTypes(),
+                                  TTI::TCK_RecipThroughput);
+        return IntrCost < CallCost;
       };
 
       // Debug information does not impact spill cost.
-      if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
-          &*PrevInstIt != PrevInst)
+      // Vectorized calls, represented as vector intrinsics, do not impact
+      // spill cost.
+      if (const auto *CB = dyn_cast<CallBase>(&*PrevInstIt);
+          CB && !NoCallIntrinsic(CB) && !isVectorized(CB))
        NumCalls++;
 
       ++PrevInstIt;
     }
 
     if (NumCalls) {
-      SmallVector<Type *, 4> V;
-      for (auto *II : LiveValues) {
-        auto *ScalarTy = II->getType();
-        if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
-          ScalarTy = VectorTy->getElementType();
-        V.push_back(getWidenedType(ScalarTy, BundleWidth));
+      SmallVector<Type *, 4> EntriesTypes;
+      for (const TreeEntry *TE : LiveEntries) {
+        auto *ScalarTy = TE->getMainOp()->getType();
+        auto It = MinBWs.find(TE);
+        if (It != MinBWs.end())
+          ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
+        EntriesTypes.push_back(getWidenedType(ScalarTy, TE->getVectorFactor()));
       }
-      Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
+      Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(EntriesTypes);
     }
 
-    PrevInst = Inst;
+    Prev = TE;
   }
 
   return Cost;
@@ -15090,7 +15088,9 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
       }
     }
     if (!GatherShuffles.empty()) {
-      unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
+      unsigned SliceSize =
+          getPartNumElems(E->Scalars.size(),
+                          ::getNumberOfParts(*TTI, VecTy, E->Scalars.size()));
       SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
       for (const auto [I, TEs] : enumerate(Entries)) {
         if (TEs.empty()) {
@@ -20147,7 +20147,7 @@ public:
     }
     V.reorderTopToBottom();
     // No need to reorder the root node at all.
-    V.reorderBottomToTop(/*IgnoreReorder=*/true);
+    V.reorderBottomToTop(!V.doesRootHaveInTreeUses());
     // Keep extracted other reduction values, if they are used in the
     // vectorization trees.
     BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
diff --git llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 87c97d1edd7b..e81247c98568 100644
--- llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -23,6 +23,7 @@ class LoopVectorizationCostModel;
 class TargetLibraryInfo;
 class TargetTransformInfo;
 struct HistogramInfo;
+struct VFRange;
 
 /// A chain of instructions that form a partial reduction.
 /// Designed to match: reduction_bin_op (bin_op (extend (A), (extend (B))),
@@ -218,10 +219,12 @@ public:
     return Ingredient2Recipe[I];
   }
 
-  /// Build a VPReplicationRecipe for \p I. If it is predicated, add the mask as
-  /// last operand. Range.End may be decreased to ensure same recipe behavior
-  /// from \p Range.Start to \p Range.End.
-  VPReplicateRecipe *handleReplication(Instruction *I, VFRange &Range);
+  /// Build a VPReplicationRecipe for \p I using \p Operands. If it is
+  /// predicated, add the mask as last operand. Range.End may be decreased to
+  /// ensure same recipe behavior from \p Range.Start to \p Range.End.
+  VPReplicateRecipe *handleReplication(Instruction *I,
+                                       ArrayRef<VPValue *> Operands,
+                                       VFRange &Range);
 
   /// Add the incoming values from the backedge to reduction & first-order
   /// recurrence cross-iteration phis.
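The `getSpillCost` rewrite in the SLPVectorizer hunks above walks the vectorizable bundles bottom-up, maintains the set of live tree entries, and charges a spill-and-fill cost for every genuine call crossed between bundles. A deliberately simplified toy model of that accounting, not the SLP implementation (all types here are hypothetical):

    // Toy model of spill-cost accounting; plain C++, no LLVM types.
    #include <cstdint>
    #include <unordered_set>
    #include <vector>

    // One step of a bottom-up walk: some calls are crossed to reach this
    // bundle, the bundle's own value dies here, and its operands become live.
    struct Step {
      unsigned callsCrossed;     // calls between this bundle and the previous
      int defined;               // entry defined here (dies past this point)
      std::vector<int> operands; // entries that become live above this bundle
    };

    uint64_t estimateSpillCost(const std::vector<Step> &bottomUpWalk,
                               uint64_t costPerLiveValue) {
      std::unordered_set<int> live;
      uint64_t cost = 0;
      for (const Step &s : bottomUpWalk) {
        live.erase(s.defined);
        for (int op : s.operands)
          live.insert(op);
        // Every call crossed spills and refills each currently-live value.
        cost += uint64_t(s.callsCrossed) * live.size() * costPerLiveValue;
      }
      return cost;
    }
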
diff --git llvm/lib/Transforms/Vectorize/VPlan.cpp llvm/lib/Transforms/Vectorize/VPlan.cpp
index 52d15c535276..5a88ebeffb18 100644
--- llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -19,6 +19,7 @@
 #include "VPlan.h"
 #include "LoopVectorizationPlanner.h"
 #include "VPlanCFG.h"
+#include "VPlanHelpers.h"
 #include "VPlanPatternMatch.h"
 #include "VPlanTransforms.h"
 #include "VPlanUtils.h"
@@ -400,8 +401,8 @@ void VPTransformState::packScalarIntoVectorValue(VPValue *Def,
   set(Def, VectorValue);
 }
 
-BasicBlock *
-VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) {
+BasicBlock *VPBasicBlock::createEmptyBasicBlock(VPTransformState &State) {
+  auto &CFG = State.CFG;
   // BB stands for IR BasicBlocks. VPBB stands for VPlan VPBasicBlocks.
   // Pred stands for Predessor. Prev stands for Previous - last visited/created.
   BasicBlock *PrevBB = CFG.PrevBB;
@@ -412,7 +413,8 @@ VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) {
   return NewBB;
 }
 
-void VPBasicBlock::connectToPredecessors(VPTransformState::CFGState &CFG) {
+void VPBasicBlock::connectToPredecessors(VPTransformState &State) {
+  auto &CFG = State.CFG;
   BasicBlock *NewBB = CFG.VPBB2IRBB[this];
   // Hook up the new basic block to its predecessors.
   for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) {
@@ -467,7 +469,7 @@ void VPIRBasicBlock::execute(VPTransformState *State) {
           "other blocks must be terminated by a branch");
   }
 
-  connectToPredecessors(State->CFG);
+  connectToPredecessors(*State);
 }
 
 VPIRBasicBlock *VPIRBasicBlock::clone() {
@@ -494,20 +496,27 @@ void VPBasicBlock::execute(VPTransformState *State) {
     // * the exit of a replicate region.
     State->CFG.VPBB2IRBB[this] = NewBB;
   } else {
-    NewBB = createEmptyBasicBlock(State->CFG);
+    NewBB = createEmptyBasicBlock(*State);
 
     State->Builder.SetInsertPoint(NewBB);
     // Temporarily terminate with unreachable until CFG is rewired.
     UnreachableInst *Terminator = State->Builder.CreateUnreachable();
     // Register NewBB in its loop. In innermost loops its the same for all
     // BB's.
-    if (State->CurrentParentLoop)
-      State->CurrentParentLoop->addBasicBlockToLoop(NewBB, *State->LI);
+    Loop *ParentLoop = State->CurrentParentLoop;
+    // If this block has a sole successor that is an exit block then it needs
+    // adding to the same parent loop as the exit block.
+    VPBlockBase *SuccVPBB = getSingleSuccessor();
+    if (SuccVPBB && State->Plan->isExitBlock(SuccVPBB))
+      ParentLoop = State->LI->getLoopFor(
+          cast<VPIRBasicBlock>(SuccVPBB)->getIRBasicBlock());
+    if (ParentLoop)
+      ParentLoop->addBasicBlockToLoop(NewBB, *State->LI);
     State->Builder.SetInsertPoint(Terminator);
 
     State->CFG.PrevBB = NewBB;
     State->CFG.VPBB2IRBB[this] = NewBB;
-    connectToPredecessors(State->CFG);
+    connectToPredecessors(*State);
   }
 
   // 2. Fill the IR basic block with IR instructions.
@@ -616,6 +625,11 @@ bool VPBasicBlock::isExiting() const {
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPBlockBase::print(raw_ostream &O) const {
+  VPSlotTracker SlotTracker(getPlan());
+  print(O, "", SlotTracker);
+}
+
 void VPBlockBase::printSuccessors(raw_ostream &O, const Twine &Indent) const {
   if (getSuccessors().empty()) {
     O << Indent << "No successors\n";
@@ -950,6 +964,10 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
   }
 }
 
+bool VPlan::isExitBlock(VPBlockBase *VPBB) {
+  return isa<VPIRBasicBlock>(VPBB) && VPBB->getNumSuccessors() == 0;
+}
+
 /// Generate the code inside the preheader and body of the vectorized loop.
 /// Assumes a single pre-header basic-block was created for this. Introduce
 /// additional basic-blocks as needed, and fill them all.
@@ -1460,58 +1478,6 @@ void VPUser::printOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const {
 }
 #endif
 
-void VPInterleavedAccessInfo::visitRegion(VPRegionBlock *Region,
-                                          Old2NewTy &Old2New,
-                                          InterleavedAccessInfo &IAI) {
-  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>>
-      RPOT(Region->getEntry());
-  for (VPBlockBase *Base : RPOT) {
-    visitBlock(Base, Old2New, IAI);
-  }
-}
-
-void VPInterleavedAccessInfo::visitBlock(VPBlockBase *Block, Old2NewTy &Old2New,
-                                         InterleavedAccessInfo &IAI) {
-  if (VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(Block)) {
-    for (VPRecipeBase &VPI : *VPBB) {
-      if (isa<VPWidenPHIRecipe>(&VPI))
-        continue;
-      assert(isa<VPInstruction>(&VPI) && "Can only handle VPInstructions");
-      auto *VPInst = cast<VPInstruction>(&VPI);
-
-      auto *Inst = dyn_cast_or_null<Instruction>(VPInst->getUnderlyingValue());
-      if (!Inst)
-        continue;
-      auto *IG = IAI.getInterleaveGroup(Inst);
-      if (!IG)
-        continue;
-
-      auto NewIGIter = Old2New.find(IG);
-      if (NewIGIter == Old2New.end())
-        Old2New[IG] = new InterleaveGroup<VPInstruction>(
-            IG->getFactor(), IG->isReverse(), IG->getAlign());
-
-      if (Inst == IG->getInsertPos())
-        Old2New[IG]->setInsertPos(VPInst);
-
-      InterleaveGroupMap[VPInst] = Old2New[IG];
-      InterleaveGroupMap[VPInst]->insertMember(
-          VPInst, IG->getIndex(Inst),
-          Align(IG->isReverse() ? (-1) * int(IG->getFactor())
-                                : IG->getFactor()));
-    }
-  } else if (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
-    visitRegion(Region, Old2New, IAI);
-  else
-    llvm_unreachable("Unsupported kind of VPBlock.");
-}
-
-VPInterleavedAccessInfo::VPInterleavedAccessInfo(VPlan &Plan,
-                                                 InterleavedAccessInfo &IAI) {
-  Old2NewTy Old2New;
-  visitRegion(Plan.getVectorLoopRegion(), Old2New, IAI);
-}
-
 void VPSlotTracker::assignName(const VPValue *V) {
   assert(!VPValue2Name.contains(V) && "VPValue already has a name!");
   auto *UV = V->getUnderlyingValue();
diff --git llvm/lib/Transforms/Vectorize/VPlan.h llvm/lib/Transforms/Vectorize/VPlan.h
index a1ff684b2b80..fac207287e0b 100644
--- llvm/lib/Transforms/Vectorize/VPlan.h
+++ llvm/lib/Transforms/Vectorize/VPlan.h
@@ -17,7 +17,6 @@
 /// 4. VPInstruction, a concrete Recipe and VPUser modeling a single planned
 ///    instruction;
 /// 5. The VPlan class holding a candidate for vectorization;
-/// 6. The VPlanPrinter class providing a way to print a plan in dot format;
 /// These are documented in docs/VectorizationPlan.rst.
// //===----------------------------------------------------------------------===// @@ -34,10 +33,7 @@ #include "llvm/ADT/Twine.h" #include "llvm/ADT/ilist.h" #include "llvm/ADT/ilist_node.h" -#include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/IVDescriptors.h" -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/FMF.h" @@ -54,7 +50,7 @@ class BasicBlock; class DominatorTree; class InnerLoopVectorizer; class IRBuilderBase; -class LoopInfo; +struct VPTransformState; class raw_ostream; class RecurrenceDescriptor; class SCEV; @@ -63,11 +59,11 @@ class VPBasicBlock; class VPBuilder; class VPRegionBlock; class VPlan; +class VPLane; class VPReplicateRecipe; class VPlanSlp; class Value; class LoopVectorizationCostModel; -class LoopVersioning; struct VPCostContext; @@ -75,318 +71,8 @@ namespace Intrinsic { typedef unsigned ID; } -/// Returns a calculation for the total number of elements for a given \p VF. -/// For fixed width vectors this value is a constant, whereas for scalable -/// vectors it is an expression determined at runtime. -Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF); - -/// Return a value for Step multiplied by VF. -Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, - int64_t Step); - -/// A helper function that returns the reciprocal of the block probability of -/// predicated blocks. If we return X, we are assuming the predicated block -/// will execute once for every X iterations of the loop header. -/// -/// TODO: We should use actual block probability here, if available. Currently, -/// we always assume predicated blocks have a 50% chance of executing. -inline unsigned getReciprocalPredBlockProb() { return 2; } - -/// A range of powers-of-2 vectorization factors with fixed start and -/// adjustable end. The range includes start and excludes end, e.g.,: -/// [1, 16) = {1, 2, 4, 8} -struct VFRange { - // A power of 2. - const ElementCount Start; - - // A power of 2. If End <= Start range is empty. - ElementCount End; - - bool isEmpty() const { - return End.getKnownMinValue() <= Start.getKnownMinValue(); - } - - VFRange(const ElementCount &Start, const ElementCount &End) - : Start(Start), End(End) { - assert(Start.isScalable() == End.isScalable() && - "Both Start and End should have the same scalable flag"); - assert(isPowerOf2_32(Start.getKnownMinValue()) && - "Expected Start to be a power of 2"); - assert(isPowerOf2_32(End.getKnownMinValue()) && - "Expected End to be a power of 2"); - } - - /// Iterator to iterate over vectorization factors in a VFRange. - class iterator - : public iterator_facade_base<iterator, std::forward_iterator_tag, - ElementCount> { - ElementCount VF; - - public: - iterator(ElementCount VF) : VF(VF) {} - - bool operator==(const iterator &Other) const { return VF == Other.VF; } - - ElementCount operator*() const { return VF; } - - iterator &operator++() { - VF *= 2; - return *this; - } - }; - - iterator begin() { return iterator(Start); } - iterator end() { - assert(isPowerOf2_32(End.getKnownMinValue())); - return iterator(End); - } -}; - using VPlanPtr = std::unique_ptr<VPlan>; -/// In what follows, the term "input IR" refers to code that is fed into the -/// vectorizer whereas the term "output IR" refers to code that is generated by -/// the vectorizer. 
- -/// VPLane provides a way to access lanes in both fixed width and scalable -/// vectors, where for the latter the lane index sometimes needs calculating -/// as a runtime expression. -class VPLane { -public: - /// Kind describes how to interpret Lane. - enum class Kind : uint8_t { - /// For First, Lane is the index into the first N elements of a - /// fixed-vector <N x <ElTy>> or a scalable vector <vscale x N x <ElTy>>. - First, - /// For ScalableLast, Lane is the offset from the start of the last - /// N-element subvector in a scalable vector <vscale x N x <ElTy>>. For - /// example, a Lane of 0 corresponds to lane `(vscale - 1) * N`, a Lane of - /// 1 corresponds to `((vscale - 1) * N) + 1`, etc. - ScalableLast - }; - -private: - /// in [0..VF) - unsigned Lane; - - /// Indicates how the Lane should be interpreted, as described above. - Kind LaneKind; - -public: - VPLane(unsigned Lane) : Lane(Lane), LaneKind(VPLane::Kind::First) {} - VPLane(unsigned Lane, Kind LaneKind) : Lane(Lane), LaneKind(LaneKind) {} - - static VPLane getFirstLane() { return VPLane(0, VPLane::Kind::First); } - - static VPLane getLaneFromEnd(const ElementCount &VF, unsigned Offset) { - assert(Offset > 0 && Offset <= VF.getKnownMinValue() && - "trying to extract with invalid offset"); - unsigned LaneOffset = VF.getKnownMinValue() - Offset; - Kind LaneKind; - if (VF.isScalable()) - // In this case 'LaneOffset' refers to the offset from the start of the - // last subvector with VF.getKnownMinValue() elements. - LaneKind = VPLane::Kind::ScalableLast; - else - LaneKind = VPLane::Kind::First; - return VPLane(LaneOffset, LaneKind); - } - - static VPLane getLastLaneForVF(const ElementCount &VF) { - return getLaneFromEnd(VF, 1); - } - - /// Returns a compile-time known value for the lane index and asserts if the - /// lane can only be calculated at runtime. - unsigned getKnownLane() const { - assert(LaneKind == Kind::First); - return Lane; - } - - /// Returns an expression describing the lane index that can be used at - /// runtime. - Value *getAsRuntimeExpr(IRBuilderBase &Builder, const ElementCount &VF) const; - - /// Returns the Kind of lane offset. - Kind getKind() const { return LaneKind; } - - /// Returns true if this is the first lane of the whole vector. - bool isFirstLane() const { return Lane == 0 && LaneKind == Kind::First; } - - /// Maps the lane to a cache index based on \p VF. - unsigned mapToCacheIndex(const ElementCount &VF) const { - switch (LaneKind) { - case VPLane::Kind::ScalableLast: - assert(VF.isScalable() && Lane < VF.getKnownMinValue()); - return VF.getKnownMinValue() + Lane; - default: - assert(Lane < VF.getKnownMinValue()); - return Lane; - } - } -}; - -/// VPTransformState holds information passed down when "executing" a VPlan, -/// needed for generating the output IR. -struct VPTransformState { - VPTransformState(const TargetTransformInfo *TTI, ElementCount VF, unsigned UF, - LoopInfo *LI, DominatorTree *DT, IRBuilderBase &Builder, - InnerLoopVectorizer *ILV, VPlan *Plan, - Loop *CurrentParentLoop, Type *CanonicalIVTy); - /// Target Transform Info. - const TargetTransformInfo *TTI; - - /// The chosen Vectorization Factor of the loop being vectorized. - ElementCount VF; - - /// Hold the index to generate specific scalar instructions. Null indicates - /// that all instances are to be generated, using either scalar or vector - /// instructions. 
- std::optional<VPLane> Lane; - - struct DataState { - // Each value from the original loop, when vectorized, is represented by a - // vector value in the map. - DenseMap<VPValue *, Value *> VPV2Vector; - - DenseMap<VPValue *, SmallVector<Value *, 4>> VPV2Scalars; - } Data; - - /// Get the generated vector Value for a given VPValue \p Def if \p IsScalar - /// is false, otherwise return the generated scalar. \See set. - Value *get(VPValue *Def, bool IsScalar = false); - - /// Get the generated Value for a given VPValue and given Part and Lane. - Value *get(VPValue *Def, const VPLane &Lane); - - bool hasVectorValue(VPValue *Def) { return Data.VPV2Vector.contains(Def); } - - bool hasScalarValue(VPValue *Def, VPLane Lane) { - auto I = Data.VPV2Scalars.find(Def); - if (I == Data.VPV2Scalars.end()) - return false; - unsigned CacheIdx = Lane.mapToCacheIndex(VF); - return CacheIdx < I->second.size() && I->second[CacheIdx]; - } - - /// Set the generated vector Value for a given VPValue, if \p - /// IsScalar is false. If \p IsScalar is true, set the scalar in lane 0. - void set(VPValue *Def, Value *V, bool IsScalar = false) { - if (IsScalar) { - set(Def, V, VPLane(0)); - return; - } - assert((VF.isScalar() || V->getType()->isVectorTy()) && - "scalar values must be stored as (0, 0)"); - Data.VPV2Vector[Def] = V; - } - - /// Reset an existing vector value for \p Def and a given \p Part. - void reset(VPValue *Def, Value *V) { - assert(Data.VPV2Vector.contains(Def) && "need to overwrite existing value"); - Data.VPV2Vector[Def] = V; - } - - /// Set the generated scalar \p V for \p Def and the given \p Lane. - void set(VPValue *Def, Value *V, const VPLane &Lane) { - auto &Scalars = Data.VPV2Scalars[Def]; - unsigned CacheIdx = Lane.mapToCacheIndex(VF); - if (Scalars.size() <= CacheIdx) - Scalars.resize(CacheIdx + 1); - assert(!Scalars[CacheIdx] && "should overwrite existing value"); - Scalars[CacheIdx] = V; - } - - /// Reset an existing scalar value for \p Def and a given \p Lane. - void reset(VPValue *Def, Value *V, const VPLane &Lane) { - auto Iter = Data.VPV2Scalars.find(Def); - assert(Iter != Data.VPV2Scalars.end() && - "need to overwrite existing value"); - unsigned CacheIdx = Lane.mapToCacheIndex(VF); - assert(CacheIdx < Iter->second.size() && - "need to overwrite existing value"); - Iter->second[CacheIdx] = V; - } - - /// Add additional metadata to \p To that was not present on \p Orig. - /// - /// Currently this is used to add the noalias annotations based on the - /// inserted memchecks. Use this for instructions that are *cloned* into the - /// vector loop. - void addNewMetadata(Instruction *To, const Instruction *Orig); - - /// Add metadata from one instruction to another. - /// - /// This includes both the original MDs from \p From and additional ones (\see - /// addNewMetadata). Use this for *newly created* instructions in the vector - /// loop. - void addMetadata(Value *To, Instruction *From); - - /// Set the debug location in the builder using the debug location \p DL. - void setDebugLocFrom(DebugLoc DL); - - /// Construct the vector value of a scalarized value \p V one lane at a time. - void packScalarIntoVectorValue(VPValue *Def, const VPLane &Lane); - - /// Hold state information used when constructing the CFG of the output IR, - /// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks. - struct CFGState { - /// The previous VPBasicBlock visited. Initially set to null. - VPBasicBlock *PrevVPBB = nullptr; - - /// The previous IR BasicBlock created or used. 
Initially set to the new - /// header BasicBlock. - BasicBlock *PrevBB = nullptr; - - /// The last IR BasicBlock in the output IR. Set to the exit block of the - /// vector loop. - BasicBlock *ExitBB = nullptr; - - /// A mapping of each VPBasicBlock to the corresponding BasicBlock. In case - /// of replication, maps the BasicBlock of the last replica created. - SmallDenseMap<VPBasicBlock *, BasicBlock *> VPBB2IRBB; - - /// Updater for the DominatorTree. - DomTreeUpdater DTU; - - CFGState(DominatorTree *DT) - : DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy) {} - - /// Returns the BasicBlock* mapped to the pre-header of the loop region - /// containing \p R. - BasicBlock *getPreheaderBBFor(VPRecipeBase *R); - } CFG; - - /// Hold a pointer to LoopInfo to register new basic blocks in the loop. - LoopInfo *LI; - - /// Hold a reference to the IRBuilder used to generate output IR code. - IRBuilderBase &Builder; - - /// Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods. - InnerLoopVectorizer *ILV; - - /// Pointer to the VPlan code is generated for. - VPlan *Plan; - - /// The parent loop object for the current scope, or nullptr. - Loop *CurrentParentLoop = nullptr; - - /// LoopVersioning. It's only set up (non-null) if memchecks were - /// used. - /// - /// This is currently only used to add no-alias metadata based on the - /// memchecks. The actually versioning is performed manually. - LoopVersioning *LVer = nullptr; - - /// Map SCEVs to their expanded values. Populated when executing - /// VPExpandSCEVRecipes. - DenseMap<const SCEV *, Value *> ExpandedSCEVs; - - /// VPlan-based type analysis. - VPTypeAnalysis TypeAnalysis; -}; - /// VPBlockBase is the building block of the Hierarchical Control-Flow Graph. /// A VPBlockBase can be either a VPBasicBlock or a VPRegionBlock. class VPBlockBase { @@ -654,10 +340,7 @@ public: VPSlotTracker &SlotTracker) const = 0; /// Print plain-text dump of this VPlan to \p O. - void print(raw_ostream &O) const { - VPSlotTracker SlotTracker(getPlan()); - print(O, "", SlotTracker); - } + void print(raw_ostream &O) const; /// Print the successors of this block to \p O, prefixing all lines with \p /// Indent. @@ -673,34 +356,6 @@ public: virtual VPBlockBase *clone() = 0; }; -/// Struct to hold various analysis needed for cost computations. -struct VPCostContext { - const TargetTransformInfo &TTI; - const TargetLibraryInfo &TLI; - VPTypeAnalysis Types; - LLVMContext &LLVMCtx; - LoopVectorizationCostModel &CM; - SmallPtrSet<Instruction *, 8> SkipCostComputation; - TargetTransformInfo::TargetCostKind CostKind; - - VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, - Type *CanIVTy, LoopVectorizationCostModel &CM, - TargetTransformInfo::TargetCostKind CostKind) - : TTI(TTI), TLI(TLI), Types(CanIVTy), LLVMCtx(CanIVTy->getContext()), - CM(CM), CostKind(CostKind) {} - - /// Return the cost for \p UI with \p VF using the legacy cost model as - /// fallback until computing the cost of all recipes migrates to VPlan. - InstructionCost getLegacyCost(Instruction *UI, ElementCount VF) const; - - /// Return true if the cost for \p UI shouldn't be computed, e.g. because it - /// has already been pre-computed. - bool skipCostComputation(Instruction *UI, bool IsVector) const; - - /// Returns the OperandInfo for \p V, if it is a live-in. - TargetTransformInfo::OperandValueInfo getOperandInfo(VPValue *V) const; -}; - /// VPRecipeBase is a base class modeling a sequence of one or more output IR /// instructions. 
VPRecipeBase owns the VPValues it defines through VPDef
/// and is responsible for deleting its defined values. Single-value
@@ -1058,6 +713,7 @@ public:
            R->getVPDefID() == VPRecipeBase::VPWidenEVLSC ||
            R->getVPDefID() == VPRecipeBase::VPWidenGEPSC ||
            R->getVPDefID() == VPRecipeBase::VPWidenCastSC ||
+           R->getVPDefID() == VPRecipeBase::VPWidenIntrinsicSC ||
            R->getVPDefID() == VPRecipeBase::VPReplicateSC ||
            R->getVPDefID() == VPRecipeBase::VPReverseVectorPointerSC ||
            R->getVPDefID() == VPRecipeBase::VPVectorPointerSC;
@@ -1223,6 +879,9 @@ public:
     // Returns a scalar boolean value, which is true if any lane of its (only
     // boolean) vector operand is true.
     AnyOf,
+    // Extracts the first active lane of a vector; the first operand is the
+    // vector to extract from, the second operand is the predicate (mask).
+    ExtractFirstActive,
   };

 private:
@@ -3667,12 +3326,12 @@ protected:
   /// Connect the VPBBs predecessors' in the VPlan CFG to the IR basic block
   /// generated for this VPBB.
-  void connectToPredecessors(VPTransformState::CFGState &CFG);
+  void connectToPredecessors(VPTransformState &State);

 private:
   /// Create an IR BasicBlock to hold the output instructions generated by this
   /// VPBasicBlock, and return it. Update the CFGState accordingly.
-  BasicBlock *createEmptyBasicBlock(VPTransformState::CFGState &CFG);
+  BasicBlock *createEmptyBasicBlock(VPTransformState &State);
 };

 /// A special type of VPBasicBlock that wraps an existing IR basic block.
@@ -3967,6 +3626,9 @@ public:
   /// of VPBlockShallowTraversalWrapper.
   auto getExitBlocks();

+  /// Returns true if \p VPBB is an exit block.
+  bool isExitBlock(VPBlockBase *VPBB);
+
   /// The trip count of the original loop.
   VPValue *getTripCount() const {
     assert(TripCount && "trip count needs to be set before accessing it");
@@ -4139,55 +3801,6 @@ public:
 };

 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-/// VPlanPrinter prints a given VPlan to a given output stream. The printing is
-/// indented and follows the dot format.
-class VPlanPrinter {
-  raw_ostream &OS;
-  const VPlan &Plan;
-  unsigned Depth = 0;
-  unsigned TabWidth = 2;
-  std::string Indent;
-  unsigned BID = 0;
-  SmallDenseMap<const VPBlockBase *, unsigned> BlockID;
-
-  VPSlotTracker SlotTracker;
-
-  /// Handle indentation.
-  void bumpIndent(int b) { Indent = std::string((Depth += b) * TabWidth, ' '); }
-
-  /// Print a given \p Block of the Plan.
-  void dumpBlock(const VPBlockBase *Block);
-
-  /// Print the information related to the CFG edges going out of a given
-  /// \p Block, followed by printing the successor blocks themselves.
-  void dumpEdges(const VPBlockBase *Block);
-
-  /// Print a given \p BasicBlock, including its VPRecipes, followed by printing
-  /// its successor blocks.
-  void dumpBasicBlock(const VPBasicBlock *BasicBlock);
-
-  /// Print a given \p Region of the Plan.
-  void dumpRegion(const VPRegionBlock *Region);
-
-  unsigned getOrCreateBID(const VPBlockBase *Block) {
-    return BlockID.count(Block) ? BlockID[Block] : BlockID[Block] = BID++;
-  }
-
-  Twine getOrCreateName(const VPBlockBase *Block);
-
-  Twine getUID(const VPBlockBase *Block);
-
-  /// Print the information related to a CFG edge between two VPBlockBases.
- void drawEdge(const VPBlockBase *From, const VPBlockBase *To, bool Hidden, - const Twine &Label); - -public: - VPlanPrinter(raw_ostream &O, const VPlan &P) - : OS(O), Plan(P), SlotTracker(&P) {} - - LLVM_DUMP_METHOD void dump(); -}; - struct VPlanIngredient { const Value *V; @@ -4207,139 +3820,6 @@ inline raw_ostream &operator<<(raw_ostream &OS, const VPlan &Plan) { } #endif -class VPInterleavedAccessInfo { - DenseMap<VPInstruction *, InterleaveGroup<VPInstruction> *> - InterleaveGroupMap; - - /// Type for mapping of instruction based interleave groups to VPInstruction - /// interleave groups - using Old2NewTy = DenseMap<InterleaveGroup<Instruction> *, - InterleaveGroup<VPInstruction> *>; - - /// Recursively \p Region and populate VPlan based interleave groups based on - /// \p IAI. - void visitRegion(VPRegionBlock *Region, Old2NewTy &Old2New, - InterleavedAccessInfo &IAI); - /// Recursively traverse \p Block and populate VPlan based interleave groups - /// based on \p IAI. - void visitBlock(VPBlockBase *Block, Old2NewTy &Old2New, - InterleavedAccessInfo &IAI); - -public: - VPInterleavedAccessInfo(VPlan &Plan, InterleavedAccessInfo &IAI); - - ~VPInterleavedAccessInfo() { - SmallPtrSet<InterleaveGroup<VPInstruction> *, 4> DelSet; - // Avoid releasing a pointer twice. - for (auto &I : InterleaveGroupMap) - DelSet.insert(I.second); - for (auto *Ptr : DelSet) - delete Ptr; - } - - /// Get the interleave group that \p Instr belongs to. - /// - /// \returns nullptr if doesn't have such group. - InterleaveGroup<VPInstruction> * - getInterleaveGroup(VPInstruction *Instr) const { - return InterleaveGroupMap.lookup(Instr); - } -}; - -/// Class that maps (parts of) an existing VPlan to trees of combined -/// VPInstructions. -class VPlanSlp { - enum class OpMode { Failed, Load, Opcode }; - - /// A DenseMapInfo implementation for using SmallVector<VPValue *, 4> as - /// DenseMap keys. - struct BundleDenseMapInfo { - static SmallVector<VPValue *, 4> getEmptyKey() { - return {reinterpret_cast<VPValue *>(-1)}; - } - - static SmallVector<VPValue *, 4> getTombstoneKey() { - return {reinterpret_cast<VPValue *>(-2)}; - } - - static unsigned getHashValue(const SmallVector<VPValue *, 4> &V) { - return static_cast<unsigned>(hash_combine_range(V.begin(), V.end())); - } - - static bool isEqual(const SmallVector<VPValue *, 4> &LHS, - const SmallVector<VPValue *, 4> &RHS) { - return LHS == RHS; - } - }; - - /// Mapping of values in the original VPlan to a combined VPInstruction. - DenseMap<SmallVector<VPValue *, 4>, VPInstruction *, BundleDenseMapInfo> - BundleToCombined; - - VPInterleavedAccessInfo &IAI; - - /// Basic block to operate on. For now, only instructions in a single BB are - /// considered. - const VPBasicBlock &BB; - - /// Indicates whether we managed to combine all visited instructions or not. - bool CompletelySLP = true; - - /// Width of the widest combined bundle in bits. - unsigned WidestBundleBits = 0; - - using MultiNodeOpTy = - typename std::pair<VPInstruction *, SmallVector<VPValue *, 4>>; - - // Input operand bundles for the current multi node. Each multi node operand - // bundle contains values not matching the multi node's opcode. They will - // be reordered in reorderMultiNodeOps, once we completed building a - // multi node. - SmallVector<MultiNodeOpTy, 4> MultiNodeOps; - - /// Indicates whether we are building a multi node currently. - bool MultiNodeActive = false; - - /// Check if we can vectorize Operands together. 
- bool areVectorizable(ArrayRef<VPValue *> Operands) const; - - /// Add combined instruction \p New for the bundle \p Operands. - void addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New); - - /// Indicate we hit a bundle we failed to combine. Returns nullptr for now. - VPInstruction *markFailed(); - - /// Reorder operands in the multi node to maximize sequential memory access - /// and commutative operations. - SmallVector<MultiNodeOpTy, 4> reorderMultiNodeOps(); - - /// Choose the best candidate to use for the lane after \p Last. The set of - /// candidates to choose from are values with an opcode matching \p Last's - /// or loads consecutive to \p Last. - std::pair<OpMode, VPValue *> getBest(OpMode Mode, VPValue *Last, - SmallPtrSetImpl<VPValue *> &Candidates, - VPInterleavedAccessInfo &IAI); - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - /// Print bundle \p Values to dbgs(). - void dumpBundle(ArrayRef<VPValue *> Values); -#endif - -public: - VPlanSlp(VPInterleavedAccessInfo &IAI, VPBasicBlock &BB) : IAI(IAI), BB(BB) {} - - ~VPlanSlp() = default; - - /// Tries to build an SLP tree rooted at \p Operands and returns a - /// VPInstruction combining \p Operands, if they can be combined. - VPInstruction *buildGraph(ArrayRef<VPValue *> Operands); - - /// Return the width of the widest combined bundle in bits. - unsigned getWidestBundleBits() const { return WidestBundleBits; } - - /// Return true if all visited instruction can be combined. - bool isCompletelySLP() const { return CompletelySLP; } -}; } // end namespace llvm #endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H diff --git llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 27357ff04b5f..71fb6d42116c 100644 --- llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -78,6 +78,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) { case VPInstruction::CanonicalIVIncrementForPart: case VPInstruction::AnyOf: return SetResultTyFromOp(); + case VPInstruction::ExtractFirstActive: case VPInstruction::ExtractFromEnd: { Type *BaseTy = inferScalarType(R->getOperand(0)); if (auto *VecTy = dyn_cast<VectorType>(BaseTy)) diff --git llvm/lib/Transforms/Vectorize/VPlanHelpers.h llvm/lib/Transforms/Vectorize/VPlanHelpers.h new file mode 100644 index 000000000000..74713daf904f --- /dev/null +++ llvm/lib/Transforms/Vectorize/VPlanHelpers.h @@ -0,0 +1,468 @@ +//===- VPlanHelpers.h - VPlan-related auxiliary helpers -------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This file contains the declarations of different VPlan-related auxiliary +/// helpers. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANHELPERS_H +#define LLVM_TRANSFORMS_VECTORIZE_VPLANHELPERS_H + +#include "VPlanAnalysis.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/Support/InstructionCost.h" + +namespace llvm { + +class BasicBlock; +class DominatorTree; +class InnerLoopVectorizer; +class IRBuilderBase; +class LoopInfo; +class SCEV; +class Type; +class VPBasicBlock; +class VPRegionBlock; +class VPlan; +class Value; +class LoopVersioning; + +/// Returns a calculation for the total number of elements for a given \p VF. +/// For fixed width vectors this value is a constant, whereas for scalable +/// vectors it is an expression determined at runtime. +Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF); + +/// Return a value for Step multiplied by VF. +Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, + int64_t Step); + +/// A helper function that returns the reciprocal of the block probability of +/// predicated blocks. If we return X, we are assuming the predicated block +/// will execute once for every X iterations of the loop header. +/// +/// TODO: We should use actual block probability here, if available. Currently, +/// we always assume predicated blocks have a 50% chance of executing. +inline unsigned getReciprocalPredBlockProb() { return 2; } + +/// A range of powers-of-2 vectorization factors with fixed start and +/// adjustable end. The range includes start and excludes end, e.g.,: +/// [1, 16) = {1, 2, 4, 8} +struct VFRange { + // A power of 2. + const ElementCount Start; + + // A power of 2. If End <= Start range is empty. + ElementCount End; + + bool isEmpty() const { + return End.getKnownMinValue() <= Start.getKnownMinValue(); + } + + VFRange(const ElementCount &Start, const ElementCount &End) + : Start(Start), End(End) { + assert(Start.isScalable() == End.isScalable() && + "Both Start and End should have the same scalable flag"); + assert(isPowerOf2_32(Start.getKnownMinValue()) && + "Expected Start to be a power of 2"); + assert(isPowerOf2_32(End.getKnownMinValue()) && + "Expected End to be a power of 2"); + } + + /// Iterator to iterate over vectorization factors in a VFRange. + class iterator + : public iterator_facade_base<iterator, std::forward_iterator_tag, + ElementCount> { + ElementCount VF; + + public: + iterator(ElementCount VF) : VF(VF) {} + + bool operator==(const iterator &Other) const { return VF == Other.VF; } + + ElementCount operator*() const { return VF; } + + iterator &operator++() { + VF *= 2; + return *this; + } + }; + + iterator begin() { return iterator(Start); } + iterator end() { + assert(isPowerOf2_32(End.getKnownMinValue())); + return iterator(End); + } +}; + +/// In what follows, the term "input IR" refers to code that is fed into the +/// vectorizer whereas the term "output IR" refers to code that is generated by +/// the vectorizer. + +/// VPLane provides a way to access lanes in both fixed width and scalable +/// vectors, where for the latter the lane index sometimes needs calculating +/// as a runtime expression. +class VPLane { +public: + /// Kind describes how to interpret Lane. 
+ enum class Kind : uint8_t { + /// For First, Lane is the index into the first N elements of a + /// fixed-vector <N x <ElTy>> or a scalable vector <vscale x N x <ElTy>>. + First, + /// For ScalableLast, Lane is the offset from the start of the last + /// N-element subvector in a scalable vector <vscale x N x <ElTy>>. For + /// example, a Lane of 0 corresponds to lane `(vscale - 1) * N`, a Lane of + /// 1 corresponds to `((vscale - 1) * N) + 1`, etc. + ScalableLast + }; + +private: + /// in [0..VF) + unsigned Lane; + + /// Indicates how the Lane should be interpreted, as described above. + Kind LaneKind = Kind::First; + +public: + VPLane(unsigned Lane) : Lane(Lane) {} + VPLane(unsigned Lane, Kind LaneKind) : Lane(Lane), LaneKind(LaneKind) {} + + static VPLane getFirstLane() { return VPLane(0, VPLane::Kind::First); } + + static VPLane getLaneFromEnd(const ElementCount &VF, unsigned Offset) { + assert(Offset > 0 && Offset <= VF.getKnownMinValue() && + "trying to extract with invalid offset"); + unsigned LaneOffset = VF.getKnownMinValue() - Offset; + Kind LaneKind; + if (VF.isScalable()) + // In this case 'LaneOffset' refers to the offset from the start of the + // last subvector with VF.getKnownMinValue() elements. + LaneKind = VPLane::Kind::ScalableLast; + else + LaneKind = VPLane::Kind::First; + return VPLane(LaneOffset, LaneKind); + } + + static VPLane getLastLaneForVF(const ElementCount &VF) { + return getLaneFromEnd(VF, 1); + } + + /// Returns a compile-time known value for the lane index and asserts if the + /// lane can only be calculated at runtime. + unsigned getKnownLane() const { + assert(LaneKind == Kind::First && + "can only get known lane from the beginning"); + return Lane; + } + + /// Returns an expression describing the lane index that can be used at + /// runtime. + Value *getAsRuntimeExpr(IRBuilderBase &Builder, const ElementCount &VF) const; + + /// Returns the Kind of lane offset. + Kind getKind() const { return LaneKind; } + + /// Returns true if this is the first lane of the whole vector. + bool isFirstLane() const { return Lane == 0 && LaneKind == Kind::First; } + + /// Maps the lane to a cache index based on \p VF. + unsigned mapToCacheIndex(const ElementCount &VF) const { + switch (LaneKind) { + case VPLane::Kind::ScalableLast: + assert(VF.isScalable() && Lane < VF.getKnownMinValue() && + "ScalableLast can only be used with scalable VFs"); + return VF.getKnownMinValue() + Lane; + default: + assert(Lane < VF.getKnownMinValue() && + "Cannot extract lane larger than VF"); + return Lane; + } + } +}; + +/// VPTransformState holds information passed down when "executing" a VPlan, +/// needed for generating the output IR. +struct VPTransformState { + VPTransformState(const TargetTransformInfo *TTI, ElementCount VF, unsigned UF, + LoopInfo *LI, DominatorTree *DT, IRBuilderBase &Builder, + InnerLoopVectorizer *ILV, VPlan *Plan, + Loop *CurrentParentLoop, Type *CanonicalIVTy); + /// Target Transform Info. + const TargetTransformInfo *TTI; + + /// The chosen Vectorization Factor of the loop being vectorized. + ElementCount VF; + + /// Hold the index to generate specific scalar instructions. Null indicates + /// that all instances are to be generated, using either scalar or vector + /// instructions. + std::optional<VPLane> Lane; + + struct DataState { + // Each value from the original loop, when vectorized, is represented by a + // vector value in the map. 
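+    // Scalar values are cached per lane in VPV2Scalars below;
+    // VPLane::mapToCacheIndex maps a lane to its slot, storing ScalableLast
+    // lanes after the first VF.getKnownMinValue() entries.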
+    DenseMap<VPValue *, Value *> VPV2Vector;
+
+    DenseMap<VPValue *, SmallVector<Value *, 4>> VPV2Scalars;
+  } Data;
+
+  /// Get the generated vector Value for a given VPValue \p Def if \p IsScalar
+  /// is false, otherwise return the generated scalar. \see set.
+  Value *get(VPValue *Def, bool IsScalar = false);
+
+  /// Get the generated Value for a given VPValue and given \p Lane.
+  Value *get(VPValue *Def, const VPLane &Lane);
+
+  bool hasVectorValue(VPValue *Def) { return Data.VPV2Vector.contains(Def); }
+
+  bool hasScalarValue(VPValue *Def, VPLane Lane) {
+    auto I = Data.VPV2Scalars.find(Def);
+    if (I == Data.VPV2Scalars.end())
+      return false;
+    unsigned CacheIdx = Lane.mapToCacheIndex(VF);
+    return CacheIdx < I->second.size() && I->second[CacheIdx];
+  }
+
+  /// Set the generated vector Value for a given VPValue, if \p
+  /// IsScalar is false. If \p IsScalar is true, set the scalar in lane 0.
+  void set(VPValue *Def, Value *V, bool IsScalar = false) {
+    if (IsScalar) {
+      set(Def, V, VPLane(0));
+      return;
+    }
+    assert((VF.isScalar() || V->getType()->isVectorTy()) &&
+           "scalar values must be stored as (0, 0)");
+    Data.VPV2Vector[Def] = V;
+  }
+
+  /// Reset an existing vector value for \p Def.
+  void reset(VPValue *Def, Value *V) {
+    assert(Data.VPV2Vector.contains(Def) && "need to overwrite existing value");
+    Data.VPV2Vector[Def] = V;
+  }
+
+  /// Set the generated scalar \p V for \p Def and the given \p Lane.
+  void set(VPValue *Def, Value *V, const VPLane &Lane) {
+    auto &Scalars = Data.VPV2Scalars[Def];
+    unsigned CacheIdx = Lane.mapToCacheIndex(VF);
+    if (Scalars.size() <= CacheIdx)
+      Scalars.resize(CacheIdx + 1);
+    assert(!Scalars[CacheIdx] && "should overwrite existing value");
+    Scalars[CacheIdx] = V;
+  }
+
+  /// Reset an existing scalar value for \p Def and a given \p Lane.
+  void reset(VPValue *Def, Value *V, const VPLane &Lane) {
+    auto Iter = Data.VPV2Scalars.find(Def);
+    assert(Iter != Data.VPV2Scalars.end() &&
+           "need to overwrite existing value");
+    unsigned CacheIdx = Lane.mapToCacheIndex(VF);
+    assert(CacheIdx < Iter->second.size() &&
+           "need to overwrite existing value");
+    Iter->second[CacheIdx] = V;
+  }
+
+  /// Add additional metadata to \p To that was not present on \p Orig.
+  ///
+  /// Currently this is used to add the noalias annotations based on the
+  /// inserted memchecks. Use this for instructions that are *cloned* into the
+  /// vector loop.
+  void addNewMetadata(Instruction *To, const Instruction *Orig);
+
+  /// Add metadata from one instruction to another.
+  ///
+  /// This includes both the original MDs from \p From and additional ones (\see
+  /// addNewMetadata). Use this for *newly created* instructions in the vector
+  /// loop.
+  void addMetadata(Value *To, Instruction *From);
+
+  /// Set the debug location in the builder using the debug location \p DL.
+  void setDebugLocFrom(DebugLoc DL);
+
+  /// Construct the vector value of a scalarized value \p Def one lane at a time.
+  void packScalarIntoVectorValue(VPValue *Def, const VPLane &Lane);
+
+  /// Hold state information used when constructing the CFG of the output IR,
+  /// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks.
+  struct CFGState {
+    /// The previous VPBasicBlock visited. Initially set to null.
+    VPBasicBlock *PrevVPBB = nullptr;
+
+    /// The previous IR BasicBlock created or used. Initially set to the new
+    /// header BasicBlock.
+    BasicBlock *PrevBB = nullptr;
+
+    /// The last IR BasicBlock in the output IR. Set to the exit block of the
+    /// vector loop.
+    BasicBlock *ExitBB = nullptr;
+
+    /// A mapping of each VPBasicBlock to the corresponding BasicBlock. In case
+    /// of replication, maps the BasicBlock of the last replica created.
+    SmallDenseMap<VPBasicBlock *, BasicBlock *> VPBB2IRBB;
+
+    /// Updater for the DominatorTree.
+    DomTreeUpdater DTU;
+
+    CFGState(DominatorTree *DT)
+        : DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy) {}
+
+    /// Returns the BasicBlock* mapped to the pre-header of the loop region
+    /// containing \p R.
+    BasicBlock *getPreheaderBBFor(VPRecipeBase *R);
+  } CFG;
+
+  /// Hold a pointer to LoopInfo to register new basic blocks in the loop.
+  LoopInfo *LI;
+
+  /// Hold a reference to the IRBuilder used to generate output IR code.
+  IRBuilderBase &Builder;
+
+  /// Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
+  InnerLoopVectorizer *ILV;
+
+  /// Pointer to the VPlan for which code is generated.
+  VPlan *Plan;
+
+  /// The parent loop object for the current scope, or nullptr.
+  Loop *CurrentParentLoop = nullptr;
+
+  /// LoopVersioning. It's only set up (non-null) if memchecks were
+  /// used.
+  ///
+  /// This is currently only used to add no-alias metadata based on the
+  /// memchecks. The actual versioning is performed manually.
+  LoopVersioning *LVer = nullptr;
+
+  /// Map SCEVs to their expanded values. Populated when executing
+  /// VPExpandSCEVRecipes.
+  DenseMap<const SCEV *, Value *> ExpandedSCEVs;
+
+  /// VPlan-based type analysis.
+  VPTypeAnalysis TypeAnalysis;
+};
+
+/// Struct to hold various analyses needed for cost computations.
+struct VPCostContext {
+  const TargetTransformInfo &TTI;
+  const TargetLibraryInfo &TLI;
+  VPTypeAnalysis Types;
+  LLVMContext &LLVMCtx;
+  LoopVectorizationCostModel &CM;
+  SmallPtrSet<Instruction *, 8> SkipCostComputation;
+  TargetTransformInfo::TargetCostKind CostKind;
+
+  VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI,
+                Type *CanIVTy, LoopVectorizationCostModel &CM,
+                TargetTransformInfo::TargetCostKind CostKind)
+      : TTI(TTI), TLI(TLI), Types(CanIVTy), LLVMCtx(CanIVTy->getContext()),
+        CM(CM), CostKind(CostKind) {}
+
+  /// Return the cost for \p UI with \p VF using the legacy cost model as
+  /// fallback until computing the cost of all recipes migrates to VPlan.
+  InstructionCost getLegacyCost(Instruction *UI, ElementCount VF) const;
+
+  /// Return true if the cost for \p UI shouldn't be computed, e.g. because it
+  /// has already been pre-computed.
+  bool skipCostComputation(Instruction *UI, bool IsVector) const;
+
+  /// Returns the OperandInfo for \p V, if it is a live-in.
+  TargetTransformInfo::OperandValueInfo getOperandInfo(VPValue *V) const;
+};
+
+/// This class can be used to assign names to VPValues. For VPValues without
+/// underlying value, assign consecutive numbers and use those as names (wrapped
+/// in vp<>). Otherwise, use the name from the underlying value (wrapped in
+/// ir<>), appending a .V version number if there are multiple uses of the same
+/// name. Allows querying names for VPValues for printing, similar to the
+/// ModuleSlotTracker for IR values.
+class VPSlotTracker {
+  /// Keep track of versioned names assigned to VPValues with underlying IR
+  /// values.
+  DenseMap<const VPValue *, std::string> VPValue2Name;
+  /// Keep track of the next number to use to version the base name.
+  StringMap<unsigned> BaseName2Version;
+
+  /// Number to assign to the next VPValue without underlying value.
+ unsigned NextSlot = 0; + + void assignName(const VPValue *V); + void assignNames(const VPlan &Plan); + void assignNames(const VPBasicBlock *VPBB); + +public: + VPSlotTracker(const VPlan *Plan = nullptr) { + if (Plan) + assignNames(*Plan); + } + + /// Returns the name assigned to \p V, if there is one, otherwise try to + /// construct one from the underlying value, if there's one; else return + /// <badref>. + std::string getOrCreateName(const VPValue *V) const; +}; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +/// VPlanPrinter prints a given VPlan to a given output stream. The printing is +/// indented and follows the dot format. +class VPlanPrinter { + raw_ostream &OS; + const VPlan &Plan; + unsigned Depth = 0; + unsigned TabWidth = 2; + std::string Indent; + unsigned BID = 0; + SmallDenseMap<const VPBlockBase *, unsigned> BlockID; + + VPSlotTracker SlotTracker; + + /// Handle indentation. + void bumpIndent(int b) { Indent = std::string((Depth += b) * TabWidth, ' '); } + + /// Print a given \p Block of the Plan. + void dumpBlock(const VPBlockBase *Block); + + /// Print the information related to the CFG edges going out of a given + /// \p Block, followed by printing the successor blocks themselves. + void dumpEdges(const VPBlockBase *Block); + + /// Print a given \p BasicBlock, including its VPRecipes, followed by printing + /// its successor blocks. + void dumpBasicBlock(const VPBasicBlock *BasicBlock); + + /// Print a given \p Region of the Plan. + void dumpRegion(const VPRegionBlock *Region); + + unsigned getOrCreateBID(const VPBlockBase *Block) { + return BlockID.count(Block) ? BlockID[Block] : BlockID[Block] = BID++; + } + + Twine getOrCreateName(const VPBlockBase *Block); + + Twine getUID(const VPBlockBase *Block); + + /// Print the information related to a CFG edge between two VPBlockBases. 
+ void drawEdge(const VPBlockBase *From, const VPBlockBase *To, bool Hidden, + const Twine &Label); + +public: + VPlanPrinter(raw_ostream &O, const VPlan &P) + : OS(O), Plan(P), SlotTracker(&P) {} + + LLVM_DUMP_METHOD void dump(); +}; +#endif + +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H diff --git llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 2679ed6b26b5..c84a93d7398f 100644 --- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -14,12 +14,14 @@ #include "LoopVectorizationPlanner.h" #include "VPlan.h" #include "VPlanAnalysis.h" +#include "VPlanHelpers.h" #include "VPlanPatternMatch.h" #include "VPlanUtils.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Twine.h" #include "llvm/Analysis/IVDescriptors.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" @@ -697,7 +699,13 @@ Value *VPInstruction::generate(VPTransformState &State) { Value *A = State.get(getOperand(0)); return Builder.CreateOrReduce(A); } - + case VPInstruction::ExtractFirstActive: { + Value *Vec = State.get(getOperand(0)); + Value *Mask = State.get(getOperand(1)); + Value *Ctz = Builder.CreateCountTrailingZeroElems( + Builder.getInt64Ty(), Mask, true, "first.active.lane"); + return Builder.CreateExtractElement(Vec, Ctz, "early.exit.value"); + } default: llvm_unreachable("Unsupported opcode for instruction"); } @@ -705,6 +713,7 @@ Value *VPInstruction::generate(VPTransformState &State) { bool VPInstruction::isVectorToScalar() const { return getOpcode() == VPInstruction::ExtractFromEnd || + getOpcode() == VPInstruction::ExtractFirstActive || getOpcode() == VPInstruction::ComputeReductionResult || getOpcode() == VPInstruction::AnyOf; } @@ -769,6 +778,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { case VPInstruction::CalculateTripCountMinusVF: case VPInstruction::CanonicalIVIncrementForPart: case VPInstruction::ExtractFromEnd: + case VPInstruction::ExtractFirstActive: case VPInstruction::FirstOrderRecurrenceSplice: case VPInstruction::LogicalAnd: case VPInstruction::Not: @@ -888,6 +898,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::AnyOf: O << "any-of"; break; + case VPInstruction::ExtractFirstActive: + O << "extract-first-active"; + break; default: O << Instruction::getOpcodeName(getOpcode()); } diff --git llvm/lib/Transforms/Vectorize/VPlanSLP.cpp llvm/lib/Transforms/Vectorize/VPlanSLP.cpp index 98ccf2169463..e943c7a29eb8 100644 --- llvm/lib/Transforms/Vectorize/VPlanSLP.cpp +++ llvm/lib/Transforms/Vectorize/VPlanSLP.cpp @@ -14,10 +14,13 @@ /// //===----------------------------------------------------------------------===// +#include "VPlanSLP.h" #include "VPlan.h" +#include "VPlanCFG.h" #include "VPlanValue.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" @@ -39,6 +42,57 @@ using namespace llvm; // Number of levels to look ahead when re-ordering multi node operands. 
static unsigned LookaheadMaxDepth = 5;

+void VPInterleavedAccessInfo::visitRegion(VPRegionBlock *Region,
+                                          Old2NewTy &Old2New,
+                                          InterleavedAccessInfo &IAI) {
+  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
+      Region->getEntry());
+  for (VPBlockBase *Base : RPOT) {
+    visitBlock(Base, Old2New, IAI);
+  }
+}
+
+void VPInterleavedAccessInfo::visitBlock(VPBlockBase *Block, Old2NewTy &Old2New,
+                                         InterleavedAccessInfo &IAI) {
+  if (VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(Block)) {
+    for (VPRecipeBase &VPI : *VPBB) {
+      if (isa<VPWidenPHIRecipe>(&VPI))
+        continue;
+      auto *VPInst = cast<VPInstruction>(&VPI);
+      auto *Inst = dyn_cast_or_null<Instruction>(VPInst->getUnderlyingValue());
+      if (!Inst)
+        continue;
+      auto *IG = IAI.getInterleaveGroup(Inst);
+      if (!IG)
+        continue;
+
+      auto NewIGIter = Old2New.find(IG);
+      if (NewIGIter == Old2New.end())
+        Old2New[IG] = new InterleaveGroup<VPInstruction>(
+            IG->getFactor(), IG->isReverse(), IG->getAlign());
+
+      if (Inst == IG->getInsertPos())
+        Old2New[IG]->setInsertPos(VPInst);
+
+      InterleaveGroupMap[VPInst] = Old2New[IG];
+      InterleaveGroupMap[VPInst]->insertMember(
+          VPInst, IG->getIndex(Inst),
+          Align(IG->isReverse() ? (-1) * int(IG->getFactor())
+                                : IG->getFactor()));
+    }
+  } else if (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block)) {
+    visitRegion(Region, Old2New, IAI);
+  } else {
+    llvm_unreachable("Unsupported kind of VPBlock.");
+  }
+}
+
+VPInterleavedAccessInfo::VPInterleavedAccessInfo(VPlan &Plan,
+                                                 InterleavedAccessInfo &IAI) {
+  Old2NewTy Old2New;
+  visitRegion(Plan.getVectorLoopRegion(), Old2New, IAI);
+}
+
 VPInstruction *VPlanSlp::markFailed() {
   // FIXME: Currently this is used to signal we hit instructions we cannot
   // trivially SLP'ize.
diff --git llvm/lib/Transforms/Vectorize/VPlanSLP.h llvm/lib/Transforms/Vectorize/VPlanSLP.h
new file mode 100644
index 000000000000..a40ebd28deea
--- /dev/null
+++ llvm/lib/Transforms/Vectorize/VPlanSLP.h
@@ -0,0 +1,166 @@
+//===- VPlanSLP.h - VPlan-based SLP ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file contains the declarations for VPlan-based SLP.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANSLP_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPLANSLP_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/VectorUtils.h"
+
+namespace llvm {
+
+class VPBasicBlock;
+class VPBlockBase;
+class VPRegionBlock;
+class VPlan;
+class VPValue;
+class VPInstruction;
+
+class VPInterleavedAccessInfo {
+  DenseMap<VPInstruction *, InterleaveGroup<VPInstruction> *>
+      InterleaveGroupMap;
+
+  /// Type for mapping of instruction based interleave groups to VPInstruction
+  /// interleave groups
+  using Old2NewTy = DenseMap<InterleaveGroup<Instruction> *,
+                             InterleaveGroup<VPInstruction> *>;
+
+  /// Recursively traverse \p Region and populate VPlan based interleave groups
+  /// based on \p IAI.
+  void visitRegion(VPRegionBlock *Region, Old2NewTy &Old2New,
+                   InterleavedAccessInfo &IAI);
+  /// Recursively traverse \p Block and populate VPlan based interleave groups
+  /// based on \p IAI.
+ void visitBlock(VPBlockBase *Block, Old2NewTy &Old2New, + InterleavedAccessInfo &IAI); + +public: + VPInterleavedAccessInfo(VPlan &Plan, InterleavedAccessInfo &IAI); + + ~VPInterleavedAccessInfo() { + SmallPtrSet<InterleaveGroup<VPInstruction> *, 4> DelSet; + // Avoid releasing a pointer twice. + for (auto &I : InterleaveGroupMap) + DelSet.insert(I.second); + for (auto *Ptr : DelSet) + delete Ptr; + } + + /// Get the interleave group that \p Instr belongs to. + /// + /// \returns nullptr if doesn't have such group. + InterleaveGroup<VPInstruction> * + getInterleaveGroup(VPInstruction *Instr) const { + return InterleaveGroupMap.lookup(Instr); + } +}; + +/// Class that maps (parts of) an existing VPlan to trees of combined +/// VPInstructions. +class VPlanSlp { + enum class OpMode { Failed, Load, Opcode }; + + /// A DenseMapInfo implementation for using SmallVector<VPValue *, 4> as + /// DenseMap keys. + struct BundleDenseMapInfo { + static SmallVector<VPValue *, 4> getEmptyKey() { + return {reinterpret_cast<VPValue *>(-1)}; + } + + static SmallVector<VPValue *, 4> getTombstoneKey() { + return {reinterpret_cast<VPValue *>(-2)}; + } + + static unsigned getHashValue(const SmallVector<VPValue *, 4> &V) { + return static_cast<unsigned>(hash_combine_range(V.begin(), V.end())); + } + + static bool isEqual(const SmallVector<VPValue *, 4> &LHS, + const SmallVector<VPValue *, 4> &RHS) { + return LHS == RHS; + } + }; + + /// Mapping of values in the original VPlan to a combined VPInstruction. + DenseMap<SmallVector<VPValue *, 4>, VPInstruction *, BundleDenseMapInfo> + BundleToCombined; + + VPInterleavedAccessInfo &IAI; + + /// Basic block to operate on. For now, only instructions in a single BB are + /// considered. + const VPBasicBlock &BB; + + /// Indicates whether we managed to combine all visited instructions or not. + bool CompletelySLP = true; + + /// Width of the widest combined bundle in bits. + unsigned WidestBundleBits = 0; + + using MultiNodeOpTy = + typename std::pair<VPInstruction *, SmallVector<VPValue *, 4>>; + + // Input operand bundles for the current multi node. Each multi node operand + // bundle contains values not matching the multi node's opcode. They will + // be reordered in reorderMultiNodeOps, once we completed building a + // multi node. + SmallVector<MultiNodeOpTy, 4> MultiNodeOps; + + /// Indicates whether we are building a multi node currently. + bool MultiNodeActive = false; + + /// Check if we can vectorize Operands together. + bool areVectorizable(ArrayRef<VPValue *> Operands) const; + + /// Add combined instruction \p New for the bundle \p Operands. + void addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New); + + /// Indicate we hit a bundle we failed to combine. Returns nullptr for now. + VPInstruction *markFailed(); + + /// Reorder operands in the multi node to maximize sequential memory access + /// and commutative operations. + SmallVector<MultiNodeOpTy, 4> reorderMultiNodeOps(); + + /// Choose the best candidate to use for the lane after \p Last. The set of + /// candidates to choose from are values with an opcode matching \p Last's + /// or loads consecutive to \p Last. + std::pair<OpMode, VPValue *> getBest(OpMode Mode, VPValue *Last, + SmallPtrSetImpl<VPValue *> &Candidates, + VPInterleavedAccessInfo &IAI); + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print bundle \p Values to dbgs(). 
+  void dumpBundle(ArrayRef<VPValue *> Values);
+#endif
+
+public:
+  VPlanSlp(VPInterleavedAccessInfo &IAI, VPBasicBlock &BB) : IAI(IAI), BB(BB) {}
+
+  ~VPlanSlp() = default;
+
+  /// Tries to build an SLP tree rooted at \p Operands and returns a
+  /// VPInstruction combining \p Operands, if they can be combined.
+  VPInstruction *buildGraph(ArrayRef<VPValue *> Operands);
+
+  /// Return the width of the widest combined bundle in bits.
+  unsigned getWidestBundleBits() const { return WidestBundleBits; }
+
+  /// Return true if all visited instructions can be combined.
+  bool isCompletelySLP() const { return CompletelySLP; }
+};
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPLANSLP_H
diff --git llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 242dc8a636a6..7e9ef4613393 100644
--- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -25,6 +25,7 @@
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Analysis/IVDescriptors.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/PatternMatch.h"
@@ -2064,7 +2065,7 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) {
   }
 }

-bool VPlanTransforms::handleUncountableEarlyExit(
+void VPlanTransforms::handleUncountableEarlyExit(
     VPlan &Plan, ScalarEvolution &SE, Loop *OrigLoop,
     BasicBlock *UncountableExitingBlock, VPRecipeBuilder &RecipeBuilder) {
   VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
@@ -2101,12 +2102,17 @@ bool VPlanTransforms::handleUncountableEarlyExit(
       Builder.createNaryOp(VPInstruction::AnyOf, {EarlyExitTakenCond});

   VPBasicBlock *NewMiddle = Plan.createVPBasicBlock("middle.split");
+  VPBasicBlock *VectorEarlyExitVPBB =
+      Plan.createVPBasicBlock("vector.early.exit");
   VPBlockUtils::insertOnEdge(LoopRegion, MiddleVPBB, NewMiddle);
-  VPBlockUtils::connectBlocks(NewMiddle, VPEarlyExitBlock);
+  VPBlockUtils::connectBlocks(NewMiddle, VectorEarlyExitVPBB);
   NewMiddle->swapSuccessors();

+  VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, VPEarlyExitBlock);
+
   // Update the exit phis in the early exit block.
   VPBuilder MiddleBuilder(NewMiddle);
+  VPBuilder EarlyExitB(VectorEarlyExitVPBB);
   for (VPRecipeBase &R : *VPEarlyExitBlock) {
     auto *ExitIRI = cast<VPIRInstruction>(&R);
     auto *ExitPhi = dyn_cast<PHINode>(&ExitIRI->getInstruction());
@@ -2115,9 +2121,6 @@ bool VPlanTransforms::handleUncountableEarlyExit(
     VPValue *IncomingFromEarlyExit = RecipeBuilder.getVPValueOrAddLiveIn(
         ExitPhi->getIncomingValueForBlock(UncountableExitingBlock));
-    // The incoming value from the early exit must be a live-in for now.
-    if (!IncomingFromEarlyExit->isLiveIn())
-      return false;

     if (OrigLoop->getUniqueExitBlock()) {
       // If there's a unique exit block, VPEarlyExitBlock has 2 predecessors
@@ -2129,6 +2132,10 @@
       ExitIRI->extractLastLaneOfOperand(MiddleBuilder);
     }
     // Add the incoming value from the early exit.
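+    // A value defined inside the loop is first reduced to a scalar in the
+    // new vector.early.exit block: ExtractFirstActive extracts the lane
+    // corresponding to the first set bit of the early-exit mask (lowered via
+    // llvm.experimental.cttz.elts plus extractelement in generate() above).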
+ if (!IncomingFromEarlyExit->isLiveIn()) + IncomingFromEarlyExit = + EarlyExitB.createNaryOp(VPInstruction::ExtractFirstActive, + {IncomingFromEarlyExit, EarlyExitTakenCond}); ExitIRI->addOperand(IncomingFromEarlyExit); } MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {IsEarlyExitTaken}); @@ -2146,5 +2153,4 @@ bool VPlanTransforms::handleUncountableEarlyExit( Instruction::Or, {IsEarlyExitTaken, IsLatchExitTaken}); Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExitTaken); LatchExitingBranch->eraseFromParent(); - return true; } diff --git llvm/lib/Transforms/Vectorize/VPlanTransforms.h llvm/lib/Transforms/Vectorize/VPlanTransforms.h index ad24d9f14682..0cd4cf1f22a7 100644 --- llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -155,7 +155,7 @@ struct VPlanTransforms { /// exit conditions /// * splitting the original middle block to branch to the early exit block /// if taken. - static bool handleUncountableEarlyExit(VPlan &Plan, ScalarEvolution &SE, + static void handleUncountableEarlyExit(VPlan &Plan, ScalarEvolution &SE, Loop *OrigLoop, BasicBlock *UncountableExitingBlock, VPRecipeBuilder &RecipeBuilder); diff --git llvm/lib/Transforms/Vectorize/VPlanValue.h llvm/lib/Transforms/Vectorize/VPlanValue.h index 23e39ce89a3a..aabc4ab571e7 100644 --- llvm/lib/Transforms/Vectorize/VPlanValue.h +++ llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -435,41 +435,6 @@ public: #endif }; -class VPlan; -class VPBasicBlock; - -/// This class can be used to assign names to VPValues. For VPValues without -/// underlying value, assign consecutive numbers and use those as names (wrapped -/// in vp<>). Otherwise, use the name from the underlying value (wrapped in -/// ir<>), appending a .V version number if there are multiple uses of the same -/// name. Allows querying names for VPValues for printing, similar to the -/// ModuleSlotTracker for IR values. -class VPSlotTracker { - /// Keep track of versioned names assigned to VPValues with underlying IR - /// values. - DenseMap<const VPValue *, std::string> VPValue2Name; - /// Keep track of the next number to use to version the base name. - StringMap<unsigned> BaseName2Version; - - /// Number to assign to the next VPValue without underlying value. - unsigned NextSlot = 0; - - void assignName(const VPValue *V); - void assignNames(const VPlan &Plan); - void assignNames(const VPBasicBlock *VPBB); - -public: - VPSlotTracker(const VPlan *Plan = nullptr) { - if (Plan) - assignNames(*Plan); - } - - /// Returns the name assigned to \p V, if there is one, otherwise try to - /// construct one from the underlying value, if there's one; else return - /// <badref>. - std::string getOrCreateName(const VPValue *V) const; -}; - } // namespace llvm #endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_VALUE_H diff --git llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 0f151c897d93..96156de444f8 100644 --- llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -209,7 +209,9 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) { auto *UI = cast<VPRecipeBase>(U); // TODO: check dominance of incoming values for phis properly. 
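+        // VPIRInstructions wrapping IR phis are exempt as well: their
+        // operands are incoming values defined in predecessor blocks (such as
+        // the early-exit values above), not earlier recipes in this block.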
if (!UI || - isa<VPHeaderPHIRecipe, VPWidenPHIRecipe, VPPredInstPHIRecipe>(UI)) + isa<VPHeaderPHIRecipe, VPWidenPHIRecipe, VPPredInstPHIRecipe>(UI) || + (isa<VPIRInstruction>(UI) && + isa<PHINode>(cast<VPIRInstruction>(UI)->getInstruction()))) continue; // If the user is in the same block, check it comes after R in the diff --git llvm/lib/Transforms/Vectorize/VectorCombine.cpp llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 59920b5a4dd2..3758d81d5522 100644 --- llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -108,6 +108,7 @@ private: Instruction &I); bool foldExtractExtract(Instruction &I); bool foldInsExtFNeg(Instruction &I); + bool foldInsExtBinop(Instruction &I); bool foldInsExtVectorToShuffle(Instruction &I); bool foldBitcastShuffle(Instruction &I); bool scalarizeBinopOrCmp(Instruction &I); @@ -738,6 +739,64 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) { return true; } +/// Try to fold insert(binop(x,y),binop(a,b),idx) +/// --> binop(insert(x,a,idx),insert(y,b,idx)) +bool VectorCombine::foldInsExtBinop(Instruction &I) { + BinaryOperator *VecBinOp, *SclBinOp; + uint64_t Index; + if (!match(&I, + m_InsertElt(m_OneUse(m_BinOp(VecBinOp)), + m_OneUse(m_BinOp(SclBinOp)), m_ConstantInt(Index)))) + return false; + + // TODO: Add support for addlike etc. + Instruction::BinaryOps BinOpcode = VecBinOp->getOpcode(); + if (BinOpcode != SclBinOp->getOpcode()) + return false; + + auto *ResultTy = dyn_cast<FixedVectorType>(I.getType()); + if (!ResultTy) + return false; + + // TODO: Attempt to detect m_ExtractElt for scalar operands and convert to + // shuffle? + + InstructionCost OldCost = TTI.getInstructionCost(&I, CostKind) + + TTI.getInstructionCost(VecBinOp, CostKind) + + TTI.getInstructionCost(SclBinOp, CostKind); + InstructionCost NewCost = + TTI.getArithmeticInstrCost(BinOpcode, ResultTy, CostKind) + + TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind, + Index, VecBinOp->getOperand(0), + SclBinOp->getOperand(0)) + + TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind, + Index, VecBinOp->getOperand(1), + SclBinOp->getOperand(1)); + + LLVM_DEBUG(dbgs() << "Found an insertion of two binops: " << I + << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost + << "\n"); + if (NewCost > OldCost) + return false; + + Value *NewIns0 = Builder.CreateInsertElement(VecBinOp->getOperand(0), + SclBinOp->getOperand(0), Index); + Value *NewIns1 = Builder.CreateInsertElement(VecBinOp->getOperand(1), + SclBinOp->getOperand(1), Index); + Value *NewBO = Builder.CreateBinOp(BinOpcode, NewIns0, NewIns1); + + // Intersect flags from the old binops. + if (auto *NewInst = dyn_cast<Instruction>(NewBO)) { + NewInst->copyIRFlags(VecBinOp); + NewInst->andIRFlags(SclBinOp); + } + + Worklist.pushValue(NewIns0); + Worklist.pushValue(NewIns1); + replaceValue(I, *NewBO); + return true; +} + /// If this is a bitcast of a shuffle, try to bitcast the source vector to the /// destination type followed by shuffle. This can enable further transforms by /// moving bitcasts or shuffles together. 
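For illustration, a minimal sketch of the pattern the new foldInsExtBinop
handles (the function and value names here are invented for the example and
are not taken from the test suite): two single-use binary operators of the
same opcode feeding a single insertelement at a constant index,

  define <4 x i32> @ins_ext_binop(<4 x i32> %x, <4 x i32> %y, i32 %a, i32 %b) {
    %vadd = add <4 x i32> %x, %y
    %sadd = add i32 %a, %b
    %r = insertelement <4 x i32> %vadd, i32 %sadd, i64 0
    ret <4 x i32> %r
  }

are rewritten, whenever the estimated new cost does not exceed the old cost,
to roughly:

    %ins0 = insertelement <4 x i32> %x, i32 %a, i64 0
    %ins1 = insertelement <4 x i32> %y, i32 %b, i64 0
    %r = add <4 x i32> %ins0, %ins1

with the IR flags of the two original binops intersected on the new vector add.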
@@ -3206,6 +3265,7 @@ bool VectorCombine::run() { switch (Opcode) { case Instruction::InsertElement: MadeChange |= foldInsExtFNeg(I); + MadeChange |= foldInsExtBinop(I); MadeChange |= foldInsExtVectorToShuffle(I); break; case Instruction::ShuffleVector: diff --git llvm/test/Analysis/CostModel/AArch64/arith-fp-sve.ll llvm/test/Analysis/CostModel/AArch64/sve-arith-fp.ll similarity index 100% rename from llvm/test/Analysis/CostModel/AArch64/arith-fp-sve.ll rename to llvm/test/Analysis/CostModel/AArch64/sve-arith-fp.ll diff --git llvm/test/Analysis/CostModel/AArch64/getIntrinsicInstrCost-vector-reverse.ll llvm/test/Analysis/CostModel/AArch64/vector-reverse.ll similarity index 100% rename from llvm/test/Analysis/CostModel/AArch64/getIntrinsicInstrCost-vector-reverse.ll rename to llvm/test/Analysis/CostModel/AArch64/vector-reverse.ll diff --git llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll index 0245a0f7ee6c..698cce6f641f 100644 --- llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll +++ llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin | FileCheck %s -; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin --type-based-intrinsic-cost=true | FileCheck %s --check-prefixes=TYPEBASED +; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin | FileCheck %s --check-prefixes=CHECK,ARGBASED +; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin --type-based-intrinsic-cost=true | FileCheck %s --check-prefixes=CHECK,TYPEBASED define void @unsupported_fp_ops(<vscale x 4 x float> %vec, i32 %extraarg) { ; CHECK-LABEL: 'unsupported_fp_ops' @@ -8,21 +8,15 @@ define void @unsupported_fp_ops(<vscale x 4 x float> %vec, i32 %extraarg) { ; CHECK-NEXT: Cost Model: Invalid cost for instruction: %powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 %extraarg) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; -; TYPEBASED-LABEL: 'unsupported_fp_ops' -; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %pow = call <vscale x 4 x float> @llvm.pow.nxv4f32(<vscale x 4 x float> %vec, <vscale x 4 x float> %vec) -; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 %extraarg) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; - %pow = call <vscale x 4 x float> @llvm.pow.nxv4f32(<vscale x 4 x float> %vec, <vscale x 4 x float> %vec) %powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 %extraarg) ret void } define void @powi(<vscale x 4 x float> %vec) { -; CHECK-LABEL: 'powi' -; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 42) -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; ARGBASED-LABEL: 'powi' +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 42) +; ARGBASED-NEXT: Cost Model: Found an 
estimated cost of 0 for instruction: ret void ; ; TYPEBASED-LABEL: 'powi' ; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 42) @@ -36,10 +30,6 @@ define void @fshr(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, <vscale x 1 x i3 ; CHECK-LABEL: 'fshr' ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %1 = call <vscale x 1 x i32> @llvm.fshr.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, <vscale x 1 x i32> %c) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; -; TYPEBASED-LABEL: 'fshr' -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %1 = call <vscale x 1 x i32> @llvm.fshr.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, <vscale x 1 x i32> %c) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; call <vscale x 1 x i32> @llvm.fshr.nxv4i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, <vscale x 1 x i32> %c) ret void @@ -49,10 +39,6 @@ define void @fshl(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, <vscale x 1 x i3 ; CHECK-LABEL: 'fshl' ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %1 = call <vscale x 1 x i32> @llvm.fshl.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, <vscale x 1 x i32> %c) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; -; TYPEBASED-LABEL: 'fshl' -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %1 = call <vscale x 1 x i32> @llvm.fshl.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, <vscale x 1 x i32> %c) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; call <vscale x 1 x i32> @llvm.fshl.nxv4i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, <vscale x 1 x i32> %c) ret void @@ -99,47 +85,6 @@ define void @vp_fshr() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %37 = call <vscale x 4 x i64> @llvm.vp.fshr.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef, <vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %38 = call <vscale x 8 x i64> @llvm.vp.fshr.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; -; TYPEBASED-LABEL: 'vp_fshr' -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %1 = call <2 x i8> @llvm.vp.fshr.v2i8(<2 x i8> undef, <2 x i8> undef, <2 x i8> undef, <2 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %2 = call <4 x i8> @llvm.vp.fshr.v4i8(<4 x i8> undef, <4 x i8> undef, <4 x i8> undef, <4 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %3 = call <8 x i8> @llvm.vp.fshr.v8i8(<8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %4 = call <16 x i8> @llvm.vp.fshr.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i8> undef, <16 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %5 = call <vscale x 1 x i8> @llvm.vp.fshr.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i8> undef, <vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an 
estimated cost of 7 for instruction: %6 = call <vscale x 2 x i8> @llvm.vp.fshr.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i8> undef, <vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %7 = call <vscale x 4 x i8> @llvm.vp.fshr.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i8> undef, <vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %8 = call <vscale x 8 x i8> @llvm.vp.fshr.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i8> undef, <vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %9 = call <vscale x 16 x i8> @llvm.vp.fshr.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef, <vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %10 = call <vscale x 32 x i8> @llvm.vp.fshr.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i8> undef, <vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %11 = call <vscale x 64 x i8> @llvm.vp.fshr.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i8> undef, <vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %12 = call <2 x i16> @llvm.vp.fshr.v2i16(<2 x i16> undef, <2 x i16> undef, <2 x i16> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %13 = call <4 x i16> @llvm.vp.fshr.v4i16(<4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %14 = call <8 x i16> @llvm.vp.fshr.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i16> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %15 = call <16 x i16> @llvm.vp.fshr.v16i16(<16 x i16> undef, <16 x i16> undef, <16 x i16> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %16 = call <vscale x 1 x i16> @llvm.vp.fshr.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i16> undef, <vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %17 = call <vscale x 2 x i16> @llvm.vp.fshr.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i16> undef, <vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %18 = call <vscale x 4 x i16> @llvm.vp.fshr.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i16> undef, <vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %19 = call <vscale x 8 x i16> @llvm.vp.fshr.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i16> undef, <vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %20 = call <vscale x 16 x i16> @llvm.vp.fshr.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i16> undef, <vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %21 = call <vscale x 32 x i16> @llvm.vp.fshr.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i16> undef, <vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %22 = call <2 x i32> @llvm.vp.fshr.v2i32(<2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %23 = call <4 x i32> @llvm.vp.fshr.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i32> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %24 = call <8 x i32> @llvm.vp.fshr.v8i32(<8 x i32> undef, <8 x i32> undef, <8 x i32> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %25 = call <16 x i32> @llvm.vp.fshr.v16i32(<16 x i32> undef, <16 x i32> undef, <16 x i32> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %26 = call <vscale x 1 x i32> @llvm.vp.fshr.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i32> undef, <vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %27 = call <vscale x 2 x i32> @llvm.vp.fshr.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i32> undef, <vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %28 = call <vscale x 4 x i32> @llvm.vp.fshr.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef, <vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %29 = call <vscale x 8 x i32> @llvm.vp.fshr.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i32> undef, <vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %30 = call <vscale x 16 x i32> @llvm.vp.fshr.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i32> undef, <vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %31 = call <2 x i64> @llvm.vp.fshr.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i64> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %32 = call <4 x i64> @llvm.vp.fshr.v4i64(<4 x i64> undef, <4 x i64> undef, <4 x i64> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %33 = call <8 x i64> @llvm.vp.fshr.v8i64(<8 x i64> undef, <8 x i64> undef, <8 x i64> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %34 = call <16 x i64> @llvm.vp.fshr.v16i64(<16 x i64> undef, <16 x i64> undef, <16 x i64> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %35 = call <vscale x 1 x i64> @llvm.vp.fshr.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> undef, <vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %36 = call <vscale x 2 x i64> @llvm.vp.fshr.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef, <vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %37 = call <vscale x 4 x i64> @llvm.vp.fshr.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef, <vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %38 = call <vscale x 8 x i64> @llvm.vp.fshr.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   call <2 x i8> @llvm.vp.fshr.v2i8(<2 x i8> undef, <2 x i8> undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
   call <4 x i8> @llvm.vp.fshr.v4i8(<4 x i8> undef, <4 x i8> undef, <4 x i8> undef, <4 x i1> undef, i32 undef)
@@ -223,47 +168,6 @@ define void @vp_fshl() {
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %37 = call <vscale x 4 x i64> @llvm.vp.fshl.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef, <vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %38 = call <vscale x 8 x i64> @llvm.vp.fshl.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; TYPEBASED-LABEL: 'vp_fshl'
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %1 = call <2 x i8> @llvm.vp.fshl.v2i8(<2 x i8> undef, <2 x i8> undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %2 = call <4 x i8> @llvm.vp.fshl.v4i8(<4 x i8> undef, <4 x i8> undef, <4 x i8> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %3 = call <8 x i8> @llvm.vp.fshl.v8i8(<8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %4 = call <16 x i8> @llvm.vp.fshl.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i8> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %5 = call <vscale x 1 x i8> @llvm.vp.fshl.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i8> undef, <vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %6 = call <vscale x 2 x i8> @llvm.vp.fshl.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i8> undef, <vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %7 = call <vscale x 4 x i8> @llvm.vp.fshl.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i8> undef, <vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %8 = call <vscale x 8 x i8> @llvm.vp.fshl.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i8> undef, <vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %9 = call <vscale x 16 x i8> @llvm.vp.fshl.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef, <vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %10 = call <vscale x 32 x i8> @llvm.vp.fshl.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i8> undef, <vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %11 = call <vscale x 64 x i8> @llvm.vp.fshl.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i8> undef, <vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %12 = call <2 x i16> @llvm.vp.fshl.v2i16(<2 x i16> undef, <2 x i16> undef, <2 x i16> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %13 = call <4 x i16> @llvm.vp.fshl.v4i16(<4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %14 = call <8 x i16> @llvm.vp.fshl.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i16> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %15 = call <16 x i16> @llvm.vp.fshl.v16i16(<16 x i16> undef, <16 x i16> undef, <16 x i16> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %16 = call <vscale x 1 x i16> @llvm.vp.fshl.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i16> undef, <vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %17 = call <vscale x 2 x i16> @llvm.vp.fshl.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i16> undef, <vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %18 = call <vscale x 4 x i16> @llvm.vp.fshl.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i16> undef, <vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %19 = call <vscale x 8 x i16> @llvm.vp.fshl.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i16> undef, <vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %20 = call <vscale x 16 x i16> @llvm.vp.fshl.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i16> undef, <vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %21 = call <vscale x 32 x i16> @llvm.vp.fshl.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i16> undef, <vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %22 = call <2 x i32> @llvm.vp.fshl.v2i32(<2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %23 = call <4 x i32> @llvm.vp.fshl.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i32> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %24 = call <8 x i32> @llvm.vp.fshl.v8i32(<8 x i32> undef, <8 x i32> undef, <8 x i32> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %25 = call <16 x i32> @llvm.vp.fshl.v16i32(<16 x i32> undef, <16 x i32> undef, <16 x i32> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %26 = call <vscale x 1 x i32> @llvm.vp.fshl.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i32> undef, <vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %27 = call <vscale x 2 x i32> @llvm.vp.fshl.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i32> undef, <vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %28 = call <vscale x 4 x i32> @llvm.vp.fshl.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef, <vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %29 = call <vscale x 8 x i32> @llvm.vp.fshl.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i32> undef, <vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %30 = call <vscale x 16 x i32> @llvm.vp.fshl.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i32> undef, <vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %31 = call <2 x i64> @llvm.vp.fshl.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i64> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %32 = call <4 x i64> @llvm.vp.fshl.v4i64(<4 x i64> undef, <4 x i64> undef, <4 x i64> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %33 = call <8 x i64> @llvm.vp.fshl.v8i64(<8 x i64> undef, <8 x i64> undef, <8 x i64> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %34 = call <16 x i64> @llvm.vp.fshl.v16i64(<16 x i64> undef, <16 x i64> undef, <16 x i64> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %35 = call <vscale x 1 x i64> @llvm.vp.fshl.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> undef, <vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %36 = call <vscale x 2 x i64> @llvm.vp.fshl.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef, <vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %37 = call <vscale x 4 x i64> @llvm.vp.fshl.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef, <vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %38 = call <vscale x 8 x i64> @llvm.vp.fshl.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   call <2 x i8> @llvm.vp.fshl.v2i8(<2 x i8> undef, <2 x i8> undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
   call <4 x i8> @llvm.vp.fshl.v4i8(<4 x i8> undef, <4 x i8> undef, <4 x i8> undef, <4 x i1> undef, i32 undef)
@@ -373,73 +277,6 @@ define void @add() {
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t62 = call <vscale x 16 x i64> @llvm.vp.add.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t63 = add <vscale x 16 x i64> undef, undef
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; TYPEBASED-LABEL: 'add'
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t0 = call <2 x i8> @llvm.vp.add.v2i8(<2 x i8> undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t1 = add <2 x i8> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t2 = call <4 x i8> @llvm.vp.add.v4i8(<4 x i8> undef, <4 x i8> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t3 = add <4 x i8> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t4 = call <8 x i8> @llvm.vp.add.v8i8(<8 x i8> undef, <8 x i8> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t5 = add <8 x i8> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t6 = call <16 x i8> @llvm.vp.add.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t7 = add <16 x i8> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t8 = call <2 x i16> @llvm.vp.add.v2i16(<2 x i16> undef, <2 x i16> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t9 = add <2 x i16> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t10 = call <4 x i16> @llvm.vp.add.v4i16(<4 x i16> undef, <4 x i16> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t11 = add <4 x i16> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t12 = call <8 x i16> @llvm.vp.add.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t13 = add <8 x i16> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t14 = call <16 x i16> @llvm.vp.add.v16i16(<16 x i16> undef, <16 x i16> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t15 = add <16 x i16> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t16 = call <2 x i32> @llvm.vp.add.v2i32(<2 x i32> undef, <2 x i32> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t17 = add <2 x i32> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t18 = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t19 = add <4 x i32> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t20 = call <8 x i32> @llvm.vp.add.v8i32(<8 x i32> undef, <8 x i32> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t21 = add <8 x i32> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t22 = call <16 x i32> @llvm.vp.add.v16i32(<16 x i32> undef, <16 x i32> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t23 = add <16 x i32> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t24 = call <2 x i64> @llvm.vp.add.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t25 = add <2 x i64> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t26 = call <4 x i64> @llvm.vp.add.v4i64(<4 x i64> undef, <4 x i64> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t27 = add <4 x i64> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t28 = call <8 x i64> @llvm.vp.add.v8i64(<8 x i64> undef, <8 x i64> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t29 = add <8 x i64> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t30 = call <16 x i64> @llvm.vp.add.v16i64(<16 x i64> undef, <16 x i64> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t31 = add <16 x i64> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t32 = call <vscale x 2 x i8> @llvm.vp.add.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t33 = add <vscale x 2 x i8> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t34 = call <vscale x 4 x i8> @llvm.vp.add.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t35 = add <vscale x 4 x i8> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t36 = call <vscale x 8 x i8> @llvm.vp.add.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t37 = add <vscale x 8 x i8> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t38 = call <vscale x 16 x i8> @llvm.vp.add.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t39 = add <vscale x 16 x i8> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t40 = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t41 = add <vscale x 2 x i16> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t42 = call <vscale x 4 x i16> @llvm.vp.add.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t43 = add <vscale x 4 x i16> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t44 = call <vscale x 8 x i16> @llvm.vp.add.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t45 = add <vscale x 8 x i16> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t46 = call <vscale x 16 x i16> @llvm.vp.add.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t47 = add <vscale x 16 x i16> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t48 = call <vscale x 2 x i32> @llvm.vp.add.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t49 = add <vscale x 2 x i32> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t50 = call <vscale x 4 x i32> @llvm.vp.add.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t51 = add <vscale x 4 x i32> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t52 = call <vscale x 8 x i32> @llvm.vp.add.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t53 = add <vscale x 8 x i32> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t54 = call <vscale x 16 x i32> @llvm.vp.add.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t55 = add <vscale x 16 x i32> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t56 = call <vscale x 2 x i64> @llvm.vp.add.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t57 = add <vscale x 2 x i64> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t58 = call <vscale x 4 x i64> @llvm.vp.add.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t59 = add <vscale x 4 x i64> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t60 = call <vscale x 8 x i64> @llvm.vp.add.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t61 = add <vscale x 8 x i64> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t62 = call <vscale x 16 x i64> @llvm.vp.add.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t63 = add <vscale x 16 x i64> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %t0 = call <2 x i8> @llvm.vp.add.v2i8(<2 x i8> undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
   %t1 = add <2 x i8> undef, undef
@@ -575,73 +412,6 @@ define void @and() {
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t62 = call <vscale x 16 x i64> @llvm.vp.and.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t63 = and <vscale x 16 x i64> undef, undef
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; TYPEBASED-LABEL: 'and'
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t0 = call <2 x i8> @llvm.vp.and.v2i8(<2 x i8> undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t1 = and <2 x i8> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t2 = call <4 x i8> @llvm.vp.and.v4i8(<4 x i8> undef, <4 x i8> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t3 = and <4 x i8> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t4 = call <8 x i8> @llvm.vp.and.v8i8(<8 x i8> undef, <8 x i8> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t5 = and <8 x i8> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t6 = call <16 x i8> @llvm.vp.and.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t7 = and <16 x i8> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t8 = call <2 x i16> @llvm.vp.and.v2i16(<2 x i16> undef, <2 x i16> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t9 = and <2 x i16> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t10 = call <4 x i16> @llvm.vp.and.v4i16(<4 x i16> undef, <4 x i16> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t11 = and <4 x i16> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t12 = call <8 x i16> @llvm.vp.and.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t13 = and <8 x i16> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t14 = call <16 x i16> @llvm.vp.and.v16i16(<16 x i16> undef, <16 x i16> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t15 = and <16 x i16> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t16 = call <2 x i32> @llvm.vp.and.v2i32(<2 x i32> undef, <2 x i32> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t17 = and <2 x i32> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t18 = call <4 x i32> @llvm.vp.and.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t19 = and <4 x i32> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t20 = call <8 x i32> @llvm.vp.and.v8i32(<8 x i32> undef, <8 x i32> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t21 = and <8 x i32> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t22 = call <16 x i32> @llvm.vp.and.v16i32(<16 x i32> undef, <16 x i32> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t23 = and <16 x i32> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t24 = call <2 x i64> @llvm.vp.and.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t25 = and <2 x i64> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t26 = call <4 x i64> @llvm.vp.and.v4i64(<4 x i64> undef, <4 x i64> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t27 = and <4 x i64> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t28 = call <8 x i64> @llvm.vp.and.v8i64(<8 x i64> undef, <8 x i64> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t29 = and <8 x i64> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t30 = call <16 x i64> @llvm.vp.and.v16i64(<16 x i64> undef, <16 x i64> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t31 = and <16 x i64> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t32 = call <vscale x 2 x i8> @llvm.vp.and.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t33 = and <vscale x 2 x i8> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t34 = call <vscale x 4 x i8> @llvm.vp.and.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t35 = and <vscale x 4 x i8> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t36 = call <vscale x 8 x i8> @llvm.vp.and.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t37 = and <vscale x 8 x i8> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t38 = call <vscale x 16 x i8> @llvm.vp.and.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t39 = and <vscale x 16 x i8> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t40 = call <vscale x 2 x i16> @llvm.vp.and.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t41 = and <vscale x 2 x i16> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t42 = call <vscale x 4 x i16> @llvm.vp.and.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t43 = and <vscale x 4 x i16> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t44 = call <vscale x 8 x i16> @llvm.vp.and.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t45 = and <vscale x 8 x i16> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t46 = call <vscale x 16 x i16> @llvm.vp.and.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t47 = and <vscale x 16 x i16> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t48 = call <vscale x 2 x i32> @llvm.vp.and.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t49 = and <vscale x 2 x i32> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t50 = call <vscale x 4 x i32> @llvm.vp.and.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t51 = and <vscale x 4 x i32> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t52 = call <vscale x 8 x i32> @llvm.vp.and.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t53 = and <vscale x 8 x i32> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t54 = call <vscale x 16 x i32> @llvm.vp.and.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t55 = and <vscale x 16 x i32> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t56 = call <vscale x 2 x i64> @llvm.vp.and.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t57 = and <vscale x 2 x i64> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t58 = call <vscale x 4 x i64> @llvm.vp.and.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t59 = and <vscale x 4 x i64> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t60 = call <vscale x 8 x i64> @llvm.vp.and.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t61 = and <vscale x 8 x i64> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t62 = call <vscale x 16 x i64> @llvm.vp.and.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t63 = and <vscale x 16 x i64> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %t0 = call <2 x i8> @llvm.vp.and.v2i8(<2 x i8> undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
   %t1 = and <2 x i8> undef, undef
@@ -745,41 +515,6 @@ define void @smax() {
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t60 = call <vscale x 8 x i64> @llvm.vp.smax.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t62 = call <vscale x 16 x i64> @llvm.vp.smax.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; TYPEBASED-LABEL: 'smax'
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t0 = call <2 x i8> @llvm.vp.smax.v2i8(<2 x i8> undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t2 = call <4 x i8> @llvm.vp.smax.v4i8(<4 x i8> undef, <4 x i8> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t4 = call <8 x i8> @llvm.vp.smax.v8i8(<8 x i8> undef, <8 x i8> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t6 = call <16 x i8> @llvm.vp.smax.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t8 = call <2 x i16> @llvm.vp.smax.v2i16(<2 x i16> undef, <2 x i16> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t10 = call <4 x i16> @llvm.vp.smax.v4i16(<4 x i16> undef, <4 x i16> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t12 = call <8 x i16> @llvm.vp.smax.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t14 = call <16 x i16> @llvm.vp.smax.v16i16(<16 x i16> undef, <16 x i16> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t16 = call <2 x i32> @llvm.vp.smax.v2i32(<2 x i32> undef, <2 x i32> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t18 = call <4 x i32> @llvm.vp.smax.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t20 = call <8 x i32> @llvm.vp.smax.v8i32(<8 x i32> undef, <8 x i32> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t22 = call <16 x i32> @llvm.vp.smax.v16i32(<16 x i32> undef, <16 x i32> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t24 = call <2 x i64> @llvm.vp.smax.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t26 = call <4 x i64> @llvm.vp.smax.v4i64(<4 x i64> undef, <4 x i64> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t28 = call <8 x i64> @llvm.vp.smax.v8i64(<8 x i64> undef, <8 x i64> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t30 = call <16 x i64> @llvm.vp.smax.v16i64(<16 x i64> undef, <16 x i64> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t32 = call <vscale x 2 x i8> @llvm.vp.smax.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t34 = call <vscale x 4 x i8> @llvm.vp.smax.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t36 = call <vscale x 8 x i8> @llvm.vp.smax.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t38 = call <vscale x 16 x i8> @llvm.vp.smax.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t40 = call <vscale x 2 x i16> @llvm.vp.smax.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t42 = call <vscale x 4 x i16> @llvm.vp.smax.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t44 = call <vscale x 8 x i16> @llvm.vp.smax.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t46 = call <vscale x 16 x i16> @llvm.vp.smax.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t48 = call <vscale x 2 x i32> @llvm.vp.smax.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t50 = call <vscale x 4 x i32> @llvm.vp.smax.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t52 = call <vscale x 8 x i32> @llvm.vp.smax.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t54 = call <vscale x 16 x i32> @llvm.vp.smax.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t56 = call <vscale x 2 x i64> @llvm.vp.smax.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t58 = call <vscale x 4 x i64> @llvm.vp.smax.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t60 = call <vscale x 8 x i64> @llvm.vp.smax.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t62 = call <vscale x 16 x i64> @llvm.vp.smax.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %t0 = call <2 x i8> @llvm.vp.smax.v2i8(<2 x i8> undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
   %t2 = call <4 x i8> @llvm.vp.smax.v4i8(<4 x i8> undef, <4 x i8> undef, <4 x i1> undef, i32 undef)
@@ -852,41 +587,6 @@ define void @smin() {
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t60 = call <vscale x 8 x i64> @llvm.vp.smin.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t62 = call <vscale x 16 x i64> @llvm.vp.smin.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; TYPEBASED-LABEL: 'smin'
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t0 = call <2 x i8> @llvm.vp.smin.v2i8(<2 x i8> undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t2 = call <4 x i8> @llvm.vp.smin.v4i8(<4 x i8> undef, <4 x i8> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t4 = call <8 x i8> @llvm.vp.smin.v8i8(<8 x i8> undef, <8 x i8> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t6 = call <16 x i8> @llvm.vp.smin.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t8 = call <2 x i16> @llvm.vp.smin.v2i16(<2 x i16> undef, <2 x i16> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t10 = call <4 x i16> @llvm.vp.smin.v4i16(<4 x i16> undef, <4 x i16> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t12 = call <8 x i16> @llvm.vp.smin.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t14 = call <16 x i16> @llvm.vp.smin.v16i16(<16 x i16> undef, <16 x i16> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t16 = call <2 x i32> @llvm.vp.smin.v2i32(<2 x i32> undef, <2 x i32> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t18 = call <4 x i32> @llvm.vp.smin.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t20 = call <8 x i32> @llvm.vp.smin.v8i32(<8 x i32> undef, <8 x i32> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t22 = call <16 x i32> @llvm.vp.smin.v16i32(<16 x i32> undef, <16 x i32> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t24 = call <2 x i64> @llvm.vp.smin.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t26 = call <4 x i64> @llvm.vp.smin.v4i64(<4 x i64> undef, <4 x i64> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t28 = call <8 x i64> @llvm.vp.smin.v8i64(<8 x i64> undef, <8 x i64> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t30 = call <16 x i64> @llvm.vp.smin.v16i64(<16 x i64> undef, <16 x i64> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t32 = call <vscale x 2 x i8> @llvm.vp.smin.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t34 = call <vscale x 4 x i8> @llvm.vp.smin.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t36 = call <vscale x 8 x i8> @llvm.vp.smin.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t38 = call <vscale x 16 x i8> @llvm.vp.smin.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t40 = call <vscale x 2 x i16> @llvm.vp.smin.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t42 = call <vscale x 4 x i16> @llvm.vp.smin.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t44 = call <vscale x 8 x i16> @llvm.vp.smin.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t46 = call <vscale x 16 x i16> @llvm.vp.smin.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t48 = call <vscale x 2 x i32> @llvm.vp.smin.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t50 = call <vscale x 4 x i32> @llvm.vp.smin.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t52 = call <vscale x 8 x i32> @llvm.vp.smin.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t54 = call <vscale x 16 x i32> @llvm.vp.smin.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t56 = call <vscale x 2 x i64> @llvm.vp.smin.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t58 = call <vscale x 4 x i64> @llvm.vp.smin.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t60 = call <vscale x 8 x i64> @llvm.vp.smin.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t62 = call <vscale x 16 x i64> @llvm.vp.smin.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %t0 = call <2 x i8> @llvm.vp.smin.v2i8(<2 x i8> undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
   %t2 = call <4 x i8> @llvm.vp.smin.v4i8(<4 x i8> undef, <4 x i8> undef, <4 x i1> undef, i32 undef)
@@ -958,41 +658,6 @@ define void @umax() {
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t60 = call <vscale x 8 x i64> @llvm.vp.umax.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t62 = call <vscale x 16 x i64> @llvm.vp.umax.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; TYPEBASED-LABEL: 'umax'
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t0 = call <2 x i8> @llvm.vp.umax.v2i8(<2 x i8> undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t2 = call <4 x i8> @llvm.vp.umax.v4i8(<4 x i8> undef, <4 x i8> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t4 = call <8 x i8> @llvm.vp.umax.v8i8(<8 x i8> undef, <8 x i8> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t6 = call <16 x i8> @llvm.vp.umax.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t8 = call <2 x i16> @llvm.vp.umax.v2i16(<2 x i16> undef, <2 x i16> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t10 = call <4 x i16> @llvm.vp.umax.v4i16(<4 x i16> undef, <4 x i16> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t12 = call <8 x i16> @llvm.vp.umax.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t14 = call <16 x i16> @llvm.vp.umax.v16i16(<16 x i16> undef, <16 x i16> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t16 = call <2 x i32> @llvm.vp.umax.v2i32(<2 x i32> undef, <2 x i32> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t18 = call <4 x i32> @llvm.vp.umax.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t20 = call <8 x i32> @llvm.vp.umax.v8i32(<8 x i32> undef, <8 x i32> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t22 = call <16 x i32> @llvm.vp.umax.v16i32(<16 x i32> undef, <16 x i32> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t24 = call <2 x i64> @llvm.vp.umax.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t26 = call <4 x i64> @llvm.vp.umax.v4i64(<4 x i64> undef, <4 x i64> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t28 = call <8 x i64> @llvm.vp.umax.v8i64(<8 x i64> undef, <8 x i64> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t30 = call <16 x i64> @llvm.vp.umax.v16i64(<16 x i64> undef, <16 x i64> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t32 = call <vscale x 2 x i8> @llvm.vp.umax.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t34 = call <vscale x 4 x i8> @llvm.vp.umax.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t36 = call <vscale x 8 x i8> @llvm.vp.umax.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t38 = call <vscale x 16 x i8> @llvm.vp.umax.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t40 = call <vscale x 2 x i16> @llvm.vp.umax.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t42 = call <vscale x 4 x i16> @llvm.vp.umax.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t44 = call <vscale x 8 x i16> @llvm.vp.umax.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t46 = call <vscale x 16 x i16> @llvm.vp.umax.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t48 = call <vscale x 2 x i32> @llvm.vp.umax.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t50 = call <vscale x 4 x i32> @llvm.vp.umax.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t52 = call <vscale x 8 x i32> @llvm.vp.umax.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t54 = call <vscale x 16 x i32> @llvm.vp.umax.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t56 = call <vscale x 2 x i64> @llvm.vp.umax.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t58 = call <vscale x 4 x i64> @llvm.vp.umax.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t60 = call <vscale x 8 x i64> @llvm.vp.umax.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t62 = call <vscale x 16 x i64> @llvm.vp.umax.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %t0 = call <2 x i8> @llvm.vp.umax.v2i8(<2 x i8> undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
   %t2 = call <4 x i8> @llvm.vp.umax.v4i8(<4 x i8> undef, <4 x i8> undef, <4 x i1> undef, i32 undef)
@@ -1064,41 +729,6 @@ define void @umin() {
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t60 = call <vscale x 8 x i64> @llvm.vp.umin.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t62 = call <vscale x 16 x i64> @llvm.vp.umin.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; TYPEBASED-LABEL: 'umin'
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t0 = call <2 x i8> @llvm.vp.umin.v2i8(<2 x i8> undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t2 = call <4 x i8> @llvm.vp.umin.v4i8(<4 x i8> undef, <4 x i8> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t4 = call <8 x i8> @llvm.vp.umin.v8i8(<8 x i8> undef, <8 x i8> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t6 = call <16 x i8> @llvm.vp.umin.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t8 = call <2 x i16> @llvm.vp.umin.v2i16(<2 x i16> undef, <2 x i16> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t10 = call <4 x i16> @llvm.vp.umin.v4i16(<4 x i16> undef, <4 x i16> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t12 = call <8 x i16> @llvm.vp.umin.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t14 = call <16 x i16> @llvm.vp.umin.v16i16(<16 x i16> undef, <16 x i16> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t16 = call <2 x i32> @llvm.vp.umin.v2i32(<2 x i32> undef, <2 x i32> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t18 = call <4 x i32> @llvm.vp.umin.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t20 = call <8 x i32> @llvm.vp.umin.v8i32(<8 x i32> undef, <8 x i32> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t22 = call <16 x i32> @llvm.vp.umin.v16i32(<16 x i32> undef, <16 x i32> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t24 = call <2 x i64> @llvm.vp.umin.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t26 = call <4 x i64> @llvm.vp.umin.v4i64(<4 x i64> undef, <4 x i64> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t28 = call <8 x i64> @llvm.vp.umin.v8i64(<8 x i64> undef, <8 x i64> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t30 = call <16 x i64> @llvm.vp.umin.v16i64(<16 x i64> undef, <16 x i64> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t32 = call <vscale x 2 x i8> @llvm.vp.umin.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t34 = call <vscale x 4 x i8> @llvm.vp.umin.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t36 = call <vscale x 8 x i8> @llvm.vp.umin.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t38 = call <vscale x 16 x i8> @llvm.vp.umin.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t40 = call <vscale x 2 x i16> @llvm.vp.umin.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t42 = call <vscale x 4 x i16> @llvm.vp.umin.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t44 = call <vscale x 8 x i16> @llvm.vp.umin.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t46 = call <vscale x 16 x i16> @llvm.vp.umin.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t48 = call <vscale x 2 x i32> @llvm.vp.umin.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t50 = call <vscale x 4 x i32> @llvm.vp.umin.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t52 = call <vscale x 8 x i32> @llvm.vp.umin.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t54 = call <vscale x 16 x i32> @llvm.vp.umin.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t56 = call <vscale x 2 x i64> @llvm.vp.umin.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t58 = call <vscale x 4 x i64> @llvm.vp.umin.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t60 = call <vscale x 8 x i64> @llvm.vp.umin.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t62 = call <vscale x 16 x i64> @llvm.vp.umin.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %t0 = call <2 x i8> @llvm.vp.umin.v2i8(<2 x i8> undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
   %t2 = call <4 x i8> @llvm.vp.umin.v4i8(<4 x i8> undef, <4 x i8> undef, <4 x i1> undef, i32 undef)
@@ -1170,41 +800,6 @@ define void @abs() {
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %31 = call <vscale x 16 x i64> @llvm.vp.abs.nxv16i64(<vscale x 16 x i64> undef, i1 false, <vscale x 16 x i1> undef, i32 undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %32 = call <vscale x 16 x i64> @llvm.abs.nxv16i64(<vscale x 16 x i64> undef, i1 false)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; TYPEBASED-LABEL: 'abs'
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = call <2 x i8> @llvm.vp.abs.v2i8(<2 x i8> undef, i1 false, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %2 = call <2 x i8> @llvm.abs.v2i8(<2 x i8> undef, i1 false)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %3 = call <4 x i8> @llvm.vp.abs.v4i8(<4 x i8> undef, i1 false, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %4 = call <4 x i8> @llvm.abs.v4i8(<4 x i8> undef, i1 false)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = call <8 x i8> @llvm.vp.abs.v8i8(<8 x i8> undef, i1 false, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %6 = call <8 x i8> @llvm.abs.v8i8(<8 x i8> undef, i1 false)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %7 = call <16 x i8> @llvm.vp.abs.v16i8(<16 x i8> undef, i1 false, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> undef, i1 false)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %9 = call <2 x i64> @llvm.vp.abs.v2i64(<2 x i64> undef, i1 false, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %10 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 false)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %11 = call <4 x i64> @llvm.vp.abs.v4i64(<4 x i64> undef, i1 false, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %12 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 false)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %13 = call <8 x i64> @llvm.vp.abs.v8i64(<8 x i64> undef, i1 false, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %14 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 false)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %15 = call <16 x i64> @llvm.vp.abs.v16i64(<16 x i64> undef, i1 false, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %16 = call <16 x i64> @llvm.abs.v16i64(<16 x i64> undef, i1 false)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %17 = call <vscale x 2 x i8> @llvm.vp.abs.nxv2i8(<vscale x 2 x i8> undef, i1 false, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %18 = call <vscale x 2 x i8> @llvm.abs.nxv2i8(<vscale x 2 x i8> undef, i1 false)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %19 = call <vscale x 4 x i8> @llvm.vp.abs.nxv4i8(<vscale x 4 x i8> undef, i1 false, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %20 = call <vscale x 4 x i8> @llvm.abs.nxv4i8(<vscale x 4 x i8> undef, i1 false)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %21 = call <vscale x 8 x i8> @llvm.vp.abs.nxv8i8(<vscale x 8 x i8> undef, i1 false, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %22 = call <vscale x 8 x i8> @llvm.abs.nxv8i8(<vscale x 8 x i8> undef, i1 false)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %23 = call <vscale x 16 x i8> @llvm.vp.abs.nxv16i8(<vscale x 16 x i8> undef, i1 false, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %24 = call <vscale x 16 x i8> @llvm.abs.nxv16i8(<vscale x 16 x i8> undef, i1 false)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %25 = call <vscale x 2 x i64> @llvm.vp.abs.nxv2i64(<vscale x 2 x i64> undef, i1 false, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %26 = call <vscale x 2 x i64> @llvm.abs.nxv2i64(<vscale x 2 x i64> undef, i1 false)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %27 = call <vscale x 4 x i64> @llvm.vp.abs.nxv4i64(<vscale x 4 x i64> undef, i1 false, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %28 = call <vscale x 4 x i64> @llvm.abs.nxv4i64(<vscale x 4 x i64> undef, i1 false)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %29 = call <vscale x 8 x i64> @llvm.vp.abs.nxv8i64(<vscale x 8 x i64> undef, i1 false, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %30 = call <vscale x 8 x i64> @llvm.abs.nxv8i64(<vscale x 8 x i64> undef, i1 false)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %31 = call <vscale x 16 x i64> @llvm.vp.abs.nxv16i64(<vscale x 16 x i64> undef, i1 false, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %32 = call <vscale x 16 x i64> @llvm.abs.nxv16i64(<vscale x 16 x i64> undef, i1 false)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   call <2 x i8> @llvm.vp.abs.v2i8(<2 x i8> undef, i1 0, <2 x i1> undef, i32 undef)
   call <2 x i8> @llvm.abs.v2i8(<2 x i8> undef, i1 0)
@@ -1276,41 +871,6 @@ define void @load() {
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t31 = call <vscale x 16 x i64> @llvm.vp.load.nxv16i64.p0(ptr undef, <vscale x 16 x i1> undef, i32 undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t32 = load <vscale x 16 x i64>, ptr undef, align 128
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; TYPEBASED-LABEL: 'load'
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t0 = call <2 x i8> @llvm.vp.load.v2i8.p0(ptr undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t1 = load <2 x i8>, ptr undef, align 2
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t2 = call <4 x i8> @llvm.vp.load.v4i8.p0(ptr undef, <4 x i1> undef, i32 undef)
-;
TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t3 = load <4 x i8>, ptr undef, align 4 -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t4 = call <8 x i8> @llvm.vp.load.v8i8.p0(ptr undef, <8 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t5 = load <8 x i8>, ptr undef, align 8 -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t6 = call <16 x i8> @llvm.vp.load.v16i8.p0(ptr undef, <16 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t7 = load <16 x i8>, ptr undef, align 16 -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t8 = call <2 x i64> @llvm.vp.load.v2i64.p0(ptr undef, <2 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t9 = load <2 x i64>, ptr undef, align 16 -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t10 = call <4 x i64> @llvm.vp.load.v4i64.p0(ptr undef, <4 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t12 = load <4 x i64>, ptr undef, align 32 -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t13 = call <8 x i64> @llvm.vp.load.v8i64.p0(ptr undef, <8 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t14 = load <8 x i64>, ptr undef, align 64 -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t15 = call <16 x i64> @llvm.vp.load.v16i64.p0(ptr undef, <16 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t16 = load <16 x i64>, ptr undef, align 128 -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t17 = call <vscale x 2 x i8> @llvm.vp.load.nxv2i8.p0(ptr undef, <vscale x 2 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t18 = load <vscale x 2 x i8>, ptr undef, align 2 -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t19 = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr undef, <vscale x 4 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t20 = load <vscale x 4 x i8>, ptr undef, align 4 -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t21 = call <vscale x 8 x i8> @llvm.vp.load.nxv8i8.p0(ptr undef, <vscale x 8 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t22 = load <vscale x 8 x i8>, ptr undef, align 8 -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t23 = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr undef, <vscale x 16 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t24 = load <vscale x 16 x i8>, ptr undef, align 16 -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t25 = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr undef, <vscale x 2 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t26 = load <vscale x 2 x i64>, ptr undef, align 16 -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t27 = call <vscale x 4 x i64> @llvm.vp.load.nxv4i64.p0(ptr undef, <vscale x 4 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated 
cost of 4 for instruction: %t28 = load <vscale x 4 x i64>, ptr undef, align 32 -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t29 = call <vscale x 8 x i64> @llvm.vp.load.nxv8i64.p0(ptr undef, <vscale x 8 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t30 = load <vscale x 8 x i64>, ptr undef, align 64 -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t31 = call <vscale x 16 x i64> @llvm.vp.load.nxv16i64.p0(ptr undef, <vscale x 16 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t32 = load <vscale x 16 x i64>, ptr undef, align 128 -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %t0 = call <2 x i8> @llvm.vp.load.v2i8(ptr undef, <2 x i1> undef, i32 undef) %t1 = load <2 x i8>, ptr undef @@ -1382,41 +942,6 @@ define void @store() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.vp.store.nxv16i64.p0(<vscale x 16 x i64> undef, ptr undef, <vscale x 16 x i1> undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: store <vscale x 16 x i64> undef, ptr undef, align 128 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; -; TYPEBASED-LABEL: 'store' -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.v2i8.p0(<2 x i8> undef, ptr undef, <2 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i8> undef, ptr undef, align 2 -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.v4i8.p0(<4 x i8> undef, ptr undef, <4 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i8> undef, ptr undef, align 4 -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.v8i8.p0(<8 x i8> undef, ptr undef, <8 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <8 x i8> undef, ptr undef, align 8 -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.v16i8.p0(<16 x i8> undef, ptr undef, <16 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <16 x i8> undef, ptr undef, align 16 -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.v2i64.p0(<2 x i64> undef, ptr undef, <2 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> undef, ptr undef, align 16 -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.vp.store.v4i64.p0(<4 x i64> undef, ptr undef, <4 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <4 x i64> undef, ptr undef, align 32 -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.vp.store.v8i64.p0(<8 x i64> undef, ptr undef, <8 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <8 x i64> undef, ptr undef, align 64 -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.vp.store.v16i64.p0(<16 x i64> undef, ptr undef, <16 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost 
Model: Found an estimated cost of 8 for instruction: store <16 x i64> undef, ptr undef, align 128 -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.nxv2i8.p0(<vscale x 2 x i8> undef, ptr undef, <vscale x 2 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 2 x i8> undef, ptr undef, align 2 -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.nxv4i8.p0(<vscale x 4 x i8> undef, ptr undef, <vscale x 4 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 4 x i8> undef, ptr undef, align 4 -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.nxv8i8.p0(<vscale x 8 x i8> undef, ptr undef, <vscale x 8 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 8 x i8> undef, ptr undef, align 8 -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> undef, ptr undef, <vscale x 16 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <vscale x 16 x i8> undef, ptr undef, align 16 -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> undef, ptr undef, <vscale x 2 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <vscale x 2 x i64> undef, ptr undef, align 16 -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.vp.store.nxv4i64.p0(<vscale x 4 x i64> undef, ptr undef, <vscale x 4 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <vscale x 4 x i64> undef, ptr undef, align 32 -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.vp.store.nxv8i64.p0(<vscale x 8 x i64> undef, ptr undef, <vscale x 8 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: store <vscale x 8 x i64> undef, ptr undef, align 64 -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.vp.store.nxv16i64.p0(<vscale x 16 x i64> undef, ptr undef, <vscale x 16 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: store <vscale x 16 x i64> undef, ptr undef, align 128 -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; call void @llvm.vp.store.v2i8(<2 x i8> undef, ptr undef, <2 x i1> undef, i32 undef) store <2 x i8> undef, ptr undef @@ -1454,58 +979,58 @@ define void @store() { } define void @strided_load() { -; CHECK-LABEL: 'strided_load' -; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %ti1_2 = call <2 x i1> @llvm.experimental.vp.strided.load.v2i1.p0.i64(ptr undef, i64 undef, <2 x i1> undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %ti1_4 = call <4 x i1> @llvm.experimental.vp.strided.load.v4i1.p0.i64(ptr undef, i64 undef, <4 x i1> undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %ti1_8 = call <8 x i1> @llvm.experimental.vp.strided.load.v8i1.p0.i64(ptr undef, i64 undef, <8 x i1> undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated 
cost of 65 for instruction: %ti1_16 = call <16 x i1> @llvm.experimental.vp.strided.load.v16i1.p0.i64(ptr undef, i64 undef, <16 x i1> undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t0 = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr undef, i64 undef, <2 x i1> undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t2 = call <4 x i8> @llvm.experimental.vp.strided.load.v4i8.p0.i64(ptr undef, i64 undef, <4 x i1> undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t4 = call <8 x i8> @llvm.experimental.vp.strided.load.v8i8.p0.i64(ptr undef, i64 undef, <8 x i1> undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t6 = call <16 x i8> @llvm.experimental.vp.strided.load.v16i8.p0.i64(ptr undef, i64 undef, <16 x i1> undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t8.a = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i64(ptr align 8 undef, i64 undef, <2 x i1> undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t10.a = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr align 8 undef, i64 undef, <4 x i1> undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t13.a = call <8 x i64> @llvm.experimental.vp.strided.load.v8i64.p0.i64(ptr align 8 undef, i64 undef, <8 x i1> undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t15.a = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.p0.i64(ptr align 8 undef, i64 undef, <16 x i1> undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t8 = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i64(ptr undef, i64 undef, <2 x i1> undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t10 = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr undef, i64 undef, <4 x i1> undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t13 = call <8 x i64> @llvm.experimental.vp.strided.load.v8i64.p0.i64(ptr undef, i64 undef, <8 x i1> undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t15 = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.p0.i64(ptr undef, i64 undef, <16 x i1> undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t17 = call <vscale x 2 x i8> @llvm.experimental.vp.strided.load.nxv2i8.p0.i64(ptr undef, i64 undef, <vscale x 2 x i1> undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t19 = call <vscale x 4 x i8> @llvm.experimental.vp.strided.load.nxv4i8.p0.i64(ptr undef, i64 undef, <vscale x 4 x i1> undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t21 = call <vscale x 8 x i8> @llvm.experimental.vp.strided.load.nxv8i8.p0.i64(ptr undef, i64 undef, <vscale x 8 x i1> undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %t23 = call <vscale x 16 x i8> @llvm.experimental.vp.strided.load.nxv16i8.p0.i64(ptr undef, i64 undef, <vscale x 16 x i1> undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t25 = call <vscale x 2 x i64> @llvm.experimental.vp.strided.load.nxv2i64.p0.i64(ptr undef, i64 undef, <vscale x 2 x i1> undef, i32 undef) -; 
CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t27 = call <vscale x 4 x i64> @llvm.experimental.vp.strided.load.nxv4i64.p0.i64(ptr undef, i64 undef, <vscale x 4 x i1> undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t29 = call <vscale x 8 x i64> @llvm.experimental.vp.strided.load.nxv8i64.p0.i64(ptr undef, i64 undef, <vscale x 8 x i1> undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %t31 = call <vscale x 16 x i64> @llvm.experimental.vp.strided.load.nxv16i64.p0.i64(ptr undef, i64 undef, <vscale x 16 x i1> undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; ARGBASED-LABEL: 'strided_load' +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %ti1_2 = call <2 x i1> @llvm.experimental.vp.strided.load.v2i1.p0.i64(ptr undef, i64 undef, <2 x i1> undef, i32 undef) +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %ti1_4 = call <4 x i1> @llvm.experimental.vp.strided.load.v4i1.p0.i64(ptr undef, i64 undef, <4 x i1> undef, i32 undef) +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %ti1_8 = call <8 x i1> @llvm.experimental.vp.strided.load.v8i1.p0.i64(ptr undef, i64 undef, <8 x i1> undef, i32 undef) +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %ti1_16 = call <16 x i1> @llvm.experimental.vp.strided.load.v16i1.p0.i64(ptr undef, i64 undef, <16 x i1> undef, i32 undef) +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t0 = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr undef, i64 undef, <2 x i1> undef, i32 undef) +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t2 = call <4 x i8> @llvm.experimental.vp.strided.load.v4i8.p0.i64(ptr undef, i64 undef, <4 x i1> undef, i32 undef) +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t4 = call <8 x i8> @llvm.experimental.vp.strided.load.v8i8.p0.i64(ptr undef, i64 undef, <8 x i1> undef, i32 undef) +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t6 = call <16 x i8> @llvm.experimental.vp.strided.load.v16i8.p0.i64(ptr undef, i64 undef, <16 x i1> undef, i32 undef) +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t8.a = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i64(ptr align 8 undef, i64 undef, <2 x i1> undef, i32 undef) +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t10.a = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr align 8 undef, i64 undef, <4 x i1> undef, i32 undef) +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t13.a = call <8 x i64> @llvm.experimental.vp.strided.load.v8i64.p0.i64(ptr align 8 undef, i64 undef, <8 x i1> undef, i32 undef) +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t15.a = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.p0.i64(ptr align 8 undef, i64 undef, <16 x i1> undef, i32 undef) +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t8 = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i64(ptr undef, i64 undef, <2 x i1> undef, i32 undef) +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t10 = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr undef, i64 undef, <4 x i1> undef, i32 undef) +; ARGBASED-NEXT: Cost Model: 
Found an estimated cost of 8 for instruction: %t13 = call <8 x i64> @llvm.experimental.vp.strided.load.v8i64.p0.i64(ptr undef, i64 undef, <8 x i1> undef, i32 undef) +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t15 = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.p0.i64(ptr undef, i64 undef, <16 x i1> undef, i32 undef) +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t17 = call <vscale x 2 x i8> @llvm.experimental.vp.strided.load.nxv2i8.p0.i64(ptr undef, i64 undef, <vscale x 2 x i1> undef, i32 undef) +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t19 = call <vscale x 4 x i8> @llvm.experimental.vp.strided.load.nxv4i8.p0.i64(ptr undef, i64 undef, <vscale x 4 x i1> undef, i32 undef) +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t21 = call <vscale x 8 x i8> @llvm.experimental.vp.strided.load.nxv8i8.p0.i64(ptr undef, i64 undef, <vscale x 8 x i1> undef, i32 undef) +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %t23 = call <vscale x 16 x i8> @llvm.experimental.vp.strided.load.nxv16i8.p0.i64(ptr undef, i64 undef, <vscale x 16 x i1> undef, i32 undef) +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t25 = call <vscale x 2 x i64> @llvm.experimental.vp.strided.load.nxv2i64.p0.i64(ptr undef, i64 undef, <vscale x 2 x i1> undef, i32 undef) +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t27 = call <vscale x 4 x i64> @llvm.experimental.vp.strided.load.nxv4i64.p0.i64(ptr undef, i64 undef, <vscale x 4 x i1> undef, i32 undef) +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t29 = call <vscale x 8 x i64> @llvm.experimental.vp.strided.load.nxv8i64.p0.i64(ptr undef, i64 undef, <vscale x 8 x i1> undef, i32 undef) +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %t31 = call <vscale x 16 x i64> @llvm.experimental.vp.strided.load.nxv16i64.p0.i64(ptr undef, i64 undef, <vscale x 16 x i1> undef, i32 undef) +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; TYPEBASED-LABEL: 'strided_load' -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %ti1_2 = call <2 x i1> @llvm.experimental.vp.strided.load.v2i1.p0.i64(ptr undef, i64 undef, <2 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %ti1_4 = call <4 x i1> @llvm.experimental.vp.strided.load.v4i1.p0.i64(ptr undef, i64 undef, <4 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %ti1_8 = call <8 x i1> @llvm.experimental.vp.strided.load.v8i1.p0.i64(ptr undef, i64 undef, <8 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %ti1_16 = call <16 x i1> @llvm.experimental.vp.strided.load.v16i1.p0.i64(ptr undef, i64 undef, <16 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %t0 = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr undef, i64 undef, <2 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %t2 = call <4 x i8> @llvm.experimental.vp.strided.load.v4i8.p0.i64(ptr undef, i64 undef, <4 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %t4 = call <8 x i8> @llvm.experimental.vp.strided.load.v8i8.p0.i64(ptr undef, i64 undef, 
<8 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %t6 = call <16 x i8> @llvm.experimental.vp.strided.load.v16i8.p0.i64(ptr undef, i64 undef, <16 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %t8.a = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i64(ptr align 8 undef, i64 undef, <2 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %t10.a = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr align 8 undef, i64 undef, <4 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %t13.a = call <8 x i64> @llvm.experimental.vp.strided.load.v8i64.p0.i64(ptr align 8 undef, i64 undef, <8 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %t15.a = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.p0.i64(ptr align 8 undef, i64 undef, <16 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %t8 = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i64(ptr undef, i64 undef, <2 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %t10 = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr undef, i64 undef, <4 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %t13 = call <8 x i64> @llvm.experimental.vp.strided.load.v8i64.p0.i64(ptr undef, i64 undef, <8 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %t15 = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.p0.i64(ptr undef, i64 undef, <16 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %t17 = call <vscale x 2 x i8> @llvm.experimental.vp.strided.load.nxv2i8.p0.i64(ptr undef, i64 undef, <vscale x 2 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %t19 = call <vscale x 4 x i8> @llvm.experimental.vp.strided.load.nxv4i8.p0.i64(ptr undef, i64 undef, <vscale x 4 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %t21 = call <vscale x 8 x i8> @llvm.experimental.vp.strided.load.nxv8i8.p0.i64(ptr undef, i64 undef, <vscale x 8 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %t23 = call <vscale x 16 x i8> @llvm.experimental.vp.strided.load.nxv16i8.p0.i64(ptr undef, i64 undef, <vscale x 16 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %t25 = call <vscale x 2 x i64> @llvm.experimental.vp.strided.load.nxv2i64.p0.i64(ptr undef, i64 undef, <vscale x 2 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %t27 = call <vscale x 4 x i64> @llvm.experimental.vp.strided.load.nxv4i64.p0.i64(ptr undef, i64 undef, <vscale x 4 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %t29 = call <vscale x 8 x i64> @llvm.experimental.vp.strided.load.nxv8i64.p0.i64(ptr undef, i64 undef, <vscale x 8 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %t31 = call <vscale x 16 x i64> @llvm.experimental.vp.strided.load.nxv16i64.p0.i64(ptr undef, i64 undef, <vscale x 16 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %ti1_2 = call <2 x i1> 
@llvm.experimental.vp.strided.load.v2i1.p0.i64(ptr undef, i64 undef, <2 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %ti1_4 = call <4 x i1> @llvm.experimental.vp.strided.load.v4i1.p0.i64(ptr undef, i64 undef, <4 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %ti1_8 = call <8 x i1> @llvm.experimental.vp.strided.load.v8i1.p0.i64(ptr undef, i64 undef, <8 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %ti1_16 = call <16 x i1> @llvm.experimental.vp.strided.load.v16i1.p0.i64(ptr undef, i64 undef, <16 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t0 = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr undef, i64 undef, <2 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t2 = call <4 x i8> @llvm.experimental.vp.strided.load.v4i8.p0.i64(ptr undef, i64 undef, <4 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t4 = call <8 x i8> @llvm.experimental.vp.strided.load.v8i8.p0.i64(ptr undef, i64 undef, <8 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t6 = call <16 x i8> @llvm.experimental.vp.strided.load.v16i8.p0.i64(ptr undef, i64 undef, <16 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t8.a = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i64(ptr align 8 undef, i64 undef, <2 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t10.a = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr align 8 undef, i64 undef, <4 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t13.a = call <8 x i64> @llvm.experimental.vp.strided.load.v8i64.p0.i64(ptr align 8 undef, i64 undef, <8 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t15.a = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.p0.i64(ptr align 8 undef, i64 undef, <16 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t8 = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i64(ptr undef, i64 undef, <2 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t10 = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr undef, i64 undef, <4 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t13 = call <8 x i64> @llvm.experimental.vp.strided.load.v8i64.p0.i64(ptr undef, i64 undef, <8 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t15 = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.p0.i64(ptr undef, i64 undef, <16 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t17 = call <vscale x 2 x i8> @llvm.experimental.vp.strided.load.nxv2i8.p0.i64(ptr undef, i64 undef, <vscale x 2 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t19 = call <vscale x 4 x i8> @llvm.experimental.vp.strided.load.nxv4i8.p0.i64(ptr undef, i64 undef, <vscale x 4 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an 
estimated cost of 16 for instruction: %t21 = call <vscale x 8 x i8> @llvm.experimental.vp.strided.load.nxv8i8.p0.i64(ptr undef, i64 undef, <vscale x 8 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %t23 = call <vscale x 16 x i8> @llvm.experimental.vp.strided.load.nxv16i8.p0.i64(ptr undef, i64 undef, <vscale x 16 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t25 = call <vscale x 2 x i64> @llvm.experimental.vp.strided.load.nxv2i64.p0.i64(ptr undef, i64 undef, <vscale x 2 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t27 = call <vscale x 4 x i64> @llvm.experimental.vp.strided.load.nxv4i64.p0.i64(ptr undef, i64 undef, <vscale x 4 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t29 = call <vscale x 8 x i64> @llvm.experimental.vp.strided.load.nxv8i64.p0.i64(ptr undef, i64 undef, <vscale x 8 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %t31 = call <vscale x 16 x i64> @llvm.experimental.vp.strided.load.nxv16i64.p0.i64(ptr undef, i64 undef, <vscale x 16 x i1> undef, i32 undef) ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %ti1_2 = call <2 x i1> @llvm.experimental.vp.strided.load.v2i1.i64(ptr undef, i64 undef, <2 x i1> undef, i32 undef) @@ -1558,29 +1083,6 @@ define void @strided_store() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.experimental.vp.strided.store.nxv8i64.p0.i64(<vscale x 8 x i64> undef, ptr undef, i64 undef, <vscale x 8 x i1> undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.experimental.vp.strided.store.nxv16i64.p0.i64(<vscale x 16 x i64> undef, ptr undef, i64 undef, <vscale x 16 x i1> undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; -; TYPEBASED-LABEL: 'strided_store' -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.experimental.vp.strided.store.v2i8.p0.i64(<2 x i8> undef, ptr undef, i64 undef, <2 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.experimental.vp.strided.store.v4i8.p0.i64(<4 x i8> undef, ptr undef, i64 undef, <4 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.experimental.vp.strided.store.v8i8.p0.i64(<8 x i8> undef, ptr undef, i64 undef, <8 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.experimental.vp.strided.store.v16i8.p0.i64(<16 x i8> undef, ptr undef, i64 undef, <16 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.experimental.vp.strided.store.v2i64.p0.i64(<2 x i64> undef, ptr undef, i64 undef, <2 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.experimental.vp.strided.store.v4i64.p0.i64(<4 x i64> undef, ptr undef, i64 undef, <4 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.experimental.vp.strided.store.v8i64.p0.i64(<8 x i64> undef, ptr undef, i64 undef, <8 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 
110 for instruction: call void @llvm.experimental.vp.strided.store.v16i64.p0.i64(<16 x i64> undef, ptr undef, i64 undef, <16 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.experimental.vp.strided.store.v2i64.p0.i64(<2 x i64> undef, ptr align 8 undef, i64 undef, <2 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.experimental.vp.strided.store.v4i64.p0.i64(<4 x i64> undef, ptr align 8 undef, i64 undef, <4 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.experimental.vp.strided.store.v8i64.p0.i64(<8 x i64> undef, ptr align 8 undef, i64 undef, <8 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.experimental.vp.strided.store.v16i64.p0.i64(<16 x i64> undef, ptr align 8 undef, i64 undef, <16 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vp.strided.store.nxv2i8.p0.i64(<vscale x 2 x i8> undef, ptr undef, i64 undef, <vscale x 2 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vp.strided.store.nxv4i8.p0.i64(<vscale x 4 x i8> undef, ptr undef, i64 undef, <vscale x 4 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vp.strided.store.nxv8i8.p0.i64(<vscale x 8 x i8> undef, ptr undef, i64 undef, <vscale x 8 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vp.strided.store.nxv16i8.p0.i64(<vscale x 16 x i8> undef, ptr undef, i64 undef, <vscale x 16 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vp.strided.store.nxv2i64.p0.i64(<vscale x 2 x i64> undef, ptr undef, i64 undef, <vscale x 2 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vp.strided.store.nxv4i64.p0.i64(<vscale x 4 x i64> undef, ptr undef, i64 undef, <vscale x 4 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vp.strided.store.nxv8i64.p0.i64(<vscale x 8 x i64> undef, ptr undef, i64 undef, <vscale x 8 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vp.strided.store.nxv16i64.p0.i64(<vscale x 16 x i64> undef, ptr undef, i64 undef, <vscale x 16 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; call void @llvm.experimental.vp.strided.store.v2i8.i64(<2 x i8> undef, ptr undef, i64 undef, <2 x i1> undef, i32 undef) call void @llvm.experimental.vp.strided.store.v4i8.i64(<4 x i8> undef, ptr undef, i64 undef, <4 x i1> undef, i32 undef) @@ -1642,41 +1144,6 @@ define void @reduce_add() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %31 = call i64 @llvm.vp.reduce.add.nxv16i64(i64 undef, <vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %32 = call i64 @llvm.vector.reduce.add.nxv16i64(<vscale x 16 x i64> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; -; TYPEBASED-LABEL: 'reduce_add' -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 3 for 
instruction: %1 = call i8 @llvm.vp.reduce.add.v2i8(i8 undef, <2 x i8> undef, <2 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %3 = call i8 @llvm.vp.reduce.add.v4i8(i8 undef, <4 x i8> undef, <4 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %5 = call i8 @llvm.vp.reduce.add.v8i8(i8 undef, <8 x i8> undef, <8 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %6 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %7 = call i8 @llvm.vp.reduce.add.v16i8(i8 undef, <16 x i8> undef, <16 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %9 = call i64 @llvm.vp.reduce.add.v2i64(i64 undef, <2 x i64> undef, <2 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %10 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %11 = call i64 @llvm.vp.reduce.add.v4i64(i64 undef, <4 x i64> undef, <4 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %12 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %13 = call i64 @llvm.vp.reduce.add.v8i64(i64 undef, <8 x i64> undef, <8 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %14 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %15 = call i64 @llvm.vp.reduce.add.v16i64(i64 undef, <16 x i64> undef, <16 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %17 = call i8 @llvm.vp.reduce.add.nxv8i8(i8 undef, <vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %18 = call i8 @llvm.vector.reduce.add.nxv2i8(<vscale x 2 x i8> undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %19 = call i8 @llvm.vp.reduce.add.nxv4i8(i8 undef, <vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %20 = call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %21 = call i8 @llvm.vp.reduce.add.nxv8i8(i8 undef, <vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %22 = call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %23 = call i8 @llvm.vp.reduce.add.nxv16i8(i8 undef, <vscale x 16 
x i8> undef, <vscale x 16 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %24 = call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %25 = call i64 @llvm.vp.reduce.add.nxv2i64(i64 undef, <vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %26 = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %27 = call i64 @llvm.vp.reduce.add.nxv4i64(i64 undef, <vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %28 = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %29 = call i64 @llvm.vp.reduce.add.nxv8i64(i64 undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %30 = call i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64> undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %31 = call i64 @llvm.vp.reduce.add.nxv16i64(i64 undef, <vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %32 = call i64 @llvm.vector.reduce.add.nxv16i64(<vscale x 16 x i64> undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; call i8 @llvm.vp.reduce.add.v2i8(i8 undef, <2 x i8> undef, <2 x i1> undef, i32 undef) call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) @@ -1748,41 +1215,6 @@ define void @reduce_fadd() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %31 = call double @llvm.vp.reduce.fadd.nxv16f64(double undef, <vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %32 = call double @llvm.vector.reduce.fadd.nxv16f64(double undef, <vscale x 16 x double> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; -; TYPEBASED-LABEL: 'reduce_fadd' -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %1 = call float @llvm.vp.reduce.fadd.v2f32(float undef, <2 x float> undef, <2 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %2 = call float @llvm.vector.reduce.fadd.v2f32(float undef, <2 x float> undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %3 = call float @llvm.vp.reduce.fadd.v4f32(float undef, <4 x float> undef, <4 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %4 = call float @llvm.vector.reduce.fadd.v4f32(float undef, <4 x float> undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %5 = call float @llvm.vp.reduce.fadd.v8f32(float undef, <8 x float> undef, <8 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %6 = call float @llvm.vector.reduce.fadd.v8f32(float undef, <8 x float> undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %7 = call float @llvm.vp.reduce.fadd.v16f32(float undef, <16 x float> undef, <16 x i1> undef, i32 undef) -; TYPEBASED-NEXT: 
Cost Model: Found an estimated cost of 18 for instruction: %8 = call float @llvm.vector.reduce.fadd.v16f32(float undef, <16 x float> undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %9 = call double @llvm.vp.reduce.fadd.v2f64(double undef, <2 x double> undef, <2 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %10 = call double @llvm.vector.reduce.fadd.v2f64(double undef, <2 x double> undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %11 = call double @llvm.vp.reduce.fadd.v4f64(double undef, <4 x double> undef, <4 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %12 = call double @llvm.vector.reduce.fadd.v4f64(double undef, <4 x double> undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %13 = call double @llvm.vp.reduce.fadd.v8f64(double undef, <8 x double> undef, <8 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %14 = call double @llvm.vector.reduce.fadd.v8f64(double undef, <8 x double> undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %15 = call double @llvm.vp.reduce.fadd.v16f64(double undef, <16 x double> undef, <16 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %16 = call double @llvm.vector.reduce.fadd.v16f64(double undef, <16 x double> undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %17 = call float @llvm.vp.reduce.fadd.nxv2f32(float undef, <vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %18 = call float @llvm.vector.reduce.fadd.nxv2f32(float undef, <vscale x 2 x float> undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %19 = call float @llvm.vp.reduce.fadd.nxv4f32(float undef, <vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %20 = call float @llvm.vector.reduce.fadd.nxv4f32(float undef, <vscale x 4 x float> undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %21 = call float @llvm.vp.reduce.fadd.nxv8f32(float undef, <vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %22 = call float @llvm.vector.reduce.fadd.nxv8f32(float undef, <vscale x 8 x float> undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %23 = call float @llvm.vp.reduce.fadd.nxv16f32(float undef, <vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %24 = call float @llvm.vector.reduce.fadd.nxv16f32(float undef, <vscale x 16 x float> undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %25 = call double @llvm.vp.reduce.fadd.nxv2f64(double undef, <vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %26 = call double @llvm.vector.reduce.fadd.nxv2f64(double undef, <vscale x 2 x double> undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %27 = call double @llvm.vp.reduce.fadd.nxv4f64(double undef, <vscale x 4 x double> undef, <vscale x 
4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %28 = call double @llvm.vector.reduce.fadd.nxv4f64(double undef, <vscale x 4 x double> undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %29 = call double @llvm.vp.reduce.fadd.nxv8f64(double undef, <vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %30 = call double @llvm.vector.reduce.fadd.nxv8f64(double undef, <vscale x 8 x double> undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %31 = call double @llvm.vp.reduce.fadd.nxv16f64(double undef, <vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %32 = call double @llvm.vector.reduce.fadd.nxv16f64(double undef, <vscale x 16 x double> undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   call float @llvm.vp.reduce.fadd.v2f32(float undef, <2 x float> undef, <2 x i1> undef, i32 undef)
   call float @llvm.vector.reduce.fadd.v2f32(float undef, <2 x float> undef)
@@ -1836,23 +1268,6 @@ define void @reduce_other() {
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %13 = call float @llvm.vp.reduce.fmax.nxv4f32(float undef, <vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %14 = call float @llvm.vp.reduce.fmaximum.nxv4f32(float undef, <vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; TYPEBASED-LABEL: 'reduce_other'
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %1 = call i32 @llvm.vp.reduce.xor.v4i32(i32 undef, <4 x i32> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %2 = call i32 @llvm.vp.reduce.and.v4i32(i32 undef, <4 x i32> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %3 = call i32 @llvm.vp.reduce.or.v4i32(i32 undef, <4 x i32> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %4 = call i32 @llvm.vp.reduce.mul.v4i32(i32 undef, <4 x i32> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %5 = call i32 @llvm.vp.reduce.smin.v4i32(i32 undef, <4 x i32> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %6 = call i32 @llvm.vp.reduce.smax.v4i32(i32 undef, <4 x i32> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %7 = call i32 @llvm.vp.reduce.umin.v4i32(i32 undef, <4 x i32> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %8 = call i32 @llvm.vp.reduce.umax.v4i32(i32 undef, <4 x i32> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %9 = call i32 @llvm.vp.reduce.mul.nxv4i32(i32 undef, <vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %10 = call float @llvm.vp.reduce.fmul.nxv4f32(float undef, <vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %11 = call float @llvm.vp.reduce.fmin.nxv4f32(float undef, <vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %12 = call float @llvm.vp.reduce.fminimum.nxv4f32(float undef, <vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %13 = call float @llvm.vp.reduce.fmax.nxv4f32(float undef, <vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %14 = call float @llvm.vp.reduce.fmaximum.nxv4f32(float undef, <vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   call i32 @llvm.vp.reduce.xor.v4i32(i32 undef, <4 x i32> undef, <4 x i1> undef, i32 undef)
   call i32 @llvm.vp.reduce.and.v4i32(i32 undef, <4 x i32> undef, <4 x i1> undef, i32 undef)
@@ -1894,26 +1309,6 @@ define void @vp_fadd(){
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %t31 = call <vscale x 16 x double> @llvm.vp.fadd.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %t32 = fadd <vscale x 16 x double> undef, undef
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; TYPEBASED-LABEL: 'vp_fadd'
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t0 = call <2 x float> @llvm.vp.fadd.v2f32(<2 x float> undef, <2 x float> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t2 = call <4 x float> @llvm.vp.fadd.v4f32(<4 x float> undef, <4 x float> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t4 = call <8 x float> @llvm.vp.fadd.v8f32(<8 x float> undef, <8 x float> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t6 = call <16 x float> @llvm.vp.fadd.v16f32(<16 x float> undef, <16 x float> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t8 = call <2 x double> @llvm.vp.fadd.v2f64(<2 x double> undef, <2 x double> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t10 = call <4 x double> @llvm.vp.fadd.v4f64(<4 x double> undef, <4 x double> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t12 = call <8 x double> @llvm.vp.fadd.v8f64(<8 x double> undef, <8 x double> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t14 = call <16 x double> @llvm.vp.fadd.v16f64(<16 x double> undef, <16 x double> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t17 = call <vscale x 2 x float> @llvm.vp.fadd.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t19 = call <vscale x 4 x float> @llvm.vp.fadd.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t21 = call <vscale x 8 x float> @llvm.vp.fadd.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t23 = call <vscale x 16 x float> @llvm.vp.fadd.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t25 = call <vscale x 2 x double> @llvm.vp.fadd.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t27 = call <vscale x 4 x double> @llvm.vp.fadd.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t29 = call <vscale x 8 x double> @llvm.vp.fadd.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %t31 = call <vscale x 16 x double> @llvm.vp.fadd.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %t32 = fadd <vscale x 16 x double> undef, undef
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %t0 = call <2 x float> @llvm.vp.fadd.v2f32(<2 x float> undef, <2 x float> undef, <2 x i1> undef, i32 undef)
   %t2 = call <4 x float> @llvm.vp.fadd.v4f32(<4 x float> undef, <4 x float> undef, <4 x i1> undef, i32 undef)
@@ -1958,25 +1353,6 @@ define void @vp_fsub(){
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t29 = call <vscale x 8 x double> @llvm.vp.fsub.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %t31 = call <vscale x 16 x double> @llvm.vp.fsub.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; TYPEBASED-LABEL: 'vp_fsub'
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t0 = call <2 x float> @llvm.vp.fsub.v2f32(<2 x float> undef, <2 x float> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t2 = call <4 x float> @llvm.vp.fsub.v4f32(<4 x float> undef, <4 x float> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t4 = call <8 x float> @llvm.vp.fsub.v8f32(<8 x float> undef, <8 x float> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t6 = call <16 x float> @llvm.vp.fsub.v16f32(<16 x float> undef, <16 x float> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t8 = call <2 x double> @llvm.vp.fsub.v2f64(<2 x double> undef, <2 x double> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t10 = call <4 x double> @llvm.vp.fsub.v4f64(<4 x double> undef, <4 x double> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t12 = call <8 x double> @llvm.vp.fsub.v8f64(<8 x double> undef, <8 x double> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t14 = call <16 x double> @llvm.vp.fsub.v16f64(<16 x double> undef, <16 x double> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t17 = call <vscale x 2 x float> @llvm.vp.fsub.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t19 = call <vscale x 4 x float> @llvm.vp.fsub.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t21 = call <vscale x 8 x float> @llvm.vp.fsub.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t23 = call <vscale x 16 x float> @llvm.vp.fsub.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t25 = call <vscale x 2 x double> @llvm.vp.fsub.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t27 = call <vscale x 4 x double> @llvm.vp.fsub.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t29 = call <vscale x 8 x double> @llvm.vp.fsub.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %t31 = call <vscale x 16 x double> @llvm.vp.fsub.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %t0 = call <2 x float> @llvm.vp.fsub.v2f32(<2 x float> undef, <2 x float> undef, <2 x i1> undef, i32 undef)
   %t2 = call <4 x float> @llvm.vp.fsub.v4f32(<4 x float> undef, <4 x float> undef, <4 x i1> undef, i32 undef)
@@ -2020,25 +1396,6 @@ define void @vp_fmul(){
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t29 = call <vscale x 8 x double> @llvm.vp.fmul.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %t31 = call <vscale x 16 x double> @llvm.vp.fmul.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; TYPEBASED-LABEL: 'vp_fmul'
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t0 = call <2 x float> @llvm.vp.fmul.v2f32(<2 x float> undef, <2 x float> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t2 = call <4 x float> @llvm.vp.fmul.v4f32(<4 x float> undef, <4 x float> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t4 = call <8 x float> @llvm.vp.fmul.v8f32(<8 x float> undef, <8 x float> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t6 = call <16 x float> @llvm.vp.fmul.v16f32(<16 x float> undef, <16 x float> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t8 = call <2 x double> @llvm.vp.fmul.v2f64(<2 x double> undef, <2 x double> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t10 = call <4 x double> @llvm.vp.fmul.v4f64(<4 x double> undef, <4 x double> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t12 = call <8 x double> @llvm.vp.fmul.v8f64(<8 x double> undef, <8 x double> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t14 = call <16 x double> @llvm.vp.fmul.v16f64(<16 x double> undef, <16 x double> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t17 = call <vscale x 2 x float> @llvm.vp.fmul.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t19 = call <vscale x 4 x float> @llvm.vp.fmul.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t21 = call <vscale x 8 x float> @llvm.vp.fmul.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t23 = call <vscale x 16 x float> @llvm.vp.fmul.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t25 = call <vscale x 2 x double> @llvm.vp.fmul.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t27 = call <vscale x 4 x double> @llvm.vp.fmul.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t29 = call <vscale x 8 x double> @llvm.vp.fmul.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %t31 = call <vscale x 16 x double> @llvm.vp.fmul.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %t0 = call <2 x float> @llvm.vp.fmul.v2f32(<2 x float> undef, <2 x float> undef, <2 x i1> undef, i32 undef)
   %t2 = call <4 x float> @llvm.vp.fmul.v4f32(<4 x float> undef, <4 x float> undef, <4 x i1> undef, i32 undef)
@@ -2082,25 +1439,6 @@ define void @vp_fdiv(){
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t29 = call <vscale x 8 x double> @llvm.vp.fdiv.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %t31 = call <vscale x 16 x double> @llvm.vp.fdiv.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; TYPEBASED-LABEL: 'vp_fdiv'
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t0 = call <2 x float> @llvm.vp.fdiv.v2f32(<2 x float> undef, <2 x float> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t2 = call <4 x float> @llvm.vp.fdiv.v4f32(<4 x float> undef, <4 x float> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t4 = call <8 x float> @llvm.vp.fdiv.v8f32(<8 x float> undef, <8 x float> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t6 = call <16 x float> @llvm.vp.fdiv.v16f32(<16 x float> undef, <16 x float> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t8 = call <2 x double> @llvm.vp.fdiv.v2f64(<2 x double> undef, <2 x double> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t10 = call <4 x double> @llvm.vp.fdiv.v4f64(<4 x double> undef, <4 x double> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t12 = call <8 x double> @llvm.vp.fdiv.v8f64(<8 x double> undef, <8 x double> undef, <8 x i1> undef, i32 undef)
########## TRUNCATED ###########