[inductor] refine loop split logic #128812

zhuhaozhe · 2024-06-17T06:19:41Z

This PR aims to improve parallelization by collapsing the vectorized loop (see #122281).

In the case below, the available parallelism is only 2 (the outer `x0` loop has just two iterations),
and the vectorized loop cannot be collapsed into it.

#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(2L); x0+=static_cast<long>(1L))
{
    for(long x1=static_cast<long>(0L); x1<static_cast<long>(199984L); x1+=static_cast<long>(16L))
    {
        auto tmp0 = at::vec::VectorizedN<int64_t,2>::loadu(in_ptr0 + static_cast<long>(x1 + (199985L*x0)), 16);
        tmp0.store(out_ptr0 + static_cast<long>(x1 + (209985L*x0)), 16);
    }
    #pragma omp simd simdlen(8) 
    for(long x1=static_cast<long>(199984L); x1<static_cast<long>(199985L); x1+=static_cast<long>(1L))
    {
        auto tmp0 = in_ptr0[static_cast<long>(x1 + (199985L*x0))];
        out_ptr0[static_cast<long>(x1 + (209985L*x0))] = tmp0;
    }
}

After this PR, we generate the following code instead:

#pragma omp for collapse(2)
for(long x0=static_cast<long>(0L); x0<static_cast<long>(2L); x0+=static_cast<long>(1L))
{
    for(long x1=static_cast<long>(0L); x1<static_cast<long>(199985L); x1+=static_cast<long>(16L))
    {
        if (x1 >= 0 && x1 <199984) {
            auto tmp0 = at::vec::VectorizedN<int64_t,2>::loadu(in_ptr0 + static_cast<long>(x1 + (199985L*x0)), 16);
            tmp0.store(out_ptr0 + static_cast<long>(x1 + (209985L*x0)), 16);
        }
        if (x1 >= 199984 && x1 <199985) {
            auto tmp0 = in_ptr0[static_cast<long>(x1 + (199985L*x0))];
            out_ptr0[static_cast<long>(x1 + (209985L*x0))] = tmp0;
        }
    }
}

Highlight

For the reduction case, this change has some side effects.
In the case below, we vectorize along the x1 dimension and reduce along the x2 dimension.

#pragma omp for
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(39L); x0+=static_cast<int64_t>(1L))
{
    for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(16L); x1+=static_cast<int64_t>(8L))
    {
        {
            float tmp_acc0 = -std::numeric_limits<float>::infinity();
            at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(-std::numeric_limits<float>::infinity());
            for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(18L); x2+=static_cast<int64_t>(1L))
            {
                auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<int64_t>(x1 + (17L*x2) + (306L*x0)), 8);
                tmp_acc0_vec = at::vec::maximum(tmp_acc0_vec, tmp0);
            }
            [&]
            {
                __at_align__ std::array<float, 8> tmpbuf;
                tmp_acc0_vec.store(tmpbuf.data(), 8);
                #pragma GCC unroll 8
                for (long x1_inner = 0; x1_inner < 8; x1_inner++)
                {
                    out_ptr1[static_cast<int64_t>(x0 + (39L*x1) + (39L*x1_inner))] = tmpbuf[x1_inner];
                }
            }
            ()
            ;
        }
    }
    #pragma omp simd simdlen(4) 
    for(int64_t x1=static_cast<int64_t>(16L); x1<static_cast<int64_t>(17L); x1+=static_cast<int64_t>(1L))
    {
        {
            float tmp_acc0 = -std::numeric_limits<float>::infinity();
            for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(18L); x2+=static_cast<int64_t>(1L))
            {
                auto tmp0 = in_ptr1[static_cast<int64_t>(x1 + (17L*x2) + (306L*x0))];
                tmp_acc0 = max_propagate_nan(tmp_acc0, tmp0);
            }
            out_ptr1[static_cast<int64_t>(x0 + (39L*x1))] = tmp_acc0;
        }
    }
}

After collapsing, the loop order becomes x1 -> x2 -> x1_tail_part, so we need a tmp_acc_arr to hold the reduction results for x1_tail_part. For the reduction_stores, we also need to check the value of x1, just as we do in the loop body, because the reduction_stores are emitted between the x1 and x2 loops.

#pragma omp for collapse(2)
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(39L); x0+=static_cast<int64_t>(1L))
{
    for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(17L); x1+=static_cast<int64_t>(8L))
    {
        {
            float tmp_acc0_arr[8];           // need an array to hold the accumulated result for the tail part
            for (int i = 0; i < 8; i++)
            {
                tmp_acc0_arr[i] = -std::numeric_limits<float>::infinity();
            }
            float tmp_acc0 = -std::numeric_limits<float>::infinity();
            at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(-std::numeric_limits<float>::infinity());
            for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(18L); x2+=static_cast<int64_t>(1L))
            {
                {
                    if(C10_LIKELY(x1 >= static_cast<int64_t>(0) && x1 < static_cast<int64_t>(16L)))
                    {
                        auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<int64_t>(x1 + (17L*x2) + (306L*x0)), 8);
                        tmp_acc0_vec = at::vec::maximum(tmp_acc0_vec, tmp0);
                    }
                    if(C10_UNLIKELY(x1 >= static_cast<int64_t>(16L) && x1 < static_cast<int64_t>(17L)))
                    {
                        for (long x1_tail = static_cast<int64_t>(16L); x1_tail < static_cast<int64_t>(17L); x1_tail++)
                        {
                            auto tmp0 = in_ptr1[static_cast<int64_t>(x1_tail + (17L*x2) + (306L*x0))];
                            tmp_acc0_arr[x1_tail - static_cast<int64_t>(16L)] = max_propagate_nan(tmp_acc0_arr[x1_tail - static_cast<int64_t>(16L)], tmp0);
                        }
                    }
                }
            }

            // reduction stores
            if(C10_LIKELY(x1 >= static_cast<int64_t>(0) && x1 < static_cast<int64_t>(16L)))
            {
                [&]
                {
                    __at_align__ std::array<float, 8> tmpbuf;
                    tmp_acc0_vec.store(tmpbuf.data(), 8);
                    #pragma GCC unroll 8
                    for (long x1_inner = 0; x1_inner < 8; x1_inner++)
                    {
                        out_ptr1[static_cast<int64_t>(x0 + (39L*x1) + (39L*x1_inner))] = tmpbuf[x1_inner];
                    }
                }
                ()
                ;
            }
            if(C10_UNLIKELY(x1 >= static_cast<int64_t>(16L) && x1 < static_cast<int64_t>(17L)))
            {
                for (long x1_tail = static_cast<int64_t>(16L); x1_tail < static_cast<int64_t>(17L); x1_tail++)
                {
                    out_ptr1[static_cast<int64_t>(x0 + (39L*x1_tail))] = tmp_acc0_arr[x1_tail - static_cast<int64_t>(16L)];
                }
            }
        }
    }
}

Stack from ghstack (oldest at bottom):

-> [inductor] refine loop split logic #128812

cc @voznesenskym @penguinwu @EikanWang @jgong5 @Guobing-Chen @XiaobingSuper @blzheng @wenzhe-nrv @jiayisunx @peterbell10 @ipiszy @yf225 @chenyang78 @kadeng @muchulee8 @ColinPeppler @amjames @desertfire @chauhang

pytorch-bot · 2024-06-17T06:19:44Z

🔗 Helpful Links

🧪 See artifacts and rendered test results at hud.pytorch.org/pr/128812

📄 Preview Python docs built from this PR
📄 Preview C++ docs built from this PR
❓ Need help or want to give feedback on the CI? Visit the bot commands wiki or our office hours

Note: Links to docs will display an error until the docs builds have been completed.

❌ 1 New Failure

As of commit 685e7d1 with merge base 32f45f0 ():

NEW FAILURE - The following job has failed:

Check mergeability of ghstack PR / ghstack-mergeability-check (gh)
RuntimeError: Command git -C /home/runner/work/pytorch/pytorch cherry-pick -x 4ebb4d0 returned non-zero exit code 1

This comment was automatically generated by Dr. CI and updates every 15 minutes.

ghstack-source-id: a0ffb42b1c0b2159b72f278aa4184ab75325cd03 Pull Request resolved: #128812

[ghstack-poisoned]

ghstack-source-id: a0ffb42b1c0b2159b72f278aa4184ab75325cd03 Pull Request resolved: pytorch#128812

ghstack-source-id: ae8e67d681d811c0cd0ed703d186ddbe8e39f854 Pull Request resolved: #128812

[ghstack-poisoned]

ghstack-source-id: ae8e67d681d811c0cd0ed703d186ddbe8e39f854 Pull Request resolved: pytorch#128812

ghstack-source-id: ff1dcca4bbb2cf3100f86bf622b492f73df3ad16 Pull Request resolved: #128812

[ghstack-poisoned]

ghstack-source-id: 39d237a5cf04be275029125ef488469b2f430dda Pull Request resolved: #128812

[ghstack-poisoned]

ghstack-source-id: 6baf7b0426bbcc1ea0c06180b393ecb4619bb59d Pull Request resolved: #128812

[ghstack-poisoned]

ghstack-source-id: 8254f219519f68724f941713938b04d9d44c53ac Pull Request resolved: #128812

[ghstack-poisoned]

ghstack-source-id: 470238141e894f1cd0ea1c798987c229020dccf4 Pull Request resolved: #128812

[ghstack-poisoned]

leslie-fang-intel · 2024-08-29T12:52:48Z

torch/_inductor/codegen/cpp.py

+ assert deepest_proxy is not None
+ return deepest_proxy
+
+ deepest_proxy = find_deepest_proxy(cpp_kernel_proxy_list)


Why do we have to find the deepest kernel proxy?

We do not need it anymore, since the LoopNest in OuterFusedKernel now starts from depth 0 and the fusion depth is no longer required.
Previously, we would lose the LoopLevel to generate if we did not choose the deepest kernel proxy here.

ghstack-source-id: 7c3963eca96d94f8708064acff585d141e097332 Pull Request resolved: #128812

[ghstack-poisoned]

ghstack-source-id: 1c26422a26460a6d862fbdd8bde1a5401b950b01 Pull Request resolved: #128812

[ghstack-poisoned]

ghstack-source-id: d3a393e324eb6c991988876f7a030bb502b6c8c2 Pull Request resolved: #128812

[ghstack-poisoned]

ghstack-source-id: 7ad611d5f734ea372a5b151fb838e1c870bd2965 Pull Request resolved: #128812

[ghstack-poisoned]

ghstack-source-id: c1813f3d77fd592337afdd5680fa81855a8af8d5 Pull Request resolved: #128812

[ghstack-poisoned]

ghstack-source-id: 75e5bb00666a71450e4fd3f23238f3d67258194d Pull Request resolved: #128812

[ghstack-poisoned]

ghstack-source-id: 194ee307738c834fa2c1a54a19a2ae32ffcd35c6 Pull Request resolved: #128812

[ghstack-poisoned]

ghstack-source-id: 037af8d2d6965266a54d4be8c7e50296cd4f6422 Pull Request resolved: #128812

[ghstack-poisoned]

ghstack-source-id: a6eb457e0c029cf912fc404d6246dfb58a747c7d Pull Request resolved: #128812

[ghstack-poisoned]

pytorch-bot bot added ciflow/inductor module: inductor labels Jun 17, 2024

zhuhaozhe added a commit that referenced this pull request Jun 17, 2024

[inductor] refine loop split logic

5413bb8

ghstack-source-id: a0ffb42b1c0b2159b72f278aa4184ab75325cd03 Pull Request resolved: #128812

Update

1b424d0

[ghstack-poisoned]

pytorchbot added the open source label Jun 17, 2024

zhuhaozhe marked this pull request as draft July 17, 2024 07:47

zhuhaozhe added a commit to zhuhaozhe/pytorch that referenced this pull request Jul 24, 2024

[inductor] refine loop split logic

c3d519c

ghstack-source-id: a0ffb42b1c0b2159b72f278aa4184ab75325cd03 Pull Request resolved: pytorch#128812

zhuhaozhe added a commit to zhuhaozhe/pytorch that referenced this pull request Jul 24, 2024

[inductor] refine loop split logic

0153704

ghstack-source-id: a0ffb42b1c0b2159b72f278aa4184ab75325cd03 Pull Request resolved: pytorch#128812

zhuhaozhe added a commit that referenced this pull request Jul 25, 2024

[inductor] refine loop split logic

93d10ed

ghstack-source-id: ae8e67d681d811c0cd0ed703d186ddbe8e39f854 Pull Request resolved: #128812

Update

fa2080d

[ghstack-poisoned]

zhuhaozhe added a commit to zhuhaozhe/pytorch that referenced this pull request Jul 26, 2024

[inductor] refine loop split logic

5e05640

ghstack-source-id: ae8e67d681d811c0cd0ed703d186ddbe8e39f854 Pull Request resolved: pytorch#128812

zhuhaozhe added a commit to zhuhaozhe/pytorch that referenced this pull request Jul 27, 2024

[inductor] refine loop split logic

6cc8326

ghstack-source-id: ae8e67d681d811c0cd0ed703d186ddbe8e39f854 Pull Request resolved: pytorch#128812

zhuhaozhe added a commit that referenced this pull request Aug 16, 2024

[inductor] refine loop split logic

abbb08f

ghstack-source-id: ff1dcca4bbb2cf3100f86bf622b492f73df3ad16 Pull Request resolved: #128812

Update

f5edeb5

[ghstack-poisoned]

zhuhaozhe added a commit that referenced this pull request Aug 16, 2024

[inductor] refine loop split logic

038f273

ghstack-source-id: 39d237a5cf04be275029125ef488469b2f430dda Pull Request resolved: #128812

Update

594e303

[ghstack-poisoned]

zhuhaozhe added a commit that referenced this pull request Aug 16, 2024

[inductor] refine loop split logic

9acccc8

ghstack-source-id: 6baf7b0426bbcc1ea0c06180b393ecb4619bb59d Pull Request resolved: #128812

Update

d0d1807

[ghstack-poisoned]

zhuhaozhe added a commit that referenced this pull request Aug 16, 2024

[inductor] refine loop split logic

cfcf8f8

ghstack-source-id: 8254f219519f68724f941713938b04d9d44c53ac Pull Request resolved: #128812

Update

5080505

[ghstack-poisoned]

This was referenced Aug 20, 2024

[inductor] refine loop split logic #131438

Closed

[inductor] refine loop split logic #124060

Draft

leslie-fang-intel mentioned this pull request Aug 22, 2024

[inductor][cpu]performance regression in 2024-08-18 nightly release #134094

Open

zhuhaozhe added a commit that referenced this pull request Aug 29, 2024

[inductor] refine loop split logic

162d06d

ghstack-source-id: 470238141e894f1cd0ea1c798987c229020dccf4 Pull Request resolved: #128812

Update

d08d8f3

[ghstack-poisoned]

leslie-fang-intel mentioned this pull request Aug 29, 2024

Optimize the Loop Structure of Inductor CPP backend to enhance parallelization #134740

Open

leslie-fang-intel reviewed Aug 29, 2024

View reviewed changes

zhuhaozhe added a commit that referenced this pull request Sep 1, 2024

[inductor] refine loop split logic

1c20c42

ghstack-source-id: 7c3963eca96d94f8708064acff585d141e097332 Pull Request resolved: #128812

Update

43f3c55

[ghstack-poisoned]

zhuhaozhe added a commit that referenced this pull request Sep 1, 2024

[inductor] refine loop split logic

00d165a

ghstack-source-id: 1c26422a26460a6d862fbdd8bde1a5401b950b01 Pull Request resolved: #128812

Update

6c3d886

[ghstack-poisoned]

zhuhaozhe added a commit that referenced this pull request Sep 2, 2024

[inductor] refine loop split logic

e4809bd

ghstack-source-id: d3a393e324eb6c991988876f7a030bb502b6c8c2 Pull Request resolved: #128812

Update

d7609ea

[ghstack-poisoned]

zhuhaozhe added a commit that referenced this pull request Sep 2, 2024

[inductor] refine loop split logic

eabf0d1

ghstack-source-id: 7ad611d5f734ea372a5b151fb838e1c870bd2965 Pull Request resolved: #128812

Update

afe8ace

[ghstack-poisoned]

zhuhaozhe added the topic: not user facing topic category label Sep 2, 2024

zhuhaozhe requested review from leslie-fang-intel and jgong5 September 3, 2024 02:04

zhuhaozhe added a commit that referenced this pull request Sep 4, 2024

[inductor] refine loop split logic

2dfcf3b

ghstack-source-id: c1813f3d77fd592337afdd5680fa81855a8af8d5 Pull Request resolved: #128812

Update

0c4c292

[ghstack-poisoned]

zhuhaozhe added a commit that referenced this pull request Sep 5, 2024

[inductor] refine loop split logic

6e00a56

ghstack-source-id: 75e5bb00666a71450e4fd3f23238f3d67258194d Pull Request resolved: #128812

Update

3361334

[ghstack-poisoned]

zhuhaozhe added a commit that referenced this pull request Sep 5, 2024

[inductor] refine loop split logic

6b5e134

ghstack-source-id: 194ee307738c834fa2c1a54a19a2ae32ffcd35c6 Pull Request resolved: #128812

Update

6e0715a

[ghstack-poisoned]

zhuhaozhe added a commit that referenced this pull request Sep 5, 2024

[inductor] refine loop split logic

b694429

ghstack-source-id: 037af8d2d6965266a54d4be8c7e50296cd4f6422 Pull Request resolved: #128812

Update

1e9dc69

[ghstack-poisoned]

zhuhaozhe added a commit that referenced this pull request Sep 5, 2024

[inductor] refine loop split logic

4ebb4d0

ghstack-source-id: a6eb457e0c029cf912fc404d6246dfb58a747c7d Pull Request resolved: #128812

Update

685e7d1

[ghstack-poisoned]

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[inductor] refine loop split logic #128812

[inductor] refine loop split logic #128812

zhuhaozhe commented Jun 17, 2024 •

edited

Loading

pytorch-bot bot commented Jun 17, 2024 •

edited

Loading

leslie-fang-intel Aug 29, 2024

zhuhaozhe Sep 3, 2024

[inductor] refine loop split logic #128812

Are you sure you want to change the base?

[inductor] refine loop split logic #128812

Conversation

zhuhaozhe commented Jun 17, 2024 • edited Loading

Highlight

pytorch-bot bot commented Jun 17, 2024 • edited Loading

🔗 Helpful Links

🧪 See artifacts and rendered test results at hud.pytorch.org/pr/128812

❌ 1 New Failure

leslie-fang-intel Aug 29, 2024

Choose a reason for hiding this comment

zhuhaozhe Sep 3, 2024

Choose a reason for hiding this comment

zhuhaozhe commented Jun 17, 2024 •

edited

Loading

pytorch-bot bot commented Jun 17, 2024 •

edited

Loading