Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[inductor] refine loop split logic #128812

Draft
wants to merge 16 commits into
base: gh/zhuhaozhe/39/base
Choose a base branch
from

Conversation

zhuhaozhe
Copy link
Collaborator

@zhuhaozhe zhuhaozhe commented Jun 17, 2024

This PR aims to improves parallelization by collapsing vectorized loop. #122281

For such case, the parallel level is only 2.
And the vectorized loop cannot be collapsed.

#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(2L); x0+=static_cast<long>(1L))
{
    for(long x1=static_cast<long>(0L); x1<static_cast<long>(199984L); x1+=static_cast<long>(16L))
    {
        auto tmp0 = at::vec::VectorizedN<int64_t,2>::loadu(in_ptr0 + static_cast<long>(x1 + (199985L*x0)), 16);
        tmp0.store(out_ptr0 + static_cast<long>(x1 + (209985L*x0)), 16);
    }
    #pragma omp simd simdlen(8) 
    for(long x1=static_cast<long>(199984L); x1<static_cast<long>(199985L); x1+=static_cast<long>(1L))
    {
        auto tmp0 = in_ptr0[static_cast<long>(x1 + (199985L*x0))];
        out_ptr0[static_cast<long>(x1 + (209985L*x0))] = tmp0;
    }
}

After this PR, we will gen code

#pragma omp for collapse(2)
for(long x0=static_cast<long>(0L); x0<static_cast<long>(2L); x0+=static_cast<long>(1L))
{
    for(long x1=static_cast<long>(0L); x1<static_cast<long>(199985L); x1+=static_cast<long>(16L))
    {
        if (x1 >= 0 && x1 <199984) {
            auto tmp0 = at::vec::VectorizedN<int64_t,2>::loadu(in_ptr0 + static_cast<long>(x1 + (199985L*x0)), 16);
            tmp0.store(out_ptr0 + static_cast<long>(x1 + (209985L*x0)), 16);
        }
        if (x1 >= 199984 && x1 <199985) {
            auto tmp0 = in_ptr0[static_cast<long>(x1 + (199985L*x0))];
            out_ptr0[static_cast<long>(x1 + (209985L*x0))] = tmp0;
        }
    }
}

Highlight

For reduction case, we have some side-effect here.
For below case, we vectorized x1 dim and reduction at x2 dim.

#pragma omp for
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(39L); x0+=static_cast<int64_t>(1L))
{
    for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(16L); x1+=static_cast<int64_t>(8L))
    {
        {
            float tmp_acc0 = -std::numeric_limits<float>::infinity();
            at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(-std::numeric_limits<float>::infinity());
            for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(18L); x2+=static_cast<int64_t>(1L))
            {
                auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<int64_t>(x1 + (17L*x2) + (306L*x0)), 8);
                tmp_acc0_vec = at::vec::maximum(tmp_acc0_vec, tmp0);
            }
            [&]
            {
                __at_align__ std::array<float, 8> tmpbuf;
                tmp_acc0_vec.store(tmpbuf.data(), 8);
                #pragma GCC unroll 8
                for (long x1_inner = 0; x1_inner < 8; x1_inner++)
                {
                    out_ptr1[static_cast<int64_t>(x0 + (39L*x1) + (39L*x1_inner))] = tmpbuf[x1_inner];
                }
            }
            ()
            ;
        }
    }
    #pragma omp simd simdlen(4) 
    for(int64_t x1=static_cast<int64_t>(16L); x1<static_cast<int64_t>(17L); x1+=static_cast<int64_t>(1L))
    {
        {
            float tmp_acc0 = -std::numeric_limits<float>::infinity();
            for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(18L); x2+=static_cast<int64_t>(1L))
            {
                auto tmp0 = in_ptr1[static_cast<int64_t>(x1 + (17L*x2) + (306L*x0))];
                tmp_acc0 = max_propagate_nan(tmp_acc0, tmp0);
            }
            out_ptr1[static_cast<int64_t>(x0 + (39L*x1))] = tmp_acc0;
        }
    }
}

After collapse, the loop order will be x1 -> x2 -> x1_tail_part, thus we will need a tmp_acc_arr to store the reduction result for x1_tail_part. And for reduction_stores, we also need to check x1's value like what we do in the loopbody since the reduction_stores happened between x1 and x2 loops.

#pragma omp for collapse(2)
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(39L); x0+=static_cast<int64_t>(1L))
{
    for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(17L); x1+=static_cast<int64_t>(8L))
    {
        {
            float tmp_acc0_arr[8];           ######### need an array to hold acc result for tail part
            for (int i = 0; i < 8; i++)
            {
                tmp_acc0_arr[i] = -std::numeric_limits<float>::infinity();
            }
            float tmp_acc0 = -std::numeric_limits<float>::infinity();
            at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(-std::numeric_limits<float>::infinity());
            for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(18L); x2+=static_cast<int64_t>(1L))
            {
                {
                    if(C10_LIKELY(x1 >= static_cast<int64_t>(0) && x1 < static_cast<int64_t>(16L)))
                    {
                        auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<int64_t>(x1 + (17L*x2) + (306L*x0)), 8);
                        tmp_acc0_vec = at::vec::maximum(tmp_acc0_vec, tmp0);
                    }
                    if(C10_UNLIKELY(x1 >= static_cast<int64_t>(16L) && x1 < static_cast<int64_t>(17L)))
                    {
                        for (long x1_tail = static_cast<int64_t>(16L); x1_tail < static_cast<int64_t>(17L); x1_tail++)
                        {
                            auto tmp0 = in_ptr1[static_cast<int64_t>(x1_tail + (17L*x2) + (306L*x0))];
                            tmp_acc0_arr[x1_tail - static_cast<int64_t>(16L)] = max_propagate_nan(tmp_acc0_arr[x1_tail - static_cast<int64_t>(16L)], tmp0);
                        }
                    }
                }
            }

            ############### reduction stores
            if(C10_LIKELY(x1 >= static_cast<int64_t>(0) && x1 < static_cast<int64_t>(16L)))
            {
                [&]
                {
                    __at_align__ std::array<float, 8> tmpbuf;
                    tmp_acc0_vec.store(tmpbuf.data(), 8);
                    #pragma GCC unroll 8
                    for (long x1_inner = 0; x1_inner < 8; x1_inner++)
                    {
                        out_ptr1[static_cast<int64_t>(x0 + (39L*x1) + (39L*x1_inner))] = tmpbuf[x1_inner];
                    }
                }
                ()
                ;
            }
            if(C10_UNLIKELY(x1 >= static_cast<int64_t>(16L) && x1 < static_cast<int64_t>(17L)))
            {
                for (long x1_tail = static_cast<int64_t>(16L); x1_tail < static_cast<int64_t>(17L); x1_tail++)
                {
                    out_ptr1[static_cast<int64_t>(x0 + (39L*x1_tail))] = tmp_acc0_arr[x1_tail - static_cast<int64_t>(16L)];
                }
            }
        }
    }
}

Stack from ghstack (oldest at bottom):

cc @voznesenskym @penguinwu @EikanWang @jgong5 @Guobing-Chen @XiaobingSuper @blzheng @wenzhe-nrv @jiayisunx @peterbell10 @ipiszy @yf225 @chenyang78 @kadeng @muchulee8 @ColinPeppler @amjames @desertfire @chauhang

Copy link

pytorch-bot bot commented Jun 17, 2024

🔗 Helpful Links

🧪 See artifacts and rendered test results at hud.pytorch.org/pr/128812

Note: Links to docs will display an error until the docs builds have been completed.

❌ 1 New Failure

As of commit 685e7d1 with merge base 32f45f0 (image):

NEW FAILURE - The following job has failed:

This comment was automatically generated by Dr. CI and updates every 15 minutes.

zhuhaozhe added a commit that referenced this pull request Jun 17, 2024
ghstack-source-id: a0ffb42b1c0b2159b72f278aa4184ab75325cd03
Pull Request resolved: #128812
[ghstack-poisoned]
@zhuhaozhe zhuhaozhe marked this pull request as draft July 17, 2024 07:47
zhuhaozhe added a commit to zhuhaozhe/pytorch that referenced this pull request Jul 24, 2024
ghstack-source-id: a0ffb42b1c0b2159b72f278aa4184ab75325cd03
Pull Request resolved: pytorch#128812
zhuhaozhe added a commit to zhuhaozhe/pytorch that referenced this pull request Jul 24, 2024
ghstack-source-id: a0ffb42b1c0b2159b72f278aa4184ab75325cd03
Pull Request resolved: pytorch#128812
zhuhaozhe added a commit that referenced this pull request Jul 25, 2024
ghstack-source-id: ae8e67d681d811c0cd0ed703d186ddbe8e39f854
Pull Request resolved: #128812
[ghstack-poisoned]
zhuhaozhe added a commit to zhuhaozhe/pytorch that referenced this pull request Jul 26, 2024
ghstack-source-id: ae8e67d681d811c0cd0ed703d186ddbe8e39f854
Pull Request resolved: pytorch#128812
zhuhaozhe added a commit to zhuhaozhe/pytorch that referenced this pull request Jul 27, 2024
ghstack-source-id: ae8e67d681d811c0cd0ed703d186ddbe8e39f854
Pull Request resolved: pytorch#128812
zhuhaozhe added a commit that referenced this pull request Aug 16, 2024
ghstack-source-id: ff1dcca4bbb2cf3100f86bf622b492f73df3ad16
Pull Request resolved: #128812
[ghstack-poisoned]
zhuhaozhe added a commit that referenced this pull request Aug 16, 2024
ghstack-source-id: 39d237a5cf04be275029125ef488469b2f430dda
Pull Request resolved: #128812
[ghstack-poisoned]
zhuhaozhe added a commit that referenced this pull request Aug 16, 2024
ghstack-source-id: 6baf7b0426bbcc1ea0c06180b393ecb4619bb59d
Pull Request resolved: #128812
[ghstack-poisoned]
zhuhaozhe added a commit that referenced this pull request Aug 16, 2024
ghstack-source-id: 8254f219519f68724f941713938b04d9d44c53ac
Pull Request resolved: #128812
[ghstack-poisoned]
zhuhaozhe added a commit that referenced this pull request Aug 29, 2024
ghstack-source-id: 470238141e894f1cd0ea1c798987c229020dccf4
Pull Request resolved: #128812
[ghstack-poisoned]
assert deepest_proxy is not None
return deepest_proxy

deepest_proxy = find_deepest_proxy(cpp_kernel_proxy_list)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why we have to find the deepest kernel proxy?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We do not need it now since we will let the LoopNest in OuterFusedKernel start from depth 0 and we do not need fusion depth.
Previously we will loss the LoopLevel to gen if we do not choose the deepest kernel proxy here.

zhuhaozhe added a commit that referenced this pull request Sep 1, 2024
ghstack-source-id: 7c3963eca96d94f8708064acff585d141e097332
Pull Request resolved: #128812
[ghstack-poisoned]
zhuhaozhe added a commit that referenced this pull request Sep 1, 2024
ghstack-source-id: 1c26422a26460a6d862fbdd8bde1a5401b950b01
Pull Request resolved: #128812
[ghstack-poisoned]
zhuhaozhe added a commit that referenced this pull request Sep 2, 2024
ghstack-source-id: d3a393e324eb6c991988876f7a030bb502b6c8c2
Pull Request resolved: #128812
[ghstack-poisoned]
zhuhaozhe added a commit that referenced this pull request Sep 2, 2024
ghstack-source-id: 7ad611d5f734ea372a5b151fb838e1c870bd2965
Pull Request resolved: #128812
[ghstack-poisoned]
@zhuhaozhe zhuhaozhe added the topic: not user facing topic category label Sep 2, 2024
zhuhaozhe added a commit that referenced this pull request Sep 4, 2024
ghstack-source-id: c1813f3d77fd592337afdd5680fa81855a8af8d5
Pull Request resolved: #128812
[ghstack-poisoned]
zhuhaozhe added a commit that referenced this pull request Sep 5, 2024
ghstack-source-id: 75e5bb00666a71450e4fd3f23238f3d67258194d
Pull Request resolved: #128812
[ghstack-poisoned]
zhuhaozhe added a commit that referenced this pull request Sep 5, 2024
ghstack-source-id: 194ee307738c834fa2c1a54a19a2ae32ffcd35c6
Pull Request resolved: #128812
[ghstack-poisoned]
zhuhaozhe added a commit that referenced this pull request Sep 5, 2024
ghstack-source-id: 037af8d2d6965266a54d4be8c7e50296cd4f6422
Pull Request resolved: #128812
[ghstack-poisoned]
zhuhaozhe added a commit that referenced this pull request Sep 5, 2024
ghstack-source-id: a6eb457e0c029cf912fc404d6246dfb58a747c7d
Pull Request resolved: #128812
[ghstack-poisoned]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

None yet

3 participants