Skip to content

Commit

Permalink
Refactor GPU and Metal langauges in their own separate renderers (tin…
Browse files Browse the repository at this point in the history
…ygrad#2033)

* Refactor GPU and Metal langauges in their own separate renderers

* remove CStyleLanguage imports

* move renderers too
  • Loading branch information
Qazalin committed Oct 10, 2023
1 parent f139060 commit 71d93ff
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 16 deletions.
16 changes: 16 additions & 0 deletions tinygrad/renderer/metal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import functools
from tinygrad.renderer.cstyle import uops_to_cstyle, CStyleLanguage

class MetalLanguage(CStyleLanguage):
kernel_prefix = "#include <metal_stdlib>\nusing namespace metal;\nkernel "
buffer_prefix = "device "
smem_prefix = "threadgroup "
arg_int_prefix = "constant int&"
barrier = "threadgroup_barrier(mem_flags::mem_threadgroup);"
float4 = "float4"
uses_ptr_arithmetic=True
gid = [f"gid.{chr(120+i)}" for i in range(3)]
lid = [f"lid.{chr(120+i)}" for i in range(3)]
extra_args = ['uint3 gid [[threadgroup_position_in_grid]]', 'uint3 lid [[thread_position_in_threadgroup]]']

MetalRenderer = functools.partial(uops_to_cstyle, MetalLanguage())
19 changes: 19 additions & 0 deletions tinygrad/renderer/opencl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import functools
from tinygrad.helpers import dtypes
from tinygrad.renderer.cstyle import uops_to_cstyle, CStyleLanguage

type_map = { dtypes.uint8: "uchar", dtypes.uint32: "uint", dtypes.uint64: "ulong" }
class OpenCLLanguage(CStyleLanguage):
kernel_prefix = "__kernel "
buffer_prefix = "__global "
smem_align = "__attribute__ ((aligned (16))) "
smem_prefix = "__local "
arg_int_prefix = "const int"
half_prekernel = "#pragma OPENCL EXTENSION cl_khr_fp16 : enable"
barrier = "barrier(CLK_LOCAL_MEM_FENCE);"
float4 = "(float4)"
gid = [f'get_group_id({i})' for i in range(3)]
lid = [f'get_local_id({i})' for i in range(3)]
uses_vload=True

OpenCLRenderer = functools.partial(uops_to_cstyle, OpenCLLanguage())
11 changes: 3 additions & 8 deletions tinygrad/runtime/ops_gpu.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
from __future__ import annotations
import pathlib, functools
import pathlib
import numpy as np
import pyopencl as cl # type: ignore
from typing import Optional, List
from tinygrad.helpers import DEBUG, getenv, prod, ImageDType, OSX, fromimport
from tinygrad.ops import Compiled
from tinygrad.renderer.opencl import OpenCLRenderer
from tinygrad.runtime.lib import RawBufferCopyInOut, LRUAllocator, RawBufferTransfer
from tinygrad.codegen.kernel import LinearizerOptions
from tinygrad.renderer.cstyle import uops_to_cstyle, CStyleLanguage

OSX_TIMING_RATIO = (125/3) if OSX else 1.0 # see test/external/external_osx_profiling.py to determine this ratio. it's in like GPU clocks or something

Expand Down Expand Up @@ -103,9 +103,4 @@ def __call__(self, global_size, local_size, *bufs, wait=False) -> Optional[float
return None
return None

renderer = functools.partial(uops_to_cstyle, CStyleLanguage(
kernel_prefix = "__kernel ", buffer_prefix = "__global ", smem_align = "__attribute__ ((aligned (16))) ", smem_prefix = "__local ", arg_int_prefix = "const int",
half_prekernel = "#pragma OPENCL EXTENSION cl_khr_fp16 : enable",
barrier = "barrier(CLK_LOCAL_MEM_FENCE);", float4 = "(float4)",
gid = [f'get_group_id({i})' for i in range(3)], lid = [f'get_local_id({i})' for i in range(3)], uses_vload=True))
GPUBuffer = Compiled(CLBuffer, LinearizerOptions(), renderer, CLProgram, CL.synchronize)
GPUBuffer = Compiled(CLBuffer, LinearizerOptions(), OpenCLRenderer, CLProgram, CL.synchronize)
11 changes: 3 additions & 8 deletions tinygrad/runtime/ops_metal.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
# pip3 install pyobjc-framework-Metal pyobjc-framework-Cocoa pyobjc-framework-libdispatch
import os, subprocess, pathlib, functools, ctypes
import os, subprocess, pathlib, ctypes
import Metal, Cocoa, libdispatch # type: ignore
from typing import List, Any, Tuple
from tinygrad.codegen.kernel import LinearizerOptions
from tinygrad.renderer.cstyle import uops_to_cstyle, CStyleLanguage
from tinygrad.helpers import prod, getenv, DEBUG, DType, dtypes
from tinygrad.ops import Compiled, ASTRunner, BasicBatchExecutor
from tinygrad.renderer.metal import MetalRenderer
from tinygrad.runtime.lib import RawBufferMapped, LRUAllocator

METAL_XCODE = getenv("METAL_XCODE")
Expand Down Expand Up @@ -100,9 +100,4 @@ def __call__(self, global_size, local_size, *bufs, wait=False):
return command_buffer.GPUEndTime() - command_buffer.GPUStartTime()
METAL.mtl_buffers_in_flight.append(command_buffer)

renderer = functools.partial(uops_to_cstyle, CStyleLanguage(
kernel_prefix = "#include <metal_stdlib>\nusing namespace metal;\nkernel ", buffer_prefix = "device ", smem_prefix = "threadgroup ", arg_int_prefix = "constant int&",
barrier = "threadgroup_barrier(mem_flags::mem_threadgroup);", float4 = "float4", uses_ptr_arithmetic=True,
gid = [f"gid.{chr(120+i)}" for i in range(3)], lid = [f"lid.{chr(120+i)}" for i in range(3)],
extra_args = ['uint3 gid [[threadgroup_position_in_grid]]', 'uint3 lid [[thread_position_in_threadgroup]]']))
MetalBuffer = Compiled(RawMetalBuffer, LinearizerOptions(device="METAL"), renderer, MetalProgram, METAL.synchronize, MetalBatchExecutor)
MetalBuffer = Compiled(RawMetalBuffer, LinearizerOptions(device="METAL"), MetalRenderer, MetalProgram, METAL.synchronize, MetalBatchExecutor)

0 comments on commit 71d93ff

Please sign in to comment.