
# Autogenerated by mlir-tblgen; don't manually edit.

from ._ods_common import _cext as _ods_cext
from ._ods_common import (
    equally_sized_accessor as _ods_equally_sized_accessor,
    get_default_loc_context as _ods_get_default_loc_context,
    get_op_results_or_values as _get_op_results_or_values,
    segmented_accessor as _ods_segmented_accessor,
)
_ods_ir = _ods_cext.ir
_ods_cext.globals.register_traceback_file_exclusion(__file__)

import builtins
from typing import Sequence as _Sequence, Union as _Union, Optional as _Optional


@_ods_cext.register_dialect
class _Dialect(_ods_ir.Dialect):
  # Registration anchor: every op class below attaches to this namespace.
  DIALECT_NAMESPACE = "gpu"

@_ods_cext.register_operation(_Dialect)
class AllReduceOp(_ods_ir.OpView):
  r"""
  The `all_reduce` op reduces the value of every work item across a local
  workgroup. The result is equal for all work items of a workgroup.

  For example, both

  ```mlir
  %1 = gpu.all_reduce add %0 {} : (f32) -> (f32)
  %2 = gpu.all_reduce %0 {
  ^bb(%lhs : f32, %rhs : f32):
    %sum = arith.addf %lhs, %rhs : f32
    "gpu.yield"(%sum) : (f32) -> ()
  } : (f32) -> (f32)
  ```

  compute the sum of each work item's %0 value. The first version specifies
  the accumulation as operation, whereas the second version specifies the
  accumulation as code region. The reduction operation must be one of:
  *  Integer types: `add`, `mul`, `minui`, `minsi`, `maxui`, `maxsi`, `and`,
     `or`, `xor`
  *  Floating point types: `add`, `mul`, `minnumf`, `maxnumf`, `minimumf`,
     `maximumf`

  If `uniform` flag is set either none or all work items of a workgroup
  need to execute this op in convergence.
  """

  OPERATION_NAME = "gpu.all_reduce"

  # Region spec consumed by OpView.__init__: this op carries one region
  # (the optional accumulation body).
  _ODS_REGIONS = (1, True)

  def __init__(self, value, *, op=None, uniform=None, results=None, loc=None, ip=None):
    """Builds a `gpu.all_reduce`; `results` defaults to `value`'s type."""
    operands = []
    attributes = {}
    regions = None
    operands.append(value)
    _ods_context = _ods_get_default_loc_context(loc)
    # Pass `op` through unchanged when it is already an Attribute (or when no
    # builder is registered for the enum type); otherwise convert it with the
    # registered AttrBuilder so plain Python values are accepted.
    if op is not None: attributes["op"] = (op if (
        isinstance(op, _ods_ir.Attribute) or
        not _ods_ir.AttrBuilder.contains('GPU_AllReduceOperationAttr')) else
          _ods_ir.AttrBuilder.get('GPU_AllReduceOperationAttr')(op, context=_ods_context))
    # `uniform` is a UnitAttr: its mere presence encodes the flag.
    if bool(uniform): attributes["uniform"] = _ods_ir.UnitAttr.get(
      _ods_get_default_loc_context(loc))
    # Single result whose type defaults to the reduced operand's type.
    if results is None: results = [operands[0].type] * 1
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def value(self) -> _ods_ir.Value:
    # The single operand being reduced.
    return self.operation.operands[0]

  @builtins.property
  def op(self) -> _Optional[_ods_ir.Attribute]:
    # Optional reduction-kind attribute; absent when a body region is used.
    if "op" not in self.operation.attributes:
      return None
    return self.operation.attributes["op"]

  @op.setter
  def op(self, value: _Optional[_ods_ir.Attribute]):
    # Assigning None removes the optional attribute.
    if value is not None:
      self.operation.attributes["op"] = value
    elif "op" in self.operation.attributes:
      del self.operation.attributes["op"]

  @op.deleter
  def op(self):
    del self.operation.attributes["op"]

  @builtins.property
  def uniform(self) -> bool:
    # UnitAttr: the attribute's presence is the boolean value.
    return "uniform" in self.operation.attributes

  @uniform.setter
  def uniform(self, value):
    if bool(value):
      self.operation.attributes["uniform"] = _ods_ir.UnitAttr.get()
    elif "uniform" in self.operation.attributes:
      del self.operation.attributes["uniform"]

  @uniform.deleter
  def uniform(self):
    del self.operation.attributes["uniform"]

  @builtins.property
  def result(self) -> _ods_ir.OpResult:
    return self.operation.results[0]

  @builtins.property
  def body(self) -> _ods_ir.Region:
    # The accumulation region (alternative to the `op` attribute).
    return self.regions[0]

def all_reduce(value, *, op=None, uniform=None, results=None, loc=None, ip=None) -> _ods_ir.OpResult:
  """Create a `gpu.all_reduce` op and return its single result value."""
  reduce_op = AllReduceOp(
      value=value, op=op, uniform=uniform, results=results, loc=loc, ip=ip)
  return reduce_op.result

@_ods_cext.register_operation(_Dialect)
class AllocOp(_ods_ir.OpView):
  r"""
  The `gpu.alloc` operation allocates a region of memory on the GPU. It is
  similar to the `memref.alloc` op, but supports asynchronous GPU execution.

  The op does not execute before all async dependencies have finished
  executing.

  If the `async` keyword is present, the op is executed asynchronously (i.e.
  it does not block until the execution has finished on the device). In
  that case, it also returns a !gpu.async.token.

  If the `host_shared` keyword is present, the memory will be allocated in a
  memory accessible both on host and on device.

  Example:

  ```mlir
  %memref, %token = gpu.alloc async [%dep] host_shared (%width) : memref<64x?xf32, 1>
  ```
  """

  OPERATION_NAME = "gpu.alloc"

  # Three operand groups (asyncDependencies, dynamicSizes, symbolOperands);
  # -1 marks a variable-length group. Actual sizes are resolved at accessor
  # time via the op's `operandSegmentSizes` attribute.
  _ODS_OPERAND_SEGMENTS = [-1,-1,-1,]

  # No regions on this op.
  _ODS_REGIONS = (0, True)

  def __init__(self, memref, asyncToken, asyncDependencies, dynamicSizes, symbolOperands, *, hostShared=None, loc=None, ip=None):
    """Builds a `gpu.alloc`; pass `asyncToken=None` for the synchronous form."""
    operands = []
    attributes = {}
    regions = None
    # One entry per declared operand group; each variadic group is appended
    # as its own (possibly empty) sequence of values.
    operands.append(_get_op_results_or_values(asyncDependencies))
    operands.append(_get_op_results_or_values(dynamicSizes))
    operands.append(_get_op_results_or_values(symbolOperands))
    _ods_context = _ods_get_default_loc_context(loc)
    # `hostShared` is a UnitAttr: presence alone encodes the flag.
    if bool(hostShared): attributes["hostShared"] = _ods_ir.UnitAttr.get(
      _ods_get_default_loc_context(loc))
    results = []
    results.append(memref)
    # The optional !gpu.async.token result is only present in async form.
    if asyncToken is not None: results.append(asyncToken)
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def asyncDependencies(self) -> _ods_ir.OpOperandList:
    # Slice of operands for segment 0, per `operandSegmentSizes`.
    operand_range = _ods_segmented_accessor(
         self.operation.operands,
         self.operation.attributes["operandSegmentSizes"], 0)
    return operand_range

  @builtins.property
  def dynamicSizes(self) -> _ods_ir.OpOperandList:
    # Slice of operands for segment 1, per `operandSegmentSizes`.
    operand_range = _ods_segmented_accessor(
         self.operation.operands,
         self.operation.attributes["operandSegmentSizes"], 1)
    return operand_range

  @builtins.property
  def symbolOperands(self) -> _ods_ir.OpOperandList:
    # Slice of operands for segment 2, per `operandSegmentSizes`.
    operand_range = _ods_segmented_accessor(
         self.operation.operands,
         self.operation.attributes["operandSegmentSizes"], 2)
    return operand_range

  @builtins.property
  def hostShared(self) -> bool:
    # UnitAttr: the attribute's presence is the boolean value.
    return "hostShared" in self.operation.attributes

  @hostShared.setter
  def hostShared(self, value):
    if bool(value):
      self.operation.attributes["hostShared"] = _ods_ir.UnitAttr.get()
    elif "hostShared" in self.operation.attributes:
      del self.operation.attributes["hostShared"]

  @hostShared.deleter
  def hostShared(self):
    del self.operation.attributes["hostShared"]

  @builtins.property
  def memref(self) -> _ods_ir.OpResult[_ods_ir.MemRefType]:
    # The allocated buffer — always result 0.
    return self.operation.results[0]

  @builtins.property
  def asyncToken(self) -> _Optional[_ods_ir.OpResult]:
    # Present only when the op was built in async form.
    return None if len(self.operation.results) < 2 else self.operation.results[1]

def alloc(memref, async_token, async_dependencies, dynamic_sizes, symbol_operands, *, host_shared=None, loc=None, ip=None) -> _Union[_ods_ir.OpResult, _ods_ir.OpResultList, AllocOp]:
  """Create a `gpu.alloc` op; return its results (value or list), or the op if it has none."""
  op = AllocOp(
      memref=memref,
      asyncToken=async_token,
      asyncDependencies=async_dependencies,
      dynamicSizes=dynamic_sizes,
      symbolOperands=symbol_operands,
      hostShared=host_shared,
      loc=loc,
      ip=ip)
  results = op.results
  if len(results) > 1:
    return results
  if len(results) == 1:
    return results[0]
  return op

@_ods_cext.register_operation(_Dialect)
class BarrierOp(_ods_ir.OpView):
  r"""
  The `barrier` op synchronizes all work items of a workgroup. It is used
  to coordinate communication between the work items of the workgroup.

  ```mlir
  gpu.barrier
  ```

  waits until all work items in the workgroup have reached this point
  and all memory accesses made by these work items prior to the op are
  visible to all work items in the workgroup. Data hazards between work items
  accessing the same memory can be avoided by synchronizing work items
  in-between these accesses.

  Either none or all work items of a workgroup need to execute this op
  in convergence.
  """

  OPERATION_NAME = "gpu.barrier"

  # No regions on this op.
  _ODS_REGIONS = (0, True)

  def __init__(self, *, loc=None, ip=None):
    """Builds a `gpu.barrier`; the op takes no operands, attributes or results."""
    operands = []
    attributes = {}
    regions = None
    _ods_context = _ods_get_default_loc_context(loc)
    results = []
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

def barrier(*, loc=None, ip=None) -> BarrierOp:
  """Create a `gpu.barrier` op; the op itself is returned since it has no results."""
  op = BarrierOp(loc=loc, ip=ip)
  return op

@_ods_cext.register_operation(_Dialect)
class BinaryOp(_ods_ir.OpView):
  r"""
  GPU binaries provide a semantic mechanism for storing GPU objects,
  e.g. the result of compiling a GPU module to an object file.

  This operation has 3 arguments:
   - The name of the binary.
   - An optional attribute implementing the offloading LLVM translation interface.
   - An array of GPU object attributes.

  During translation, the offloading attribute will be called for translating
  GPU `binary` and `launch_func` operations. The default offloading handler is:
  `#gpu.select_object`, this handler selects the first object from the array
  and embeds it as a string.

  Examples:
  ```
    // Selects the first object.
    gpu.binary @myobject [#gpu.object<...>, #gpu.object<...>]
    // Uses the `#foo.my_handler` for handling the binary during translation.
    gpu.binary @myobject <#foo.my_handler> [#gpu.object<...>, #gpu.object<...>]
    // Selects the object with the `#rocdl.target` target attribute.
    gpu.binary @myobject <#gpu.select_object<#rocdl.target>> [#gpu.object<...>, #gpu.object<#rocdl.target, ...>]
  ```
  """

  OPERATION_NAME = "gpu.binary"

  # No regions on this op.
  _ODS_REGIONS = (0, True)

  def __init__(self, sym_name, objects, *, offloadingHandler=None, loc=None, ip=None):
    """Builds a `gpu.binary`; `sym_name` and `objects` are mandatory attributes."""
    operands = []
    attributes = {}
    regions = None
    _ods_context = _ods_get_default_loc_context(loc)
    # For each attribute: pass the value through when it is already an
    # Attribute (or when no builder is registered); otherwise run the
    # registered AttrBuilder so plain Python values are accepted.
    attributes["sym_name"] = (sym_name if (
    isinstance(sym_name, _ods_ir.Attribute) or
    not _ods_ir.AttrBuilder.contains('SymbolNameAttr')) else
      _ods_ir.AttrBuilder.get('SymbolNameAttr')(sym_name, context=_ods_context))
    if offloadingHandler is not None: attributes["offloadingHandler"] = (offloadingHandler if (
        isinstance(offloadingHandler, _ods_ir.Attribute) or
        not _ods_ir.AttrBuilder.contains('AnyAttr')) else
          _ods_ir.AttrBuilder.get('AnyAttr')(offloadingHandler, context=_ods_context))
    attributes["objects"] = (objects if (
    isinstance(objects, _ods_ir.Attribute) or
    not _ods_ir.AttrBuilder.contains('GPUObjectArrayAttr')) else
      _ods_ir.AttrBuilder.get('GPUObjectArrayAttr')(objects, context=_ods_context))
    results = []
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def sym_name(self) -> _ods_ir.StringAttr:
    # Mandatory symbol name of the binary.
    return self.operation.attributes["sym_name"]

  @sym_name.setter
  def sym_name(self, value: _ods_ir.StringAttr):
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["sym_name"] = value

  @builtins.property
  def offloadingHandler(self) -> _Optional[_ods_ir.Attribute]:
    # Optional offloading-translation attribute; None when unset.
    if "offloadingHandler" not in self.operation.attributes:
      return None
    return self.operation.attributes["offloadingHandler"]

  @offloadingHandler.setter
  def offloadingHandler(self, value: _Optional[_ods_ir.Attribute]):
    # Assigning None removes the optional attribute.
    if value is not None:
      self.operation.attributes["offloadingHandler"] = value
    elif "offloadingHandler" in self.operation.attributes:
      del self.operation.attributes["offloadingHandler"]

  @offloadingHandler.deleter
  def offloadingHandler(self):
    del self.operation.attributes["offloadingHandler"]

  @builtins.property
  def objects(self) -> _ods_ir.ArrayAttr:
    # Mandatory array of GPU object attributes.
    return self.operation.attributes["objects"]

  @objects.setter
  def objects(self, value: _ods_ir.ArrayAttr):
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["objects"] = value

def binary(sym_name, objects, *, offloading_handler=None, loc=None, ip=None) -> BinaryOp:
  """Create a `gpu.binary` op; the op itself is returned since it has no results."""
  op = BinaryOp(
      sym_name=sym_name,
      objects=objects,
      offloadingHandler=offloading_handler,
      loc=loc,
      ip=ip)
  return op

@_ods_cext.register_operation(_Dialect)
class BlockDimOp(_ods_ir.OpView):
  r"""
  Returns the number of threads in the thread block (aka the block size) along
  the x, y, or z `dimension`.

  Example:

  ```mlir
  %bDimX = gpu.block_dim x
  ```

  If `known_block_size` is set on this operation's enclosing `gpu.func`,
  or `gpu.known_block_size` is set on an enclosing `FunctionOpInterface`
  implementor, or if the enclosing `gpu.launch` specifies a constant size for
  `dimension`'s blocks, these contextual facts may be used to infer that this
  operation has a constant value, though such a transformation will not be
  performed by canonicalization or the default constant folder. Executions which
  cause that constant-value assumption to be false incur undefined behavior.

  If `upper_bound` is set, executions where the block size along `dimension`
  exceeds `upper_bound` cause undefined behavior.

  There is an implicit upper bound of `kMaxDim` (currently uint32_t::max).
  """

  OPERATION_NAME = "gpu.block_dim"

  # No regions on this op.
  _ODS_REGIONS = (0, True)

  def __init__(self, dimension, *, upper_bound=None, results=None, loc=None, ip=None):
    """Builds a `gpu.block_dim` for the given x/y/z `dimension`."""
    operands = []
    attributes = {}
    regions = None
    _ods_context = _ods_get_default_loc_context(loc)
    # Convert `dimension` with the registered enum AttrBuilder unless it is
    # already an Attribute (or no builder is registered).
    attributes["dimension"] = (dimension if (
    isinstance(dimension, _ods_ir.Attribute) or
    not _ods_ir.AttrBuilder.contains('GPU_DimensionAttr')) else
      _ods_ir.AttrBuilder.get('GPU_DimensionAttr')(dimension, context=_ods_context))
    # Optional index bound; converted via the IndexAttr builder when given as
    # a plain Python value.
    if upper_bound is not None: attributes["upper_bound"] = (upper_bound if (
        isinstance(upper_bound, _ods_ir.Attribute) or
        not _ods_ir.AttrBuilder.contains('IndexAttr')) else
          _ods_ir.AttrBuilder.get('IndexAttr')(upper_bound, context=_ods_context))
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def dimension(self) -> _ods_ir.Attribute:
    # Mandatory x/y/z dimension selector.
    return self.operation.attributes["dimension"]

  @dimension.setter
  def dimension(self, value: _ods_ir.Attribute):
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["dimension"] = value

  @builtins.property
  def upper_bound(self) -> _Optional[_ods_ir.IntegerAttr]:
    # Optional UB bound on the block size; None when unset.
    if "upper_bound" not in self.operation.attributes:
      return None
    return self.operation.attributes["upper_bound"]

  @upper_bound.setter
  def upper_bound(self, value: _Optional[_ods_ir.IntegerAttr]):
    # Assigning None removes the optional attribute.
    if value is not None:
      self.operation.attributes["upper_bound"] = value
    elif "upper_bound" in self.operation.attributes:
      del self.operation.attributes["upper_bound"]

  @upper_bound.deleter
  def upper_bound(self):
    del self.operation.attributes["upper_bound"]

def block_dim(dimension, *, upper_bound=None, results=None, loc=None, ip=None) -> _ods_ir.OpResult:
  """Create a `gpu.block_dim` op and return its single result value."""
  op = BlockDimOp(
      dimension=dimension, upper_bound=upper_bound, results=results,
      loc=loc, ip=ip)
  return op.result

@_ods_cext.register_operation(_Dialect)
class BlockIdOp(_ods_ir.OpView):
  r"""
  Returns the block id, i.e. the index of the current block within the grid
  along the x, y, or z `dimension`.

  Example:

  ```mlir
  %bIdY = gpu.block_id y
  ```

  If `upper_bound` is set, or if one can be inferred from `known_grid_size`-type
  annotations in context, executions where the block index in `dimension` would
  be greater than or equal to that bound cause undefined behavior. `upper_bound`
  takes priority over bounds inferrable from context.

  There is an implicit upper bound of `kMaxDim` (currently uint32_t::max).
  """

  OPERATION_NAME = "gpu.block_id"

  # No regions on this op.
  _ODS_REGIONS = (0, True)

  def __init__(self, dimension, *, upper_bound=None, results=None, loc=None, ip=None):
    """Builds a `gpu.block_id` for the given x/y/z `dimension`."""
    operands = []
    attributes = {}
    regions = None
    _ods_context = _ods_get_default_loc_context(loc)
    # Convert `dimension` with the registered enum AttrBuilder unless it is
    # already an Attribute (or no builder is registered).
    attributes["dimension"] = (dimension if (
    isinstance(dimension, _ods_ir.Attribute) or
    not _ods_ir.AttrBuilder.contains('GPU_DimensionAttr')) else
      _ods_ir.AttrBuilder.get('GPU_DimensionAttr')(dimension, context=_ods_context))
    # Optional index bound; converted via the IndexAttr builder when given as
    # a plain Python value.
    if upper_bound is not None: attributes["upper_bound"] = (upper_bound if (
        isinstance(upper_bound, _ods_ir.Attribute) or
        not _ods_ir.AttrBuilder.contains('IndexAttr')) else
          _ods_ir.AttrBuilder.get('IndexAttr')(upper_bound, context=_ods_context))
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def dimension(self) -> _ods_ir.Attribute:
    # Mandatory x/y/z dimension selector.
    return self.operation.attributes["dimension"]

  @dimension.setter
  def dimension(self, value: _ods_ir.Attribute):
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["dimension"] = value

  @builtins.property
  def upper_bound(self) -> _Optional[_ods_ir.IntegerAttr]:
    # Optional UB bound on the block index; None when unset.
    if "upper_bound" not in self.operation.attributes:
      return None
    return self.operation.attributes["upper_bound"]

  @upper_bound.setter
  def upper_bound(self, value: _Optional[_ods_ir.IntegerAttr]):
    # Assigning None removes the optional attribute.
    if value is not None:
      self.operation.attributes["upper_bound"] = value
    elif "upper_bound" in self.operation.attributes:
      del self.operation.attributes["upper_bound"]

  @upper_bound.deleter
  def upper_bound(self):
    del self.operation.attributes["upper_bound"]

def block_id(dimension, *, upper_bound=None, results=None, loc=None, ip=None) -> _ods_ir.OpResult:
  """Create a `gpu.block_id` op and return its single result value."""
  op = BlockIdOp(
      dimension=dimension, upper_bound=upper_bound, results=results,
      loc=loc, ip=ip)
  return op.result

@_ods_cext.register_operation(_Dialect)
class ClusterBlockIdOp(_ods_ir.OpView):
  r"""
  Returns the block id within the cluster along the x, y, or z `dimension`.

  Example:

  ```mlir
  %cBlockIdY = gpu.cluster_block_id y
  ```

  If `upper_bound` is set, then executing (a lowering of) this operation in an
  environment where the number of thread blocks per cluster along `dimension`
  is greater than `upper_bound` causes undefined behavior.

  There is an implicit upper bound of `kMaxClusterDim` (currently 8).
  """

  OPERATION_NAME = "gpu.cluster_block_id"

  # No regions on this op.
  _ODS_REGIONS = (0, True)

  def __init__(self, dimension, *, upper_bound=None, results=None, loc=None, ip=None):
    """Builds a `gpu.cluster_block_id` for the given x/y/z `dimension`."""
    operands = []
    attributes = {}
    regions = None
    _ods_context = _ods_get_default_loc_context(loc)
    # Convert `dimension` with the registered enum AttrBuilder unless it is
    # already an Attribute (or no builder is registered).
    attributes["dimension"] = (dimension if (
    isinstance(dimension, _ods_ir.Attribute) or
    not _ods_ir.AttrBuilder.contains('GPU_DimensionAttr')) else
      _ods_ir.AttrBuilder.get('GPU_DimensionAttr')(dimension, context=_ods_context))
    # Optional index bound; converted via the IndexAttr builder when given as
    # a plain Python value.
    if upper_bound is not None: attributes["upper_bound"] = (upper_bound if (
        isinstance(upper_bound, _ods_ir.Attribute) or
        not _ods_ir.AttrBuilder.contains('IndexAttr')) else
          _ods_ir.AttrBuilder.get('IndexAttr')(upper_bound, context=_ods_context))
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def dimension(self) -> _ods_ir.Attribute:
    # Mandatory x/y/z dimension selector.
    return self.operation.attributes["dimension"]

  @dimension.setter
  def dimension(self, value: _ods_ir.Attribute):
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["dimension"] = value

  @builtins.property
  def upper_bound(self) -> _Optional[_ods_ir.IntegerAttr]:
    # Optional UB bound; None when unset.
    if "upper_bound" not in self.operation.attributes:
      return None
    return self.operation.attributes["upper_bound"]

  @upper_bound.setter
  def upper_bound(self, value: _Optional[_ods_ir.IntegerAttr]):
    # Assigning None removes the optional attribute.
    if value is not None:
      self.operation.attributes["upper_bound"] = value
    elif "upper_bound" in self.operation.attributes:
      del self.operation.attributes["upper_bound"]

  @upper_bound.deleter
  def upper_bound(self):
    del self.operation.attributes["upper_bound"]

def cluster_block_id(dimension, *, upper_bound=None, results=None, loc=None, ip=None) -> _ods_ir.OpResult:
  """Create a `gpu.cluster_block_id` op and return its single result value."""
  op = ClusterBlockIdOp(
      dimension=dimension, upper_bound=upper_bound, results=results,
      loc=loc, ip=ip)
  return op.result

@_ods_cext.register_operation(_Dialect)
class ClusterDimBlocksOp(_ods_ir.OpView):
  r"""
  Returns the number of thread blocks in the cluster along
  the x, y, or z `dimension`.

  Example:

  ```mlir
  %cDimBlocksX = gpu.cluster_dim_blocks x
  ```

  If `upper_bound` is set, then executing (a lowering of) this operation in an
  environment where the thread blocks per cluster is greater than `upper_bound`
  causes undefined behavior.

  There is an implicit upper bound of `kMaxClusterDim` (currently 8).
  """

  OPERATION_NAME = "gpu.cluster_dim_blocks"

  # No regions on this op.
  _ODS_REGIONS = (0, True)

  def __init__(self, dimension, *, upper_bound=None, results=None, loc=None, ip=None):
    """Builds a `gpu.cluster_dim_blocks` for the given x/y/z `dimension`."""
    operands = []
    attributes = {}
    regions = None
    _ods_context = _ods_get_default_loc_context(loc)
    # Convert `dimension` with the registered enum AttrBuilder unless it is
    # already an Attribute (or no builder is registered).
    attributes["dimension"] = (dimension if (
    isinstance(dimension, _ods_ir.Attribute) or
    not _ods_ir.AttrBuilder.contains('GPU_DimensionAttr')) else
      _ods_ir.AttrBuilder.get('GPU_DimensionAttr')(dimension, context=_ods_context))
    # Optional index bound; converted via the IndexAttr builder when given as
    # a plain Python value.
    if upper_bound is not None: attributes["upper_bound"] = (upper_bound if (
        isinstance(upper_bound, _ods_ir.Attribute) or
        not _ods_ir.AttrBuilder.contains('IndexAttr')) else
          _ods_ir.AttrBuilder.get('IndexAttr')(upper_bound, context=_ods_context))
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def dimension(self) -> _ods_ir.Attribute:
    # Mandatory x/y/z dimension selector.
    return self.operation.attributes["dimension"]

  @dimension.setter
  def dimension(self, value: _ods_ir.Attribute):
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["dimension"] = value

  @builtins.property
  def upper_bound(self) -> _Optional[_ods_ir.IntegerAttr]:
    # Optional UB bound; None when unset.
    if "upper_bound" not in self.operation.attributes:
      return None
    return self.operation.attributes["upper_bound"]

  @upper_bound.setter
  def upper_bound(self, value: _Optional[_ods_ir.IntegerAttr]):
    # Assigning None removes the optional attribute.
    if value is not None:
      self.operation.attributes["upper_bound"] = value
    elif "upper_bound" in self.operation.attributes:
      del self.operation.attributes["upper_bound"]

  @upper_bound.deleter
  def upper_bound(self):
    del self.operation.attributes["upper_bound"]

def cluster_dim_blocks(dimension, *, upper_bound=None, results=None, loc=None, ip=None) -> _ods_ir.OpResult:
  """Create a `gpu.cluster_dim_blocks` op and return its single result value."""
  op = ClusterDimBlocksOp(
      dimension=dimension, upper_bound=upper_bound, results=results,
      loc=loc, ip=ip)
  return op.result

@_ods_cext.register_operation(_Dialect)
class ClusterDimOp(_ods_ir.OpView):
  r"""
  Returns the number of cluster identifiers per grid along
  the x, y, or z `dimension`.

  Example:

  ```mlir
  %cDimX = gpu.cluster_dim x
  ```

  If `upper_bound` is set, then executing (a lowering of) this operation in an
  environment where the clusters per grid is greater than `upper_bound` causes
  undefined behavior.

  There is an implicit upper bound of `kMaxDim` (currently uint32_t::max).
  """

  OPERATION_NAME = "gpu.cluster_dim"

  # No regions on this op.
  _ODS_REGIONS = (0, True)

  def __init__(self, dimension, *, upper_bound=None, results=None, loc=None, ip=None):
    """Builds a `gpu.cluster_dim` for the given x/y/z `dimension`."""
    operands = []
    attributes = {}
    regions = None
    _ods_context = _ods_get_default_loc_context(loc)
    # Convert `dimension` with the registered enum AttrBuilder unless it is
    # already an Attribute (or no builder is registered).
    attributes["dimension"] = (dimension if (
    isinstance(dimension, _ods_ir.Attribute) or
    not _ods_ir.AttrBuilder.contains('GPU_DimensionAttr')) else
      _ods_ir.AttrBuilder.get('GPU_DimensionAttr')(dimension, context=_ods_context))
    # Optional index bound; converted via the IndexAttr builder when given as
    # a plain Python value.
    if upper_bound is not None: attributes["upper_bound"] = (upper_bound if (
        isinstance(upper_bound, _ods_ir.Attribute) or
        not _ods_ir.AttrBuilder.contains('IndexAttr')) else
          _ods_ir.AttrBuilder.get('IndexAttr')(upper_bound, context=_ods_context))
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def dimension(self) -> _ods_ir.Attribute:
    # Mandatory x/y/z dimension selector.
    return self.operation.attributes["dimension"]

  @dimension.setter
  def dimension(self, value: _ods_ir.Attribute):
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["dimension"] = value

  @builtins.property
  def upper_bound(self) -> _Optional[_ods_ir.IntegerAttr]:
    # Optional UB bound; None when unset.
    if "upper_bound" not in self.operation.attributes:
      return None
    return self.operation.attributes["upper_bound"]

  @upper_bound.setter
  def upper_bound(self, value: _Optional[_ods_ir.IntegerAttr]):
    # Assigning None removes the optional attribute.
    if value is not None:
      self.operation.attributes["upper_bound"] = value
    elif "upper_bound" in self.operation.attributes:
      del self.operation.attributes["upper_bound"]

  @upper_bound.deleter
  def upper_bound(self):
    del self.operation.attributes["upper_bound"]

def cluster_dim(dimension, *, upper_bound=None, results=None, loc=None, ip=None) -> _ods_ir.OpResult:
  """Create a `gpu.cluster_dim` op and return its single result value."""
  op = ClusterDimOp(
      dimension=dimension, upper_bound=upper_bound, results=results,
      loc=loc, ip=ip)
  return op.result

@_ods_cext.register_operation(_Dialect)
class ClusterIdOp(_ods_ir.OpView):
  r"""
  Returns the cluster id, i.e. the index of the current cluster within the
  grid along the x, y, or z `dimension`.

  Example:

  ```mlir
  %cIdY = gpu.cluster_id y
  ```

  If `upper_bound` is set, then executing (a lowering of) this operation in an
  environment where the number of clusters in the grid along `dimension` is
  greater than `upper_bound` causes undefined behavior.

  There is an implicit upper bound of `kMaxDim` (currently uint32_t::max).
  """

  OPERATION_NAME = "gpu.cluster_id"

  # No regions on this op.
  _ODS_REGIONS = (0, True)

  def __init__(self, dimension, *, upper_bound=None, results=None, loc=None, ip=None):
    """Builds a `gpu.cluster_id` for the given x/y/z `dimension`."""
    operands = []
    attributes = {}
    regions = None
    _ods_context = _ods_get_default_loc_context(loc)
    # Convert `dimension` with the registered enum AttrBuilder unless it is
    # already an Attribute (or no builder is registered).
    attributes["dimension"] = (dimension if (
    isinstance(dimension, _ods_ir.Attribute) or
    not _ods_ir.AttrBuilder.contains('GPU_DimensionAttr')) else
      _ods_ir.AttrBuilder.get('GPU_DimensionAttr')(dimension, context=_ods_context))
    # Optional index bound; converted via the IndexAttr builder when given as
    # a plain Python value.
    if upper_bound is not None: attributes["upper_bound"] = (upper_bound if (
        isinstance(upper_bound, _ods_ir.Attribute) or
        not _ods_ir.AttrBuilder.contains('IndexAttr')) else
          _ods_ir.AttrBuilder.get('IndexAttr')(upper_bound, context=_ods_context))
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def dimension(self) -> _ods_ir.Attribute:
    # Mandatory x/y/z dimension selector.
    return self.operation.attributes["dimension"]

  @dimension.setter
  def dimension(self, value: _ods_ir.Attribute):
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["dimension"] = value

  @builtins.property
  def upper_bound(self) -> _Optional[_ods_ir.IntegerAttr]:
    # Optional UB bound; None when unset.
    if "upper_bound" not in self.operation.attributes:
      return None
    return self.operation.attributes["upper_bound"]

  @upper_bound.setter
  def upper_bound(self, value: _Optional[_ods_ir.IntegerAttr]):
    # Assigning None removes the optional attribute.
    if value is not None:
      self.operation.attributes["upper_bound"] = value
    elif "upper_bound" in self.operation.attributes:
      del self.operation.attributes["upper_bound"]

  @upper_bound.deleter
  def upper_bound(self):
    del self.operation.attributes["upper_bound"]

def cluster_id(dimension, *, upper_bound=None, results=None, loc=None, ip=None) -> _ods_ir.OpResult:
  """Create a `gpu.cluster_id` op and return its single result value."""
  op = ClusterIdOp(
      dimension=dimension, upper_bound=upper_bound, results=results,
      loc=loc, ip=ip)
  return op.result

@_ods_cext.register_operation(_Dialect)
class Create2To4SpMatOp(_ods_ir.OpView):
  r"""
  The `gpu.create_2to4_spmat` operation initializes a sparse matrix in dense
  format with 2:4 sparsity.
  The buffers must already be copied from the host to the device prior to
  using this operation. The operation returns a handle to the sparse
  matrix descriptor.

  If the `async` keyword is present, the op is executed asynchronously (i.e.
  it does not block until the execution has finished on the device). In
  that case, it returns a !gpu.async.token in addition to the environment.

  Example:

  ```mlir
  %spmat, %token = gpu.create_2to4_spmat async [%dep] {PRUNE_AND_CHECK} %rows, %cols, %mem: memref<?xf64>
  ```
  """

  OPERATION_NAME = "gpu.create_2to4_spmat"

  # No regions on this op.
  _ODS_REGIONS = (0, True)

  def __init__(self, spMat, asyncToken, asyncDependencies, rows, cols, memref, *, pruneFlag=None, loc=None, ip=None):
    """Builds a `gpu.create_2to4_spmat`; pass `asyncToken=None` for the sync form."""
    operands = []
    attributes = {}
    regions = None
    # Only the leading group is variadic, so operands are stored flat: the
    # async dependencies first, then the three fixed operands.
    operands.extend(_get_op_results_or_values(asyncDependencies))
    operands.append(rows)
    operands.append(cols)
    operands.append(memref)
    _ods_context = _ods_get_default_loc_context(loc)
    # Pass `pruneFlag` through when it is already an Attribute (or no builder
    # is registered); otherwise run the registered enum AttrBuilder.
    if pruneFlag is not None: attributes["pruneFlag"] = (pruneFlag if (
        isinstance(pruneFlag, _ods_ir.Attribute) or
        not _ods_ir.AttrBuilder.contains('GPU_Prune2To4SpMatFlagAttr')) else
          _ods_ir.AttrBuilder.get('GPU_Prune2To4SpMatFlagAttr')(pruneFlag, context=_ods_context))
    results = []
    results.append(spMat)
    # The optional !gpu.async.token result is only present in async form.
    if asyncToken is not None: results.append(asyncToken)
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def asyncDependencies(self) -> _ods_ir.OpOperandList:
    # 4 operand groups are declared, 1 of them variadic: the length of the
    # leading variadic group absorbs any operands beyond the 3 fixed ones.
    _ods_variadic_group_length = len(self.operation.operands) - 4 + 1
    return self.operation.operands[0:0 + _ods_variadic_group_length]

  @builtins.property
  def rows(self) -> _ods_ir.Value[_ods_ir.IndexType]:
    # First fixed operand, located right after the variadic group.
    _ods_variadic_group_length = len(self.operation.operands) - 4 + 1
    return self.operation.operands[1 + _ods_variadic_group_length - 1]

  @builtins.property
  def cols(self) -> _ods_ir.Value[_ods_ir.IndexType]:
    # Second fixed operand after the variadic group.
    _ods_variadic_group_length = len(self.operation.operands) - 4 + 1
    return self.operation.operands[2 + _ods_variadic_group_length - 1]

  @builtins.property
  def memref(self) -> _ods_ir.Value[_ods_ir.MemRefType]:
    # Third fixed operand after the variadic group: the dense buffer.
    _ods_variadic_group_length = len(self.operation.operands) - 4 + 1
    return self.operation.operands[3 + _ods_variadic_group_length - 1]

  @builtins.property
  def pruneFlag(self) -> _ods_ir.Attribute:
    # NOTE(review): the attribute is optional at build time (see __init__),
    # but this generated getter does a plain lookup and raises KeyError when
    # the attribute is absent.
    return self.operation.attributes["pruneFlag"]

  @pruneFlag.setter
  def pruneFlag(self, value: _ods_ir.Attribute):
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["pruneFlag"] = value

  @builtins.property
  def spMat(self) -> _ods_ir.OpResult:
    # Handle to the sparse matrix descriptor — always result 0.
    return self.operation.results[0]

  @builtins.property
  def asyncToken(self) -> _Optional[_ods_ir.OpResult]:
    # Present only when the op was built in async form.
    return None if len(self.operation.results) < 2 else self.operation.results[1]

def create_2to4_spmat(sp_mat, async_token, async_dependencies, rows, cols, memref, *, prune_flag=None, loc=None, ip=None) -> _Union[_ods_ir.OpResult, _ods_ir.OpResultList, Create2To4SpMatOp]:
  """Build a Create2To4SpMatOp and unwrap its result(s).

  Returns the result list when the op has several results, the sole result
  when it has exactly one, and the op itself when it has none.
  """
  op = Create2To4SpMatOp(spMat=sp_mat, asyncToken=async_token, asyncDependencies=async_dependencies, rows=rows, cols=cols, memref=memref, pruneFlag=prune_flag, loc=loc, ip=ip)
  results = op.results
  if len(results) > 1:
    return results
  if len(results) == 1:
    return results[0]
  return op

@_ods_cext.register_operation(_Dialect)
class CreateBsrOp(_ods_ir.OpView):
  r"""
  The `gpu.create_bsr` operation initializes a sparse matrix in BSR format
  with the given sizes for the matrix and blocks from the given position,
  index, and values buffers. The buffers must already be copied from the
  host to the device prior to using this operation. The operation returns
  a handle to the sparse matrix descriptor.

  The BSR format is similar to CSR, where the column indices represent
  two-dimensional blocks instead of a single matrix entry. Note that this
  operation (currently) only supports storage with **square** blocks,
  i.e., `rBlockSize == cBlockSize`.

  If the `async` keyword is present, the op is executed asynchronously (i.e.
  it does not block until the execution has finished on the device). In
  that case, it returns a !gpu.async.token in addition to the environment.

  Example:

  ```mlir
  %spmat, %token = gpu.create_bsr async [%dep]
     %brows, %bcols, %bnnz, %rBlockSize, %cBlockSize,
     %bRowPos, %bColIdxs, %values : memref<?xindex>, memref<?xindex>, memref<?xf64>
  ```
  """

  OPERATION_NAME = "gpu.create_bsr"

  _ODS_REGIONS = (0, True)

  def __init__(self, spmat, asyncToken, asyncDependencies, brows, bcols, bnnz, rBlockSize, cBlockSize, bRowPos, bColIdxs, values, *, loc=None, ip=None):
    # Operand order: variadic async-dependency group first, then the eight
    # fixed operands in declaration order.
    ins = list(_get_op_results_or_values(asyncDependencies))
    ins.extend((brows, bcols, bnnz, rBlockSize, cBlockSize, bRowPos, bColIdxs, values))
    _ods_context = _ods_get_default_loc_context(loc)
    # The async token result only exists when the op is built in async form.
    outs = [spmat] if asyncToken is None else [spmat, asyncToken]
    super().__init__(
        self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS,
        self._ODS_RESULT_SEGMENTS, attributes={}, results=outs, operands=ins,
        successors=None, regions=None, loc=loc, ip=ip)

  def _fixed_operand(self, idx):
    # Fixed operand `idx`, counted after the variadic async-dependency group
    # (eight fixed operands trail the group).
    num_variadic = len(self.operation.operands) - 8
    return self.operation.operands[num_variadic + idx]

  @builtins.property
  def asyncDependencies(self) -> _ods_ir.OpOperandList:
    return self.operation.operands[:len(self.operation.operands) - 8]

  @builtins.property
  def brows(self) -> _ods_ir.Value[_ods_ir.IndexType]:
    return self._fixed_operand(0)

  @builtins.property
  def bcols(self) -> _ods_ir.Value[_ods_ir.IndexType]:
    return self._fixed_operand(1)

  @builtins.property
  def bnnz(self) -> _ods_ir.Value[_ods_ir.IndexType]:
    return self._fixed_operand(2)

  @builtins.property
  def rBlockSize(self) -> _ods_ir.Value[_ods_ir.IndexType]:
    return self._fixed_operand(3)

  @builtins.property
  def cBlockSize(self) -> _ods_ir.Value[_ods_ir.IndexType]:
    return self._fixed_operand(4)

  @builtins.property
  def bRowPos(self) -> _ods_ir.Value[_ods_ir.MemRefType]:
    return self._fixed_operand(5)

  @builtins.property
  def bColIdxs(self) -> _ods_ir.Value[_ods_ir.MemRefType]:
    return self._fixed_operand(6)

  @builtins.property
  def values(self) -> _ods_ir.Value[_ods_ir.MemRefType]:
    return self._fixed_operand(7)

  @builtins.property
  def spmat(self) -> _ods_ir.OpResult:
    # First result: the sparse-matrix descriptor handle.
    return self.operation.results[0]

  @builtins.property
  def asyncToken(self) -> _Optional[_ods_ir.OpResult]:
    # Only present when the op was built async.
    res = self.operation.results
    return res[1] if len(res) >= 2 else None

def create_bsr(spmat, async_token, async_dependencies, brows, bcols, bnnz, r_block_size, c_block_size, b_row_pos, b_col_idxs, values, *, loc=None, ip=None) -> _Union[_ods_ir.OpResult, _ods_ir.OpResultList, CreateBsrOp]:
  """Build a CreateBsrOp and unwrap its result(s)."""
  op = CreateBsrOp(spmat=spmat, asyncToken=async_token, asyncDependencies=async_dependencies, brows=brows, bcols=bcols, bnnz=bnnz, rBlockSize=r_block_size, cBlockSize=c_block_size, bRowPos=b_row_pos, bColIdxs=b_col_idxs, values=values, loc=loc, ip=ip)
  results = op.results
  if len(results) > 1:
    return results
  if len(results) == 1:
    return results[0]
  return op

@_ods_cext.register_operation(_Dialect)
class CreateCooAoSOp(_ods_ir.OpView):
  r"""
  The `gpu.create_coo_aos` operation initializes a sparse matrix in COO format
  with the given sizes from the given index and values buffers. The buffers
  must already be copied from the host to the device prior to using this
  operation. The operation returns a handle to the sparse matrix descriptor.
  Unlike the default `gpu.create_coo` operation, this operation builds the
  COO format from a single index buffer in AoS format (note that this
  feature has been deprecated in cuSparse 11.2).

  If the `async` keyword is present, the op is executed asynchronously (i.e.
  it does not block until the execution has finished on the device). In
  that case, it returns a !gpu.async.token in addition to the environment.

  Example:

  ```mlir
  %spmat, %token = gpu.create_coo_aos async [%dep] %rows, %cols, %nnz, %idxs,
      %values : memref<?xindex>, memref<?xf64>
  ```
  """

  OPERATION_NAME = "gpu.create_coo_aos"

  _ODS_REGIONS = (0, True)

  def __init__(self, spmat, asyncToken, asyncDependencies, rows, cols, nnz, idxs, values, *, loc=None, ip=None):
    # Operand order: variadic async-dependency group first, then the five
    # fixed operands in declaration order.
    ins = list(_get_op_results_or_values(asyncDependencies))
    ins.extend((rows, cols, nnz, idxs, values))
    _ods_context = _ods_get_default_loc_context(loc)
    # The async token result only exists when the op is built in async form.
    outs = [spmat] if asyncToken is None else [spmat, asyncToken]
    super().__init__(
        self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS,
        self._ODS_RESULT_SEGMENTS, attributes={}, results=outs, operands=ins,
        successors=None, regions=None, loc=loc, ip=ip)

  def _fixed_operand(self, idx):
    # Fixed operand `idx`, counted after the variadic async-dependency group
    # (five fixed operands trail the group).
    num_variadic = len(self.operation.operands) - 5
    return self.operation.operands[num_variadic + idx]

  @builtins.property
  def asyncDependencies(self) -> _ods_ir.OpOperandList:
    return self.operation.operands[:len(self.operation.operands) - 5]

  @builtins.property
  def rows(self) -> _ods_ir.Value[_ods_ir.IndexType]:
    return self._fixed_operand(0)

  @builtins.property
  def cols(self) -> _ods_ir.Value[_ods_ir.IndexType]:
    return self._fixed_operand(1)

  @builtins.property
  def nnz(self) -> _ods_ir.Value[_ods_ir.IndexType]:
    return self._fixed_operand(2)

  @builtins.property
  def idxs(self) -> _ods_ir.Value[_ods_ir.MemRefType]:
    return self._fixed_operand(3)

  @builtins.property
  def values(self) -> _ods_ir.Value[_ods_ir.MemRefType]:
    return self._fixed_operand(4)

  @builtins.property
  def spmat(self) -> _ods_ir.OpResult:
    # First result: the sparse-matrix descriptor handle.
    return self.operation.results[0]

  @builtins.property
  def asyncToken(self) -> _Optional[_ods_ir.OpResult]:
    # Only present when the op was built async.
    res = self.operation.results
    return res[1] if len(res) >= 2 else None

def create_coo_aos(spmat, async_token, async_dependencies, rows, cols, nnz, idxs, values, *, loc=None, ip=None) -> _Union[_ods_ir.OpResult, _ods_ir.OpResultList, CreateCooAoSOp]:
  """Build a CreateCooAoSOp and unwrap its result(s)."""
  op = CreateCooAoSOp(spmat=spmat, asyncToken=async_token, asyncDependencies=async_dependencies, rows=rows, cols=cols, nnz=nnz, idxs=idxs, values=values, loc=loc, ip=ip)
  results = op.results
  if len(results) > 1:
    return results
  if len(results) == 1:
    return results[0]
  return op

@_ods_cext.register_operation(_Dialect)
class CreateCooOp(_ods_ir.OpView):
  r"""
  The `gpu.create_coo` operation initializes a sparse matrix in COO format
  with the given sizes from the given index and values buffers. The buffers
  must already be copied from the host to the device prior to using this
  operation. The operation returns a handle to the sparse matrix descriptor.
  Note that this operation builds the COO in SoA format.

  If the `async` keyword is present, the op is executed asynchronously (i.e.
  it does not block until the execution has finished on the device). In
  that case, it returns a !gpu.async.token in addition to the environment.

  Example:

  ```mlir
  %spmat, %token = gpu.create_coo async [%dep] %rows, %cols, %nnz, %rowIdx,
      %colIdx, %values : memref<?xindex>, memref<?xindex>, memref<?xf64>
  ```
  """

  OPERATION_NAME = "gpu.create_coo"

  _ODS_REGIONS = (0, True)

  def __init__(self, spmat, asyncToken, asyncDependencies, rows, cols, nnz, rowIdxs, colIdxs, values, *, loc=None, ip=None):
    # Operand order: variadic async-dependency group first, then the six
    # fixed operands in declaration order.
    ins = list(_get_op_results_or_values(asyncDependencies))
    ins.extend((rows, cols, nnz, rowIdxs, colIdxs, values))
    _ods_context = _ods_get_default_loc_context(loc)
    # The async token result only exists when the op is built in async form.
    outs = [spmat] if asyncToken is None else [spmat, asyncToken]
    super().__init__(
        self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS,
        self._ODS_RESULT_SEGMENTS, attributes={}, results=outs, operands=ins,
        successors=None, regions=None, loc=loc, ip=ip)

  def _fixed_operand(self, idx):
    # Fixed operand `idx`, counted after the variadic async-dependency group
    # (six fixed operands trail the group).
    num_variadic = len(self.operation.operands) - 6
    return self.operation.operands[num_variadic + idx]

  @builtins.property
  def asyncDependencies(self) -> _ods_ir.OpOperandList:
    return self.operation.operands[:len(self.operation.operands) - 6]

  @builtins.property
  def rows(self) -> _ods_ir.Value[_ods_ir.IndexType]:
    return self._fixed_operand(0)

  @builtins.property
  def cols(self) -> _ods_ir.Value[_ods_ir.IndexType]:
    return self._fixed_operand(1)

  @builtins.property
  def nnz(self) -> _ods_ir.Value[_ods_ir.IndexType]:
    return self._fixed_operand(2)

  @builtins.property
  def rowIdxs(self) -> _ods_ir.Value[_ods_ir.MemRefType]:
    return self._fixed_operand(3)

  @builtins.property
  def colIdxs(self) -> _ods_ir.Value[_ods_ir.MemRefType]:
    return self._fixed_operand(4)

  @builtins.property
  def values(self) -> _ods_ir.Value[_ods_ir.MemRefType]:
    return self._fixed_operand(5)

  @builtins.property
  def spmat(self) -> _ods_ir.OpResult:
    # First result: the sparse-matrix descriptor handle.
    return self.operation.results[0]

  @builtins.property
  def asyncToken(self) -> _Optional[_ods_ir.OpResult]:
    # Only present when the op was built async.
    res = self.operation.results
    return res[1] if len(res) >= 2 else None

def create_coo(spmat, async_token, async_dependencies, rows, cols, nnz, row_idxs, col_idxs, values, *, loc=None, ip=None) -> _Union[_ods_ir.OpResult, _ods_ir.OpResultList, CreateCooOp]:
  """Build a CreateCooOp and unwrap its result(s)."""
  op = CreateCooOp(spmat=spmat, asyncToken=async_token, asyncDependencies=async_dependencies, rows=rows, cols=cols, nnz=nnz, rowIdxs=row_idxs, colIdxs=col_idxs, values=values, loc=loc, ip=ip)
  results = op.results
  if len(results) > 1:
    return results
  if len(results) == 1:
    return results[0]
  return op

@_ods_cext.register_operation(_Dialect)
class CreateCscOp(_ods_ir.OpView):
  r"""
  The `gpu.create_csc` operation initializes a sparse matrix in CSC format
  with the given sizes from the given position, index, and values buffers.
  The buffers must already be copied from the host to the device prior to
  using this operation. The operation returns a handle to the sparse
  matrix descriptor.

  The CSC format has exactly the same memory layout as its transpose
  in CSR format (and vice versa).

  If the `async` keyword is present, the op is executed asynchronously (i.e.
  it does not block until the execution has finished on the device). In
  that case, it returns a !gpu.async.token in addition to the environment.

  Example:

  ```mlir
  %spmat, %token = gpu.create_csc async [%dep] %rows, %cols, %nnz, %colPos,
      %rowIdx, %values : memref<?xindex>, memref<?xindex>, memref<?xf64>
  ```
  """

  OPERATION_NAME = "gpu.create_csc"

  _ODS_REGIONS = (0, True)

  def __init__(self, spmat, asyncToken, asyncDependencies, rows, cols, nnz, colPos, rowIdxs, values, *, loc=None, ip=None):
    # Operand order: variadic async-dependency group first, then the six
    # fixed operands in declaration order.
    ins = list(_get_op_results_or_values(asyncDependencies))
    ins.extend((rows, cols, nnz, colPos, rowIdxs, values))
    _ods_context = _ods_get_default_loc_context(loc)
    # The async token result only exists when the op is built in async form.
    outs = [spmat] if asyncToken is None else [spmat, asyncToken]
    super().__init__(
        self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS,
        self._ODS_RESULT_SEGMENTS, attributes={}, results=outs, operands=ins,
        successors=None, regions=None, loc=loc, ip=ip)

  def _fixed_operand(self, idx):
    # Fixed operand `idx`, counted after the variadic async-dependency group
    # (six fixed operands trail the group).
    num_variadic = len(self.operation.operands) - 6
    return self.operation.operands[num_variadic + idx]

  @builtins.property
  def asyncDependencies(self) -> _ods_ir.OpOperandList:
    return self.operation.operands[:len(self.operation.operands) - 6]

  @builtins.property
  def rows(self) -> _ods_ir.Value[_ods_ir.IndexType]:
    return self._fixed_operand(0)

  @builtins.property
  def cols(self) -> _ods_ir.Value[_ods_ir.IndexType]:
    return self._fixed_operand(1)

  @builtins.property
  def nnz(self) -> _ods_ir.Value[_ods_ir.IndexType]:
    return self._fixed_operand(2)

  @builtins.property
  def colPos(self) -> _ods_ir.Value[_ods_ir.MemRefType]:
    return self._fixed_operand(3)

  @builtins.property
  def rowIdxs(self) -> _ods_ir.Value[_ods_ir.MemRefType]:
    return self._fixed_operand(4)

  @builtins.property
  def values(self) -> _ods_ir.Value[_ods_ir.MemRefType]:
    return self._fixed_operand(5)

  @builtins.property
  def spmat(self) -> _ods_ir.OpResult:
    # First result: the sparse-matrix descriptor handle.
    return self.operation.results[0]

  @builtins.property
  def asyncToken(self) -> _Optional[_ods_ir.OpResult]:
    # Only present when the op was built async.
    res = self.operation.results
    return res[1] if len(res) >= 2 else None

def create_csc(spmat, async_token, async_dependencies, rows, cols, nnz, col_pos, row_idxs, values, *, loc=None, ip=None) -> _Union[_ods_ir.OpResult, _ods_ir.OpResultList, CreateCscOp]:
  """Build a CreateCscOp and unwrap its result(s)."""
  op = CreateCscOp(spmat=spmat, asyncToken=async_token, asyncDependencies=async_dependencies, rows=rows, cols=cols, nnz=nnz, colPos=col_pos, rowIdxs=row_idxs, values=values, loc=loc, ip=ip)
  results = op.results
  if len(results) > 1:
    return results
  if len(results) == 1:
    return results[0]
  return op

@_ods_cext.register_operation(_Dialect)
class CreateCsrOp(_ods_ir.OpView):
  r"""
  The `gpu.create_csr` operation initializes a sparse matrix in CSR format
  with the given sizes from the given position, index, and values buffers.
  The buffers must already be copied from the host to the device prior to
  using this operation. The operation returns a handle to the sparse
  matrix descriptor.

  The CSR format has exactly the same memory layout as its transpose
  in CSC format (and vice versa).

  If the `async` keyword is present, the op is executed asynchronously (i.e.
  it does not block until the execution has finished on the device). In
  that case, it returns a !gpu.async.token in addition to the environment.

  Example:

  ```mlir
  %spmat, %token = gpu.create_csr async [%dep] %rows, %cols, %nnz, %rowPos,
      %colIdx, %values : memref<?xindex>, memref<?xindex>, memref<?xf64>
  ```
  """

  OPERATION_NAME = "gpu.create_csr"

  _ODS_REGIONS = (0, True)

  def __init__(self, spmat, asyncToken, asyncDependencies, rows, cols, nnz, rowPos, colIdxs, values, *, loc=None, ip=None):
    # Operand order: variadic async-dependency group first, then the six
    # fixed operands in declaration order.
    ins = list(_get_op_results_or_values(asyncDependencies))
    ins.extend((rows, cols, nnz, rowPos, colIdxs, values))
    _ods_context = _ods_get_default_loc_context(loc)
    # The async token result only exists when the op is built in async form.
    outs = [spmat] if asyncToken is None else [spmat, asyncToken]
    super().__init__(
        self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS,
        self._ODS_RESULT_SEGMENTS, attributes={}, results=outs, operands=ins,
        successors=None, regions=None, loc=loc, ip=ip)

  def _fixed_operand(self, idx):
    # Fixed operand `idx`, counted after the variadic async-dependency group
    # (six fixed operands trail the group).
    num_variadic = len(self.operation.operands) - 6
    return self.operation.operands[num_variadic + idx]

  @builtins.property
  def asyncDependencies(self) -> _ods_ir.OpOperandList:
    return self.operation.operands[:len(self.operation.operands) - 6]

  @builtins.property
  def rows(self) -> _ods_ir.Value[_ods_ir.IndexType]:
    return self._fixed_operand(0)

  @builtins.property
  def cols(self) -> _ods_ir.Value[_ods_ir.IndexType]:
    return self._fixed_operand(1)

  @builtins.property
  def nnz(self) -> _ods_ir.Value[_ods_ir.IndexType]:
    return self._fixed_operand(2)

  @builtins.property
  def rowPos(self) -> _ods_ir.Value[_ods_ir.MemRefType]:
    return self._fixed_operand(3)

  @builtins.property
  def colIdxs(self) -> _ods_ir.Value[_ods_ir.MemRefType]:
    return self._fixed_operand(4)

  @builtins.property
  def values(self) -> _ods_ir.Value[_ods_ir.MemRefType]:
    return self._fixed_operand(5)

  @builtins.property
  def spmat(self) -> _ods_ir.OpResult:
    # First result: the sparse-matrix descriptor handle.
    return self.operation.results[0]

  @builtins.property
  def asyncToken(self) -> _Optional[_ods_ir.OpResult]:
    # Only present when the op was built async.
    res = self.operation.results
    return res[1] if len(res) >= 2 else None

def create_csr(spmat, async_token, async_dependencies, rows, cols, nnz, row_pos, col_idxs, values, *, loc=None, ip=None) -> _Union[_ods_ir.OpResult, _ods_ir.OpResultList, CreateCsrOp]:
  """Build a CreateCsrOp and unwrap its result(s)."""
  op = CreateCsrOp(spmat=spmat, asyncToken=async_token, asyncDependencies=async_dependencies, rows=rows, cols=cols, nnz=nnz, rowPos=row_pos, colIdxs=col_idxs, values=values, loc=loc, ip=ip)
  results = op.results
  if len(results) > 1:
    return results
  if len(results) == 1:
    return results[0]
  return op

@_ods_cext.register_operation(_Dialect)
class CreateDnTensorOp(_ods_ir.OpView):
  r"""
  The `gpu.create_dn_tensor` operation initializes a dense tensor from
  the given values buffer and sizes. The buffer must already be copied
  from the host to the device prior to using this operation. The
  operation returns a handle to the dense tensor descriptor.

  If the `async` keyword is present, the op is executed asynchronously (i.e.
  it does not block until the execution has finished on the device). In
  that case, it returns a !gpu.async.token in addition to the environment.

  Example:

  ```mlir
  %dmat, %token = gpu.create_dn_tensor async [%dep] %mem, %dims : index, index into memref<?xf64>
  ```
  """

  OPERATION_NAME = "gpu.create_dn_tensor"

  # Segment sizes: variadic asyncDependencies, single memref, variadic dims.
  _ODS_OPERAND_SEGMENTS = [-1, 1, -1]

  _ODS_REGIONS = (0, True)

  def __init__(self, dnTensor, asyncToken, asyncDependencies, memref, dims, *, loc=None, ip=None):
    # Segmented operands: each variadic group is passed as a sequence, in
    # the order [asyncDependencies][memref][dims].
    ins = [
        _get_op_results_or_values(asyncDependencies),
        memref,
        _get_op_results_or_values(dims),
    ]
    _ods_context = _ods_get_default_loc_context(loc)
    # The async token result only exists when the op is built in async form.
    outs = [dnTensor] if asyncToken is None else [dnTensor, asyncToken]
    super().__init__(
        self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS,
        self._ODS_RESULT_SEGMENTS, attributes={}, results=outs, operands=ins,
        successors=None, regions=None, loc=loc, ip=ip)

  def _operand_segment(self, index):
    # Resolve one operand group via the op's operandSegmentSizes attribute.
    return _ods_segmented_accessor(
        self.operation.operands,
        self.operation.attributes["operandSegmentSizes"], index)

  @builtins.property
  def asyncDependencies(self) -> _ods_ir.OpOperandList:
    return self._operand_segment(0)

  @builtins.property
  def memref(self) -> _ods_ir.Value[_ods_ir.MemRefType]:
    # Singleton segment: unwrap to the lone value.
    return self._operand_segment(1)[0]

  @builtins.property
  def dims(self) -> _ods_ir.OpOperandList:
    return self._operand_segment(2)

  @builtins.property
  def dnTensor(self) -> _ods_ir.OpResult:
    # First result: the dense-tensor descriptor handle.
    return self.operation.results[0]

  @builtins.property
  def asyncToken(self) -> _Optional[_ods_ir.OpResult]:
    # Only present when the op was built async.
    res = self.operation.results
    return res[1] if len(res) >= 2 else None

def create_dn_tensor(dn_tensor, async_token, async_dependencies, memref, dims, *, loc=None, ip=None) -> _Union[_ods_ir.OpResult, _ods_ir.OpResultList, CreateDnTensorOp]:
  """Build a CreateDnTensorOp and unwrap its result(s)."""
  op = CreateDnTensorOp(dnTensor=dn_tensor, asyncToken=async_token, asyncDependencies=async_dependencies, memref=memref, dims=dims, loc=loc, ip=ip)
  results = op.results
  if len(results) > 1:
    return results
  if len(results) == 1:
    return results[0]
  return op

@_ods_cext.register_operation(_Dialect)
class DeallocOp(_ods_ir.OpView):
  r"""
  The `gpu.dealloc` operation frees the region of memory referenced by a
  memref which was originally created by the `gpu.alloc` operation. It is
  similar to the `memref.dealloc` op, but supports asynchronous GPU execution.

  The op does not execute before all async dependencies have finished
  executing.

  If the `async` keyword is present, the op is executed asynchronously (i.e.
  it does not block until the execution has finished on the device). In
  that case, it returns a !gpu.async.token.

  Example:

  ```mlir
  %token = gpu.dealloc async [%dep] %memref : memref<8x64xf32, 1>
  ```
  """

  OPERATION_NAME = "gpu.dealloc"

  _ODS_REGIONS = (0, True)

  def __init__(self, asyncToken, asyncDependencies, memref, *, loc=None, ip=None):
    # Variadic async dependencies first, then the memref to free.
    ins = list(_get_op_results_or_values(asyncDependencies))
    ins.append(memref)
    _ods_context = _ods_get_default_loc_context(loc)
    # The sole (optional) result is the async token.
    outs = [] if asyncToken is None else [asyncToken]
    super().__init__(
        self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS,
        self._ODS_RESULT_SEGMENTS, attributes={}, results=outs, operands=ins,
        successors=None, regions=None, loc=loc, ip=ip)

  @builtins.property
  def asyncDependencies(self) -> _ods_ir.OpOperandList:
    return self.operation.operands[:len(self.operation.operands) - 1]

  @builtins.property
  def memref(self) -> _ods_ir.Value[_ods_ir.MemRefType]:
    # The single fixed operand is always last.
    return self.operation.operands[len(self.operation.operands) - 1]

  @builtins.property
  def asyncToken(self) -> _Optional[_ods_ir.OpResult]:
    # Only present when the op was built async.
    res = self.operation.results
    return res[0] if len(res) > 0 else None

def dealloc(async_token, async_dependencies, memref, *, loc=None, ip=None) -> _Union[_ods_ir.OpResult, _ods_ir.OpResultList, DeallocOp]:
  """Build a DeallocOp and unwrap its result(s)."""
  op = DeallocOp(asyncToken=async_token, asyncDependencies=async_dependencies, memref=memref, loc=loc, ip=ip)
  results = op.results
  if len(results) > 1:
    return results
  if len(results) == 1:
    return results[0]
  return op

@_ods_cext.register_operation(_Dialect)
class DestroyDnTensorOp(_ods_ir.OpView):
  r"""
  The `gpu.destroy_dn_tensor` operation releases all resources of a dense
  tensor represented by a handle that was previously created by a
  `gpu.create_dn_tensor` operation.

  If the `async` keyword is present, the op is executed asynchronously (i.e.
  it does not block until the execution has finished on the device). In
  that case, it returns a !gpu.async.token in addition to the environment.

  Example:

  ```mlir
  %token = gpu.destroy_dn_tensor async [%dep] %dnTensor
  ```
  """

  OPERATION_NAME = "gpu.destroy_dn_tensor"

  _ODS_REGIONS = (0, True)

  def __init__(self, asyncToken, asyncDependencies, dnTensor, *, loc=None, ip=None):
    # Variadic async dependencies first, then the tensor handle to destroy.
    ins = list(_get_op_results_or_values(asyncDependencies))
    ins.append(dnTensor)
    _ods_context = _ods_get_default_loc_context(loc)
    # The sole (optional) result is the async token.
    outs = [] if asyncToken is None else [asyncToken]
    super().__init__(
        self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS,
        self._ODS_RESULT_SEGMENTS, attributes={}, results=outs, operands=ins,
        successors=None, regions=None, loc=loc, ip=ip)

  @builtins.property
  def asyncDependencies(self) -> _ods_ir.OpOperandList:
    return self.operation.operands[:len(self.operation.operands) - 1]

  @builtins.property
  def dnTensor(self) -> _ods_ir.Value:
    # The single fixed operand is always last.
    return self.operation.operands[len(self.operation.operands) - 1]

  @builtins.property
  def asyncToken(self) -> _Optional[_ods_ir.OpResult]:
    # Only present when the op was built async.
    res = self.operation.results
    return res[0] if len(res) > 0 else None

def destroy_dn_tensor(async_token, async_dependencies, dn_tensor, *, loc=None, ip=None) -> _Union[_ods_ir.OpResult, _ods_ir.OpResultList, DestroyDnTensorOp]:
  """Build a DestroyDnTensorOp and unwrap its result(s)."""
  op = DestroyDnTensorOp(asyncToken=async_token, asyncDependencies=async_dependencies, dnTensor=dn_tensor, loc=loc, ip=ip)
  results = op.results
  if len(results) > 1:
    return results
  if len(results) == 1:
    return results[0]
  return op

@_ods_cext.register_operation(_Dialect)
class DestroySpMatOp(_ods_ir.OpView):
  r"""
  The `gpu.destroy_sp_mat` operation releases all resources of a sparse
  matrix represented by a handle that was previously created by a
  one of the sparse matrix creation operations.

  If the `async` keyword is present, the op is executed asynchronously (i.e.
  it does not block until the execution has finished on the device). In
  that case, it returns a !gpu.async.token in addition to the environment.

  Example:

  ```mlir
  %token = gpu.destroy_sp_mat async [%dep] %spmat
  ```
  """

  OPERATION_NAME = "gpu.destroy_sp_mat"

  _ODS_REGIONS = (0, True)

  def __init__(self, asyncToken, asyncDependencies, spmat, *, loc=None, ip=None):
    # Variadic async dependencies first, then the matrix handle to destroy.
    ins = list(_get_op_results_or_values(asyncDependencies))
    ins.append(spmat)
    _ods_context = _ods_get_default_loc_context(loc)
    # The sole (optional) result is the async token.
    outs = [] if asyncToken is None else [asyncToken]
    super().__init__(
        self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS,
        self._ODS_RESULT_SEGMENTS, attributes={}, results=outs, operands=ins,
        successors=None, regions=None, loc=loc, ip=ip)

  @builtins.property
  def asyncDependencies(self) -> _ods_ir.OpOperandList:
    return self.operation.operands[:len(self.operation.operands) - 1]

  @builtins.property
  def spmat(self) -> _ods_ir.Value:
    # The single fixed operand is always last.
    return self.operation.operands[len(self.operation.operands) - 1]

  @builtins.property
  def asyncToken(self) -> _Optional[_ods_ir.OpResult]:
    # Only present when the op was built async.
    res = self.operation.results
    return res[0] if len(res) > 0 else None

def destroy_sp_mat(async_token, async_dependencies, spmat, *, loc=None, ip=None) -> _Union[_ods_ir.OpResult, _ods_ir.OpResultList, DestroySpMatOp]:
  """Build a DestroySpMatOp and unwrap its result(s)."""
  op = DestroySpMatOp(asyncToken=async_token, asyncDependencies=async_dependencies, spmat=spmat, loc=loc, ip=ip)
  results = op.results
  if len(results) > 1:
    return results
  if len(results) == 1:
    return results[0]
  return op

@_ods_cext.register_operation(_Dialect)
class DynamicSharedMemoryOp(_ods_ir.OpView):
  r"""
  This operation provides a memref pointer to the start of dynamic shared
  memory, often referred to as workgroup memory. It's important to note that
  this dynamic shared memory needs to be allocated at kernel launch. One can
  conveniently utilize the `dynamic_shared_memory_size` parameter of
  `gpu.launch` for this purpose.

  Examples:
  ```mlir
  %0 = gpu.dynamic.shared.memory : memref<?xi8, #gpu.address_space<workgroup>>
  %1 = memref.view %0[%c8192][] : memref<?xi8, #gpu.address_space<workgroup>>
                          to memref<32x64xf32, #gpu.address_space<workgroup>>
  %2 = memref.view %0[%c16384][] : memref<?xi8, #gpu.address_space<workgroup>>
                          to memref<32x64xf32, #gpu.address_space<workgroup>>
  ```
  """

  OPERATION_NAME = "gpu.dynamic_shared_memory"

  _ODS_REGIONS = (0, True)

  def __init__(self, resultMemref, *, loc=None, ip=None):
    # No operands, no attributes: only the result memref type is supplied.
    _ods_context = _ods_get_default_loc_context(loc)
    super().__init__(
        self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS,
        self._ODS_RESULT_SEGMENTS, attributes={}, results=[resultMemref],
        operands=[], successors=None, regions=None, loc=loc, ip=ip)

  @builtins.property
  def resultMemref(self) -> _ods_ir.OpResult[_ods_ir.MemRefType]:
    # Sole result: the memref into dynamic shared (workgroup) memory.
    return self.operation.results[0]

def dynamic_shared_memory(result_memref, *, loc=None, ip=None) -> _ods_ir.OpResult:
  """Build a DynamicSharedMemoryOp and return its single memref result."""
  op = DynamicSharedMemoryOp(resultMemref=result_memref, loc=loc, ip=ip)
  return op.result

@_ods_cext.register_operation(_Dialect)
class GPUFuncOp(_ods_ir.OpView):
  r"""
  Defines a function that can be executed on a GPU. This supports memory
  attribution and its body has a particular execution model.
  
  GPU functions are either kernels (as indicated by the `kernel` attribute) or
  regular functions. The former can be launched from the host side, while the
  latter are device side only.
  
  The memory attribution defines SSA values that correspond to memory buffers
  allocated in the memory hierarchy of the GPU (see below).
  
  The operation has one attached region that corresponds to the body of the
  function. The region arguments consist of the function arguments without
  modification, followed by buffers defined in memory annotations. The body of
  a GPU function, when launched, is executed by multiple work items. There are
  no guarantees on the order in which work items execute, or on the connection
  between them. In particular, work items are not necessarily executed in
  lock-step. Synchronization ops such as "gpu.barrier" should be used to
  coordinate work items. Declarations of GPU functions, i.e. not having the
  body region, are not supported.
  
  A function may optionally be annotated with the block and/or grid sizes
  that will be used when it is launched using the `known_block_size` and
  `known_grid_size` attributes, respectively. If set, these attributes must
  be arrays of three 32-bit integers giving the x, y, and z launch dimensions.
  Launching a kernel that has these annotations, or that calls a function with
  these annotations, using a block size or grid size other than what is specified
  is undefined behavior. These attributes may be set on non-`gpu.func` functions
  by using `gpu.known_block_size` or `gpu.known_grid_size`, but this carries
  the risk that they will be discarded.
  
  Syntax:
  
  ```
  op ::= `gpu.func` symbol-ref-id `(` argument-list `)` (`->`
  function-result-list)?
         memory-attribution `kernel`? function-attributes? region
  
  memory-attribution ::= (`workgroup` `(` ssa-id-and-type-list `)`)?
                         (`private` `(` ssa-id-and-type-list `)`)?
  ```
  
  Example:
  
  ```mlir
  gpu.func @foo(%arg0: index)
      workgroup(%workgroup: memref<32xf32, 3>)
      private(%private: memref<1xf32, 5>)
      kernel
      attributes {qux: "quux"} {
    gpu.return
  }
  ```
  
  The generic form illustrates the concept
  
  ```mlir
  "gpu.func"(%arg: index) {sym_name: "foo", kernel, qux: "quux"} ({
  ^bb0(%arg0: index, %workgroup: memref<32xf32, 3>,
       %private: memref<1xf32, 5>):
    "gpu.return"() : () -> ()
  }) : (index) -> ()
  ```
  
  Note the non-default memory spaces used in memref types in memory
  attribution.
  """

  OPERATION_NAME = "gpu.func"

  _ODS_REGIONS = (1, True)

  def __init__(self, function_type, *, arg_attrs=None, res_attrs=None, workgroup_attrib_attrs=None, private_attrib_attrs=None, known_block_size=None, known_grid_size=None, loc=None, ip=None):
    """Build a `gpu.func` op with the given function type and optional
    argument/result/attribution attribute arrays and launch-size hints.

    Each attribute argument is used as-is when it is already an
    `_ods_ir.Attribute`; otherwise it is converted through the registered
    ODS attribute builder of the matching name, when one exists.
    """
    operands = []
    attributes = {}
    regions = None
    _ods_context = _ods_get_default_loc_context(loc)
    # Mandatory attribute: the function signature (a TypeAttr).
    attributes["function_type"] = (function_type if (
    isinstance(function_type, _ods_ir.Attribute) or
    not _ods_ir.AttrBuilder.contains('anonymous_634')) else
      _ods_ir.AttrBuilder.get('anonymous_634')(function_type, context=_ods_context))
    # Optional per-argument / per-result attribute dictionaries.
    if arg_attrs is not None: attributes["arg_attrs"] = (arg_attrs if (
        isinstance(arg_attrs, _ods_ir.Attribute) or
        not _ods_ir.AttrBuilder.contains('DictArrayAttr')) else
          _ods_ir.AttrBuilder.get('DictArrayAttr')(arg_attrs, context=_ods_context))
    if res_attrs is not None: attributes["res_attrs"] = (res_attrs if (
        isinstance(res_attrs, _ods_ir.Attribute) or
        not _ods_ir.AttrBuilder.contains('DictArrayAttr')) else
          _ods_ir.AttrBuilder.get('DictArrayAttr')(res_attrs, context=_ods_context))
    # Optional attribute dictionaries for workgroup/private memory attributions.
    if workgroup_attrib_attrs is not None: attributes["workgroup_attrib_attrs"] = (workgroup_attrib_attrs if (
        isinstance(workgroup_attrib_attrs, _ods_ir.Attribute) or
        not _ods_ir.AttrBuilder.contains('DictArrayAttr')) else
          _ods_ir.AttrBuilder.get('DictArrayAttr')(workgroup_attrib_attrs, context=_ods_context))
    if private_attrib_attrs is not None: attributes["private_attrib_attrs"] = (private_attrib_attrs if (
        isinstance(private_attrib_attrs, _ods_ir.Attribute) or
        not _ods_ir.AttrBuilder.contains('DictArrayAttr')) else
          _ods_ir.AttrBuilder.get('DictArrayAttr')(private_attrib_attrs, context=_ods_context))
    # Optional known launch-dimension hints (see the class docstring).
    if known_block_size is not None: attributes["known_block_size"] = (known_block_size if (
        isinstance(known_block_size, _ods_ir.Attribute) or
        not _ods_ir.AttrBuilder.contains('GPU_OptionalDimSizeHintAttr')) else
          _ods_ir.AttrBuilder.get('GPU_OptionalDimSizeHintAttr')(known_block_size, context=_ods_context))
    if known_grid_size is not None: attributes["known_grid_size"] = (known_grid_size if (
        isinstance(known_grid_size, _ods_ir.Attribute) or
        not _ods_ir.AttrBuilder.contains('GPU_OptionalDimSizeHintAttr')) else
          _ods_ir.AttrBuilder.get('GPU_OptionalDimSizeHintAttr')(known_grid_size, context=_ods_context))
    results = []
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  # Mandatory attribute: assigning None raises rather than deleting.
  @builtins.property
  def function_type(self) -> _ods_ir.TypeAttr:
    """The function signature attribute."""
    return self.operation.attributes["function_type"]

  @function_type.setter
  def function_type(self, value: _ods_ir.TypeAttr):
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["function_type"] = value

  # Optional attributes below: the getter returns None when absent, and
  # setting None removes the attribute from the operation.
  @builtins.property
  def arg_attrs(self) -> _Optional[_ods_ir.ArrayAttr]:
    """Optional array of per-argument attribute dictionaries."""
    if "arg_attrs" not in self.operation.attributes:
      return None
    return self.operation.attributes["arg_attrs"]

  @arg_attrs.setter
  def arg_attrs(self, value: _Optional[_ods_ir.ArrayAttr]):
    if value is not None:
      self.operation.attributes["arg_attrs"] = value
    elif "arg_attrs" in self.operation.attributes:
      del self.operation.attributes["arg_attrs"]

  @arg_attrs.deleter
  def arg_attrs(self):
    del self.operation.attributes["arg_attrs"]

  @builtins.property
  def res_attrs(self) -> _Optional[_ods_ir.ArrayAttr]:
    """Optional array of per-result attribute dictionaries."""
    if "res_attrs" not in self.operation.attributes:
      return None
    return self.operation.attributes["res_attrs"]

  @res_attrs.setter
  def res_attrs(self, value: _Optional[_ods_ir.ArrayAttr]):
    if value is not None:
      self.operation.attributes["res_attrs"] = value
    elif "res_attrs" in self.operation.attributes:
      del self.operation.attributes["res_attrs"]

  @res_attrs.deleter
  def res_attrs(self):
    del self.operation.attributes["res_attrs"]

  @builtins.property
  def workgroup_attrib_attrs(self) -> _Optional[_ods_ir.ArrayAttr]:
    """Optional attribute dictionaries for workgroup memory attributions."""
    if "workgroup_attrib_attrs" not in self.operation.attributes:
      return None
    return self.operation.attributes["workgroup_attrib_attrs"]

  @workgroup_attrib_attrs.setter
  def workgroup_attrib_attrs(self, value: _Optional[_ods_ir.ArrayAttr]):
    if value is not None:
      self.operation.attributes["workgroup_attrib_attrs"] = value
    elif "workgroup_attrib_attrs" in self.operation.attributes:
      del self.operation.attributes["workgroup_attrib_attrs"]

  @workgroup_attrib_attrs.deleter
  def workgroup_attrib_attrs(self):
    del self.operation.attributes["workgroup_attrib_attrs"]

  @builtins.property
  def private_attrib_attrs(self) -> _Optional[_ods_ir.ArrayAttr]:
    """Optional attribute dictionaries for private memory attributions."""
    if "private_attrib_attrs" not in self.operation.attributes:
      return None
    return self.operation.attributes["private_attrib_attrs"]

  @private_attrib_attrs.setter
  def private_attrib_attrs(self, value: _Optional[_ods_ir.ArrayAttr]):
    if value is not None:
      self.operation.attributes["private_attrib_attrs"] = value
    elif "private_attrib_attrs" in self.operation.attributes:
      del self.operation.attributes["private_attrib_attrs"]

  @private_attrib_attrs.deleter
  def private_attrib_attrs(self):
    del self.operation.attributes["private_attrib_attrs"]

  @builtins.property
  def known_block_size(self) -> _Optional[_ods_ir.DenseI32ArrayAttr]:
    """Optional x/y/z block-size hint (see class docstring)."""
    if "known_block_size" not in self.operation.attributes:
      return None
    return self.operation.attributes["known_block_size"]

  @known_block_size.setter
  def known_block_size(self, value: _Optional[_ods_ir.DenseI32ArrayAttr]):
    if value is not None:
      self.operation.attributes["known_block_size"] = value
    elif "known_block_size" in self.operation.attributes:
      del self.operation.attributes["known_block_size"]

  @known_block_size.deleter
  def known_block_size(self):
    del self.operation.attributes["known_block_size"]

  @builtins.property
  def known_grid_size(self) -> _Optional[_ods_ir.DenseI32ArrayAttr]:
    """Optional x/y/z grid-size hint (see class docstring)."""
    if "known_grid_size" not in self.operation.attributes:
      return None
    return self.operation.attributes["known_grid_size"]

  @known_grid_size.setter
  def known_grid_size(self, value: _Optional[_ods_ir.DenseI32ArrayAttr]):
    if value is not None:
      self.operation.attributes["known_grid_size"] = value
    elif "known_grid_size" in self.operation.attributes:
      del self.operation.attributes["known_grid_size"]

  @known_grid_size.deleter
  def known_grid_size(self):
    del self.operation.attributes["known_grid_size"]

  @builtins.property
  def body(self) -> _ods_ir.Region:
    """The single attached region holding the function body."""
    return self.regions[0]

def func(function_type, *, arg_attrs=None, res_attrs=None, workgroup_attrib_attrs=None, private_attrib_attrs=None, known_block_size=None, known_grid_size=None, loc=None, ip=None) -> GPUFuncOp:
  """Build a `gpu.func` op; thin keyword-forwarding wrapper over GPUFuncOp."""
  return GPUFuncOp(
      function_type=function_type,
      arg_attrs=arg_attrs,
      res_attrs=res_attrs,
      workgroup_attrib_attrs=workgroup_attrib_attrs,
      private_attrib_attrs=private_attrib_attrs,
      known_block_size=known_block_size,
      known_grid_size=known_grid_size,
      loc=loc,
      ip=ip)

@_ods_cext.register_operation(_Dialect)
class GPUModuleOp(_ods_ir.OpView):
  r"""
  GPU module contains code that is intended to be run on a GPU. A host device
  can launch this code through a gpu.launch_func that creates a fully
  qualified symbol through the gpu.module's symbol and a gpu.func symbol
  contained in the gpu.module.
  
  The module's top-level scope is modeled by a single region with a single
  block. GPU modules are required to have a name that is used for symbol
  resolution by the gpu.launch_func operation.
  
  Using an op with a region to define a GPU module enables "embedding" GPU
  modules with SIMT execution models in other dialects in a clean manner and
  allows filtering of code regions to execute passes on only code intended to
  or not intended to be run on the separate device.
  
  Modules can contain zero or more target attributes. These attributes encode
  how to transform modules into binary strings and are used by the
  `gpu-module-to-binary` pass to transform modules into GPU binaries.
  
  Modules can contain an optional `OffloadingTranslationAttr` attribute. This
  attribute will be used during the `gpu-module-to-binary` pass to specify the
  `OffloadingTranslationAttr` used when creating the `gpu.binary` operation.
  
  ```
  gpu.module @symbol_name {
    gpu.func {}
      ...
  }
  // Module with offloading handler and target attributes.
  gpu.module @symbol_name2 <#gpu.select_object<1>> [
      #nvvm.target,
      #rocdl.target<chip = "gfx90a">] {
    gpu.func {}
      ...
  }
  ```
  """

  OPERATION_NAME = "gpu.module"

  _ODS_REGIONS = (1, True)

  def __init__(self, sym_name, *, targets=None, offloadingHandler=None, loc=None, ip=None):
    """Build a `gpu.module` op named `sym_name`.

    Args:
      sym_name: the module's symbol name (mandatory).
      targets: optional array of GPU target attributes.
      offloadingHandler: optional offloading-translation attribute.
      loc, ip: usual optional Location / InsertionPoint.

    Attribute arguments that are not already `_ods_ir.Attribute` instances
    are converted via the registered ODS attribute builders when available.
    """
    operands = []
    attributes = {}
    regions = None
    _ods_context = _ods_get_default_loc_context(loc)
    attributes["sym_name"] = (sym_name if (
    isinstance(sym_name, _ods_ir.Attribute) or
    not _ods_ir.AttrBuilder.contains('SymbolNameAttr')) else
      _ods_ir.AttrBuilder.get('SymbolNameAttr')(sym_name, context=_ods_context))
    if targets is not None: attributes["targets"] = (targets if (
        isinstance(targets, _ods_ir.Attribute) or
        not _ods_ir.AttrBuilder.contains('GPUTargetArrayAttr')) else
          _ods_ir.AttrBuilder.get('GPUTargetArrayAttr')(targets, context=_ods_context))
    if offloadingHandler is not None: attributes["offloadingHandler"] = (offloadingHandler if (
        isinstance(offloadingHandler, _ods_ir.Attribute) or
        not _ods_ir.AttrBuilder.contains('AnyAttr')) else
          _ods_ir.AttrBuilder.get('AnyAttr')(offloadingHandler, context=_ods_context))
    results = []
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  # Mandatory attribute: assigning None raises rather than deleting.
  @builtins.property
  def sym_name(self) -> _ods_ir.StringAttr:
    """The module's symbol name."""
    return self.operation.attributes["sym_name"]

  @sym_name.setter
  def sym_name(self, value: _ods_ir.StringAttr):
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["sym_name"] = value

  # Optional attributes: getters return None when absent; setting None
  # removes the attribute.
  @builtins.property
  def targets(self) -> _Optional[_ods_ir.ArrayAttr]:
    """Optional array of GPU target attributes."""
    if "targets" not in self.operation.attributes:
      return None
    return self.operation.attributes["targets"]

  @targets.setter
  def targets(self, value: _Optional[_ods_ir.ArrayAttr]):
    if value is not None:
      self.operation.attributes["targets"] = value
    elif "targets" in self.operation.attributes:
      del self.operation.attributes["targets"]

  @targets.deleter
  def targets(self):
    del self.operation.attributes["targets"]

  @builtins.property
  def offloadingHandler(self) -> _Optional[_ods_ir.Attribute]:
    """Optional offloading-translation attribute."""
    if "offloadingHandler" not in self.operation.attributes:
      return None
    return self.operation.attributes["offloadingHandler"]

  @offloadingHandler.setter
  def offloadingHandler(self, value: _Optional[_ods_ir.Attribute]):
    if value is not None:
      self.operation.attributes["offloadingHandler"] = value
    elif "offloadingHandler" in self.operation.attributes:
      del self.operation.attributes["offloadingHandler"]

  @offloadingHandler.deleter
  def offloadingHandler(self):
    del self.operation.attributes["offloadingHandler"]

  @builtins.property
  def bodyRegion(self) -> _ods_ir.Region:
    """The single region holding the module's top-level block."""
    return self.regions[0]

def module(sym_name, *, targets=None, offloading_handler=None, loc=None, ip=None) -> GPUModuleOp:
  """Build a `gpu.module` op named `sym_name` and return the op."""
  return GPUModuleOp(
      sym_name=sym_name,
      targets=targets,
      offloadingHandler=offloading_handler,
      loc=loc,
      ip=ip)

@_ods_cext.register_operation(_Dialect)
class GlobalIdOp(_ods_ir.OpView):
  r"""
  Returns the unique global workitem/thread id, i.e., the unique index of the
  current workitem/thread within all workgroups / grid along the x, y, or z
  `dimension`.
  
  Example:
  
  ```mlir
  %gidX = gpu.global_id x
  %gidX = gpu.global_id x upper_bound 65536
  ```
  
  The `upper_bound` attribute defines an upper bound analogously to the ones on
  `thread_id` and `block_id`. If one is not set, the bound may be inferred from
  a combination of `known_block_size` and `known_grid_size`-type annotations.
  """

  OPERATION_NAME = "gpu.global_id"

  _ODS_REGIONS = (0, True)

  def __init__(self, dimension, *, upper_bound=None, results=None, loc=None, ip=None):
    """Build a `gpu.global_id` op for `dimension`, optionally bounded above."""
    ctx = _ods_get_default_loc_context(loc)

    def _to_attr(value, builder_name):
      # Use the value verbatim when it is already an Attribute or when no
      # ODS attribute builder of that name is registered; otherwise convert.
      if isinstance(value, _ods_ir.Attribute) or not _ods_ir.AttrBuilder.contains(builder_name):
        return value
      return _ods_ir.AttrBuilder.get(builder_name)(value, context=ctx)

    attributes = {"dimension": _to_attr(dimension, 'GPU_DimensionAttr')}
    if upper_bound is not None:
      attributes["upper_bound"] = _to_attr(upper_bound, 'IndexAttr')
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS,
                     self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS,
                     attributes=attributes, results=results, operands=[],
                     successors=None, regions=None, loc=loc, ip=ip)

  @builtins.property
  def dimension(self) -> _ods_ir.Attribute:
    """The mandatory x/y/z dimension attribute."""
    return self.operation.attributes["dimension"]

  @dimension.setter
  def dimension(self, value: _ods_ir.Attribute):
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["dimension"] = value

  @builtins.property
  def upper_bound(self) -> _Optional[_ods_ir.IntegerAttr]:
    """The optional index upper bound, or None when the attribute is absent."""
    attrs = self.operation.attributes
    return attrs["upper_bound"] if "upper_bound" in attrs else None

  @upper_bound.setter
  def upper_bound(self, value: _Optional[_ods_ir.IntegerAttr]):
    # Assigning None clears the optional attribute if present.
    if value is None:
      if "upper_bound" in self.operation.attributes:
        del self.operation.attributes["upper_bound"]
    else:
      self.operation.attributes["upper_bound"] = value

  @upper_bound.deleter
  def upper_bound(self):
    del self.operation.attributes["upper_bound"]

def global_id(dimension, *, upper_bound=None, results=None, loc=None, ip=None) -> _ods_ir.OpResult:
  """Build a GlobalIdOp and return its index result."""
  op = GlobalIdOp(dimension=dimension, upper_bound=upper_bound, results=results, loc=loc, ip=ip)
  return op.result

@_ods_cext.register_operation(_Dialect)
class GridDimOp(_ods_ir.OpView):
  r"""
  Returns the number of thread blocks in the grid along the x, y, or z
  `dimension`.
  
  Example:
  
  ```mlir
  %gDimZ = gpu.grid_dim z
  ```
  
  
  If `known_grid_size` is set on an this operation's enclosing `gpu.func`,
  or `gpu.known_grid_size` is set on an enclosing `FunctionOpInterface`
  implementor, or if the enclosing `gpu.launch` specifies a constant size for
  `dimension`'s grid length, these contextual facts may be used to infer that this
  operation has a constant value, though such a transformation will not be
  performed by canonicalization or the default constant folder. Executions which
  cause that constant-value assumption to be false incur undefined behavior.
  
  If `upper_bound` is set, executions where the grid size in `dimension` would
  exceed `upper_bound` cause undefined behavior.
  
  There is an implicit upper bound of `kMaxDim` (currently uint32_t::max).
  """

  OPERATION_NAME = "gpu.grid_dim"

  _ODS_REGIONS = (0, True)

  def __init__(self, dimension, *, upper_bound=None, results=None, loc=None, ip=None):
    """Build a `gpu.grid_dim` op for `dimension` with an optional upper bound.

    Non-Attribute arguments are converted via the registered ODS attribute
    builders when available.
    """
    operands = []
    attributes = {}
    regions = None
    _ods_context = _ods_get_default_loc_context(loc)
    # Mandatory dimension attribute (x/y/z).
    attributes["dimension"] = (dimension if (
    isinstance(dimension, _ods_ir.Attribute) or
    not _ods_ir.AttrBuilder.contains('GPU_DimensionAttr')) else
      _ods_ir.AttrBuilder.get('GPU_DimensionAttr')(dimension, context=_ods_context))
    # Optional index upper bound (see the class docstring).
    if upper_bound is not None: attributes["upper_bound"] = (upper_bound if (
        isinstance(upper_bound, _ods_ir.Attribute) or
        not _ods_ir.AttrBuilder.contains('IndexAttr')) else
          _ods_ir.AttrBuilder.get('IndexAttr')(upper_bound, context=_ods_context))
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  # Mandatory attribute: assigning None raises rather than deleting.
  @builtins.property
  def dimension(self) -> _ods_ir.Attribute:
    """The mandatory x/y/z dimension attribute."""
    return self.operation.attributes["dimension"]

  @dimension.setter
  def dimension(self, value: _ods_ir.Attribute):
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["dimension"] = value

  # Optional attribute: getter returns None when absent; setting None
  # removes the attribute.
  @builtins.property
  def upper_bound(self) -> _Optional[_ods_ir.IntegerAttr]:
    """The optional index upper bound, or None when absent."""
    if "upper_bound" not in self.operation.attributes:
      return None
    return self.operation.attributes["upper_bound"]

  @upper_bound.setter
  def upper_bound(self, value: _Optional[_ods_ir.IntegerAttr]):
    if value is not None:
      self.operation.attributes["upper_bound"] = value
    elif "upper_bound" in self.operation.attributes:
      del self.operation.attributes["upper_bound"]

  @upper_bound.deleter
  def upper_bound(self):
    del self.operation.attributes["upper_bound"]

def grid_dim(dimension, *, upper_bound=None, results=None, loc=None, ip=None) -> _ods_ir.OpResult:
  """Build a GridDimOp and return its index result."""
  op = GridDimOp(dimension=dimension, upper_bound=upper_bound, results=results, loc=loc, ip=ip)
  return op.result

@_ods_cext.register_operation(_Dialect)
class HostRegisterOp(_ods_ir.OpView):
  r"""
  This op maps the provided host buffer into the device address space.
  
  This operation may not be supported in every environment, there is not yet a
  way to check at runtime whether this feature is supported.
  
  Writes from the host are guaranteed to be visible to device kernels that are
  launched afterwards. Writes from the device are guaranteed to be visible on
  the host after synchronizing with the device kernel completion.
  """

  OPERATION_NAME = "gpu.host_register"

  _ODS_REGIONS = (0, True)

  def __init__(self, value, *, loc=None, ip=None):
    """Build a `gpu.host_register` op mapping `value` into device memory."""
    # Resolve the default context as other ODS builders do; the op carries
    # no attributes, results, successors, or regions.
    _ods_context = _ods_get_default_loc_context(loc)
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS,
                     self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS,
                     attributes={}, results=[], operands=[value],
                     successors=None, regions=None, loc=loc, ip=ip)

  @builtins.property
  def value(self) -> _ods_ir.Value[_ods_ir.UnrankedMemRefType]:
    """The host buffer operand being registered."""
    return self.operation.operands[0]

def host_register(value, *, loc=None, ip=None) -> HostRegisterOp:
  """Build a HostRegisterOp for `value` and return the op (no results)."""
  return HostRegisterOp(
      value=value,
      loc=loc,
      ip=ip)

@_ods_cext.register_operation(_Dialect)
class HostUnregisterOp(_ods_ir.OpView):
  r"""
  This op unmaps the provided host buffer from the device address space.
  
  This operation may not be supported in every environment, there is not yet a
  way to check at runtime whether this feature is supported.
  """

  OPERATION_NAME = "gpu.host_unregister"

  _ODS_REGIONS = (0, True)

  def __init__(self, value, *, loc=None, ip=None):
    """Build a `gpu.host_unregister` op unmapping `value` from device memory."""
    # Mirrors HostRegisterOp: a single operand and nothing else.
    _ods_context = _ods_get_default_loc_context(loc)
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS,
                     self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS,
                     attributes={}, results=[], operands=[value],
                     successors=None, regions=None, loc=loc, ip=ip)

  @builtins.property
  def value(self) -> _ods_ir.Value[_ods_ir.UnrankedMemRefType]:
    """The host buffer operand being unregistered."""
    return self.operation.operands[0]

def host_unregister(value, *, loc=None, ip=None) -> HostUnregisterOp:
  """Build a HostUnregisterOp for `value` and return the op (no results)."""
  return HostUnregisterOp(
      value=value,
      loc=loc,
      ip=ip)

@_ods_cext.register_operation(_Dialect)
class LaneIdOp(_ods_ir.OpView):
  r"""
  Returns the lane id within the subgroup (warp/wave).
  
  Example:
  ```mlir
  %laneId = gpu.lane_id
  ```
  
  If `upper_bound` is set, executions with more than `upper_bound` lanes per
  subgroup cause undefined behavior. In the absence of `upper_bound`,
  the lane id is still assumed to be non-negative and less than the
  target-independent `kMaxSubgroupSize` (currently 128).
  """

  OPERATION_NAME = "gpu.lane_id"

  _ODS_REGIONS = (0, True)

  def __init__(self, *, upper_bound=None, results=None, loc=None, ip=None):
    """Build a `gpu.lane_id` op with an optional `upper_bound` attribute.

    A non-Attribute `upper_bound` is converted via the registered
    'IndexAttr' ODS attribute builder when available.
    """
    operands = []
    attributes = {}
    regions = None
    _ods_context = _ods_get_default_loc_context(loc)
    if upper_bound is not None: attributes["upper_bound"] = (upper_bound if (
        isinstance(upper_bound, _ods_ir.Attribute) or
        not _ods_ir.AttrBuilder.contains('IndexAttr')) else
          _ods_ir.AttrBuilder.get('IndexAttr')(upper_bound, context=_ods_context))
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  # Optional attribute: getter returns None when absent; setting None
  # removes the attribute.
  @builtins.property
  def upper_bound(self) -> _Optional[_ods_ir.IntegerAttr]:
    """The optional lanes-per-subgroup upper bound, or None when absent."""
    if "upper_bound" not in self.operation.attributes:
      return None
    return self.operation.attributes["upper_bound"]

  @upper_bound.setter
  def upper_bound(self, value: _Optional[_ods_ir.IntegerAttr]):
    if value is not None:
      self.operation.attributes["upper_bound"] = value
    elif "upper_bound" in self.operation.attributes:
      del self.operation.attributes["upper_bound"]

  @upper_bound.deleter
  def upper_bound(self):
    del self.operation.attributes["upper_bound"]

  @builtins.property
  def result(self) -> _ods_ir.OpResult[_ods_ir.IndexType]:
    """The lane id result (index type)."""
    return self.operation.results[0]

def lane_id(*, upper_bound=None, results=None, loc=None, ip=None) -> _ods_ir.OpResult:
  """Build a LaneIdOp and return its index result."""
  op = LaneIdOp(upper_bound=upper_bound, results=results, loc=loc, ip=ip)
  return op.result

@_ods_cext.register_operation(_Dialect)
class LaunchFuncOp(_ods_ir.OpView):
  r"""
  Launch a kernel function on the specified grid of thread blocks.
  `gpu.launch` operations are lowered to `gpu.launch_func` operations by
  outlining the kernel body into a function in a dedicated module, which
  reflects the separate compilation process. The kernel function is required
  to have the `gpu.kernel` attribute. The module containing the kernel
  function is required to be a gpu.module. And finally, the module containing
  the kernel module (which thus cannot be the top-level module) is required
  to have the `gpu.container_module` attribute. The `gpu.launch_func`
  operation has a symbol attribute named `kernel` to identify the fully
  specified kernel function to launch (both the gpu.module and func).
  
  The `gpu.launch_func` supports async dependencies: the kernel does not start
  executing until the ops producing those async dependencies have completed.
  
  By the default, the host implicitly blocks until kernel execution has
  completed. If the `async` keyword is present, the host does not block but
  instead a `!gpu.async.token` is returned. Other async GPU ops can take this
  token as dependency.
  
  The operation requires at least the grid and block sizes along the x,y,z
  dimensions as arguments. When a lower-dimensional kernel is required,
  unused sizes must be explicitly set to `1`.
  
  The remaining operands are optional. The first optional operand corresponds
  to the amount of dynamic shared memory a kernel's workgroup should be
  allocated; when this operand is not present, a zero size is assumed.
  
  The remaining operands if present are passed as arguments to the kernel
  function.
  
  The `gpu.launch_func` also supports kernel launching with clusters if
  supported by the target architecture. The cluster size can be set by
  `clusterSizeX`, `clusterSizeY`, and `clusterSizeZ` arguments. When these
  arguments are present, the Op launches a kernel that clusters the given
  thread blocks. This feature is exclusive to certain architectures.
  
  Example:
  
  ```mlir
  module attributes {gpu.container_module} {
  
    // This module creates a separate compilation unit for the GPU compiler.
    gpu.module @kernels {
      func.func @kernel_1(%arg0 : f32, %arg1 : memref<?xf32, 1>)
          attributes { nvvm.kernel = true } {
  
        // Operations that produce block/thread IDs and dimensions are
        // injected when outlining the `gpu.launch` body to a function called
        // by `gpu.launch_func`.
        %tIdX = gpu.thread_id x
        %tIdY = gpu.thread_id y
        %tIdZ = gpu.thread_id z
  
        %bDimX = gpu.block_dim x
        %bDimY = gpu.block_dim y
        %bDimZ = gpu.block_dim z
  
        %bIdX = gpu.block_id x
        %bIdY = gpu.block_id y
        %bIdZ = gpu.block_id z
  
        %gDimX = gpu.grid_dim x
        %gDimY = gpu.grid_dim y
        %gDimZ = gpu.grid_dim z
  
        // (Optional)  Cluster size only for support architectures
        %cIdX = gpu.cluster_id x
        %cIdY = gpu.cluster_id y
        %cIdZ = gpu.cluster_id z
  
        %cDimX = gpu.cluster_dim x
        %cDimY = gpu.cluster_dim y
        %cDimZ = gpu.cluster_dim z
  
        "some_op"(%bx, %tx) : (index, index) -> ()
        %42 = load %arg1[%bx] : memref<?xf32, 1>
      }
    }
  
    %t0 = gpu.wait async
    gpu.launch_func
        async                           // (Optional) Don't block host, return token.
        [%t0]                           // (Optional) Execute only after %t0 has completed.
        @kernels::@kernel_1             // Kernel function.
        clusters in (%cst, %cst, %cst)  // (Optional) Cluster size only for support architectures.
        blocks in (%cst, %cst, %cst)    // Grid size.
        threads in (%cst, %cst, %cst)   // Block size.
        dynamic_shared_memory_size %s   // (Optional) Amount of dynamic shared
                                        // memory to allocate for a workgroup.
        args(%arg0 : f32,               // (Optional) Kernel arguments.
             %arg1 : memref<?xf32, 1>)
  }
  ```
  """

  OPERATION_NAME = "gpu.launch_func"

  _ODS_OPERAND_SEGMENTS = [-1,1,1,1,1,1,1,0,0,0,0,-1,0,]

  _ODS_REGIONS = (0, True)

  def __init__(self, asyncToken, asyncDependencies, kernel, gridSizeX, gridSizeY, gridSizeZ, blockSizeX, blockSizeY, blockSizeZ, kernelOperands, *, clusterSizeX=None, clusterSizeY=None, clusterSizeZ=None, dynamicSharedMemorySize=None, asyncObject=None, loc=None, ip=None):
    """Build a `gpu.launch_func` op.

    Args:
      asyncToken: result type for the optional `!gpu.async.token`, or None
        for the synchronous (blocking) form.
      asyncDependencies: variadic async-token operands the launch waits on.
      kernel: symbol reference to the kernel (gpu.module::gpu.func).
      gridSizeX/Y/Z, blockSizeX/Y/Z: required launch dimension operands.
      kernelOperands: variadic operands forwarded to the kernel function.
      clusterSizeX/Y/Z: optional cluster dimensions (architecture-specific).
      dynamicSharedMemorySize: optional dynamic workgroup memory size.
      asyncObject: optional trailing operand.
    """
    operands = []
    attributes = {}
    regions = None
    # Operands are appended in ODS segment order; variadic groups are
    # appended as nested lists so _ODS_OPERAND_SEGMENTS can size them, and
    # optional single operands may legitimately be None.
    operands.append(_get_op_results_or_values(asyncDependencies))
    operands.append(gridSizeX)
    operands.append(gridSizeY)
    operands.append(gridSizeZ)
    operands.append(blockSizeX)
    operands.append(blockSizeY)
    operands.append(blockSizeZ)
    operands.append(clusterSizeX)
    operands.append(clusterSizeY)
    operands.append(clusterSizeZ)
    operands.append(dynamicSharedMemorySize)
    operands.append(_get_op_results_or_values(kernelOperands))
    operands.append(asyncObject)
    _ods_context = _ods_get_default_loc_context(loc)
    # Mandatory symbol reference naming the kernel to launch.
    attributes["kernel"] = (kernel if (
    isinstance(kernel, _ods_ir.Attribute) or
    not _ods_ir.AttrBuilder.contains('SymbolRefAttr')) else
      _ods_ir.AttrBuilder.get('SymbolRefAttr')(kernel, context=_ods_context))
    results = []
    # The token result exists only for the `async` form of the op.
    if asyncToken is not None: results.append(asyncToken)
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def asyncDependencies(self) -> _ods_ir.OpOperandList:
    """Variadic async-dependency token operands (operand segment 0)."""
    return _ods_segmented_accessor(
        self.operation.operands,
        self.operation.attributes["operandSegmentSizes"], 0)

  @builtins.property
  def gridSizeX(self) -> _ods_ir.Value:
    """The grid size along x (single value in operand segment 1)."""
    return _ods_segmented_accessor(
        self.operation.operands,
        self.operation.attributes["operandSegmentSizes"], 1)[0]

  @builtins.property
  def gridSizeY(self) -> _ods_ir.Value:
    """The grid size along y (single value in operand segment 2)."""
    return _ods_segmented_accessor(
        self.operation.operands,
        self.operation.attributes["operandSegmentSizes"], 2)[0]

  @builtins.property
  def gridSizeZ(self) -> _ods_ir.Value:
    """The grid size along z (single value in operand segment 3)."""
    return _ods_segmented_accessor(
        self.operation.operands,
        self.operation.attributes["operandSegmentSizes"], 3)[0]

  @builtins.property
  def blockSizeX(self) -> _ods_ir.Value:
    """The block size along x (single value in operand segment 4)."""
    return _ods_segmented_accessor(
        self.operation.operands,
        self.operation.attributes["operandSegmentSizes"], 4)[0]

  @builtins.property
  def blockSizeY(self) -> _ods_ir.Value:
    """The block size along y (single value in operand segment 5)."""
    return _ods_segmented_accessor(
        self.operation.operands,
        self.operation.attributes["operandSegmentSizes"], 5)[0]

  @builtins.property
  def blockSizeZ(self) -> _ods_ir.Value:
    """The block size along z (single value in operand segment 6)."""
    return _ods_segmented_accessor(
        self.operation.operands,
        self.operation.attributes["operandSegmentSizes"], 6)[0]

  @builtins.property
  def clusterSizeX(self) -> _Optional[_ods_ir.Value]:
    """The optional cluster size along x (operand segment 7), or None."""
    segment = _ods_segmented_accessor(
        self.operation.operands,
        self.operation.attributes["operandSegmentSizes"], 7)
    if len(segment) == 0:
      return None
    return segment[0]

  @builtins.property
  def clusterSizeY(self) -> _Optional[_ods_ir.Value]:
    """The optional cluster size along y (operand segment 8), or None."""
    segment = _ods_segmented_accessor(
        self.operation.operands,
        self.operation.attributes["operandSegmentSizes"], 8)
    if len(segment) == 0:
      return None
    return segment[0]

  @builtins.property
  def clusterSizeZ(self) -> _Optional[_ods_ir.Value]:
    """The optional cluster size along z (operand segment 9), or None."""
    segment = _ods_segmented_accessor(
        self.operation.operands,
        self.operation.attributes["operandSegmentSizes"], 9)
    if len(segment) == 0:
      return None
    return segment[0]

  @builtins.property
  def dynamicSharedMemorySize(self) -> _Optional[_ods_ir.Value[_ods_ir.IntegerType]]:
    """The optional dynamic-shared-memory size (operand segment 10), or None."""
    segment = _ods_segmented_accessor(
        self.operation.operands,
        self.operation.attributes["operandSegmentSizes"], 10)
    if len(segment) == 0:
      return None
    return segment[0]

  @builtins.property
  def kernelOperands(self) -> _ods_ir.OpOperandList:
    """Variadic operands forwarded to the kernel (operand segment 11)."""
    return _ods_segmented_accessor(
        self.operation.operands,
        self.operation.attributes["operandSegmentSizes"], 11)

  @builtins.property
  def asyncObject(self) -> _Optional[_ods_ir.Value]:
    """The optional async object operand (operand segment 12), or None."""
    segment = _ods_segmented_accessor(
        self.operation.operands,
        self.operation.attributes["operandSegmentSizes"], 12)
    if len(segment) == 0:
      return None
    return segment[0]

  @builtins.property
  def kernel(self) -> _ods_ir.SymbolRefAttr:
    """The mandatory 'kernel' symbol-reference attribute."""
    return self.operation.attributes["kernel"]

  @kernel.setter
  def kernel(self, value: _ods_ir.SymbolRefAttr):
    """Replace 'kernel'; rejects None since the attribute is mandatory."""
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["kernel"] = value

  @builtins.property
  def asyncToken(self) -> _Optional[_ods_ir.OpResult]:
    """The optional !gpu.async.token result, or None when absent."""
    results = self.operation.results
    return results[0] if len(results) >= 1 else None

def launch_func(async_token, async_dependencies, kernel, grid_size_x, grid_size_y, grid_size_z, block_size_x, block_size_y, block_size_z, kernel_operands, *, cluster_size_x=None, cluster_size_y=None, cluster_size_z=None, dynamic_shared_memory_size=None, async_object=None, loc=None, ip=None) -> _Union[_ods_ir.OpResult, _ods_ir.OpResultList, LaunchFuncOp]:
  """Create a gpu.launch_func op.

  Returns the result list when there are several results, the sole result
  when there is exactly one, and the op itself when it has none.
  """
  op = LaunchFuncOp(
      asyncToken=async_token,
      asyncDependencies=async_dependencies,
      kernel=kernel,
      gridSizeX=grid_size_x,
      gridSizeY=grid_size_y,
      gridSizeZ=grid_size_z,
      blockSizeX=block_size_x,
      blockSizeY=block_size_y,
      blockSizeZ=block_size_z,
      kernelOperands=kernel_operands,
      clusterSizeX=cluster_size_x,
      clusterSizeY=cluster_size_y,
      clusterSizeZ=cluster_size_z,
      dynamicSharedMemorySize=dynamic_shared_memory_size,
      asyncObject=async_object,
      loc=loc,
      ip=ip)
  results = op.results
  if len(results) > 1:
    return results
  return results[0] if len(results) == 1 else op

@_ods_cext.register_operation(_Dialect)
class LaunchOp(_ods_ir.OpView):
  r"""
  Launch a kernel on a grid of thread blocks; the kernel body is the single
  region this operation contains.

  The op takes an optional list of async dependencies followed by six
  mandatory `index` operands — grid sizes along x, y, z, then block sizes
  along x, y, z — plus three optional cluster sizes and an optional
  `dynamic_shared_memory_size` operand (zero is assumed when absent). Unused
  dimensions must be set to `1` explicitly. With the `async` keyword the op
  returns a new !gpu.async.token and does not start executing until the ops
  producing its async dependencies have completed.

  The body region has at least twelve arguments (eighteen with cluster
  dimensions), grouped as: optional cluster ids along x,y,z; block ids along
  x,y,z; thread ids along x,y,z; the launch operands (grid and block sizes);
  then a variadic number of workgroup memory attributions followed by a
  variadic number of private memory attributions.

  The optional `module` and `function` attributes specify the module and the
  kernel name the body should be outlined into.

  Example:

  ```mlir
  gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %0, %sz_by = %1, %sz_bz = %2)
             threads(%tx, %ty, %tz) in (%sz_tx = %3, %sz_ty = %4, %sz_tz = %5) {
    // Block and thread identifiers, as well as block/grid sizes are
    // immediately usable inside body region.
    "some_op"(%bx, %tx) : (index, index) -> ()
  }

  // With clusters.
  gpu.launch clusters(%cx, %cy, %cz) in (%sz_cx = %0, %sz_cy = %1, %sz_cz = %2)
             blocks(%bx, %by, %bz) in (%sz_bx = %3, %sz_by = %4, %sz_bz = %5)
             threads(%tx, %ty, %tz) in (%sz_tx = %6, %sz_ty = %7, %sz_tz = %8)
  {
    "some_op"(%cx, %bx, %tx) : (index, index, index) -> ()
  }
  ```

  Rationale: passing identifiers and sizes as block arguments gives analyses
  this information by construction (e.g., which value is threadIdx.x for
  coalescing) instead of requiring them to recover it from producing ops.
  """

  OPERATION_NAME = "gpu.launch"

  # Segments: variadic async deps; six required sizes; three optional
  # cluster sizes; one optional dynamic shared memory size.
  _ODS_OPERAND_SEGMENTS = [-1,1,1,1,1,1,1,0,0,0,0,]

  _ODS_REGIONS = (1, True)

  def __init__(self, asyncToken, asyncDependencies, gridSizeX, gridSizeY, gridSizeZ, blockSizeX, blockSizeY, blockSizeZ, *, clusterSizeX=None, clusterSizeY=None, clusterSizeZ=None, dynamicSharedMemorySize=None, module=None, function=None, loc=None, ip=None):
    """Build a gpu.launch operation.

    asyncToken is the optional result type (None to omit the result).
    Operands are assembled in the order required by _ODS_OPERAND_SEGMENTS.
    `module` and `function` may be Attributes or any value accepted by the
    registered FlatSymbolRefAttr builder.
    """
    attributes = {}
    regions = None
    operands = [_get_op_results_or_values(asyncDependencies)]
    operands += [gridSizeX, gridSizeY, gridSizeZ]
    operands += [blockSizeX, blockSizeY, blockSizeZ]
    operands += [clusterSizeX, clusterSizeY, clusterSizeZ]
    operands.append(dynamicSharedMemorySize)
    _ods_context = _ods_get_default_loc_context(loc)
    for attr_name, attr_value in (("module", module), ("function", function)):
      if attr_value is None:
        continue
      # Keep Attribute values (or values with no registered builder) as-is;
      # otherwise run them through the FlatSymbolRefAttr builder.
      if isinstance(attr_value, _ods_ir.Attribute) or not _ods_ir.AttrBuilder.contains('FlatSymbolRefAttr'):
        attributes[attr_name] = attr_value
      else:
        attributes[attr_name] = _ods_ir.AttrBuilder.get('FlatSymbolRefAttr')(attr_value, context=_ods_context)
    results = [] if asyncToken is None else [asyncToken]
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  def _segment(self, index):
    """Return the operand segment `index` per the operandSegmentSizes attr."""
    return _ods_segmented_accessor(
        self.operation.operands,
        self.operation.attributes["operandSegmentSizes"], index)

  @builtins.property
  def asyncDependencies(self) -> _ods_ir.OpOperandList:
    """Variadic async-dependency token operands (segment 0)."""
    return self._segment(0)

  @builtins.property
  def gridSizeX(self) -> _ods_ir.Value[_ods_ir.IndexType]:
    """Grid size along x (segment 1)."""
    return self._segment(1)[0]

  @builtins.property
  def gridSizeY(self) -> _ods_ir.Value[_ods_ir.IndexType]:
    """Grid size along y (segment 2)."""
    return self._segment(2)[0]

  @builtins.property
  def gridSizeZ(self) -> _ods_ir.Value[_ods_ir.IndexType]:
    """Grid size along z (segment 3)."""
    return self._segment(3)[0]

  @builtins.property
  def blockSizeX(self) -> _ods_ir.Value[_ods_ir.IndexType]:
    """Block size along x (segment 4)."""
    return self._segment(4)[0]

  @builtins.property
  def blockSizeY(self) -> _ods_ir.Value[_ods_ir.IndexType]:
    """Block size along y (segment 5)."""
    return self._segment(5)[0]

  @builtins.property
  def blockSizeZ(self) -> _ods_ir.Value[_ods_ir.IndexType]:
    """Block size along z (segment 6)."""
    return self._segment(6)[0]

  @builtins.property
  def clusterSizeX(self) -> _Optional[_ods_ir.Value[_ods_ir.IndexType]]:
    """Optional cluster size along x (segment 7), or None."""
    segment = self._segment(7)
    return segment[0] if len(segment) > 0 else None

  @builtins.property
  def clusterSizeY(self) -> _Optional[_ods_ir.Value[_ods_ir.IndexType]]:
    """Optional cluster size along y (segment 8), or None."""
    segment = self._segment(8)
    return segment[0] if len(segment) > 0 else None

  @builtins.property
  def clusterSizeZ(self) -> _Optional[_ods_ir.Value[_ods_ir.IndexType]]:
    """Optional cluster size along z (segment 9), or None."""
    segment = self._segment(9)
    return segment[0] if len(segment) > 0 else None

  @builtins.property
  def dynamicSharedMemorySize(self) -> _Optional[_ods_ir.Value[_ods_ir.IntegerType]]:
    """Optional dynamic shared memory size (segment 10), or None."""
    segment = self._segment(10)
    return segment[0] if len(segment) > 0 else None

  @builtins.property
  def module(self) -> _Optional[_ods_ir.FlatSymbolRefAttr]:
    """The optional 'module' attribute, or None when unset."""
    attrs = self.operation.attributes
    return attrs["module"] if "module" in attrs else None

  @module.setter
  def module(self, value: _Optional[_ods_ir.FlatSymbolRefAttr]):
    """Set 'module'; assigning None removes the attribute if present."""
    if value is None:
      if "module" in self.operation.attributes:
        del self.operation.attributes["module"]
    else:
      self.operation.attributes["module"] = value

  @module.deleter
  def module(self):
    del self.operation.attributes["module"]

  @builtins.property
  def function(self) -> _Optional[_ods_ir.FlatSymbolRefAttr]:
    """The optional 'function' attribute, or None when unset."""
    attrs = self.operation.attributes
    return attrs["function"] if "function" in attrs else None

  @function.setter
  def function(self, value: _Optional[_ods_ir.FlatSymbolRefAttr]):
    """Set 'function'; assigning None removes the attribute if present."""
    if value is None:
      if "function" in self.operation.attributes:
        del self.operation.attributes["function"]
    else:
      self.operation.attributes["function"] = value

  @function.deleter
  def function(self):
    del self.operation.attributes["function"]

  @builtins.property
  def asyncToken(self) -> _Optional[_ods_ir.OpResult]:
    """The optional !gpu.async.token result, or None when absent."""
    results = self.operation.results
    return results[0] if len(results) >= 1 else None

  @builtins.property
  def body(self) -> _ods_ir.Region:
    """The single region containing the kernel body."""
    return self.regions[0]

def launch(async_token, async_dependencies, grid_size_x, grid_size_y, grid_size_z, block_size_x, block_size_y, block_size_z, *, cluster_size_x=None, cluster_size_y=None, cluster_size_z=None, dynamic_shared_memory_size=None, module=None, function=None, loc=None, ip=None) -> _Union[_ods_ir.OpResult, _ods_ir.OpResultList, LaunchOp]:
  """Create a gpu.launch op.

  Returns the result list when there are several results, the sole result
  when there is exactly one, and the op itself when it has none.
  """
  op = LaunchOp(
      asyncToken=async_token,
      asyncDependencies=async_dependencies,
      gridSizeX=grid_size_x,
      gridSizeY=grid_size_y,
      gridSizeZ=grid_size_z,
      blockSizeX=block_size_x,
      blockSizeY=block_size_y,
      blockSizeZ=block_size_z,
      clusterSizeX=cluster_size_x,
      clusterSizeY=cluster_size_y,
      clusterSizeZ=cluster_size_z,
      dynamicSharedMemorySize=dynamic_shared_memory_size,
      module=module,
      function=function,
      loc=loc,
      ip=ip)
  results = op.results
  if len(results) > 1:
    return results
  return results[0] if len(results) == 1 else op

@_ods_cext.register_operation(_Dialect)
class MemcpyOp(_ods_ir.OpView):
  r"""
  The `gpu.memcpy` operation copies the content of one memref to another.

  The op does not execute before all async dependencies have finished
  executing. If the `async` keyword is present, the op is executed
  asynchronously (i.e. it does not block until the execution has finished
  on the device) and returns a !gpu.async.token.

  Example:

  ```mlir
  %token = gpu.memcpy async [%dep] %dst, %src : memref<?xf32, 1>, memref<?xf32>
  ```
  """

  OPERATION_NAME = "gpu.memcpy"

  _ODS_REGIONS = (0, True)

  def __init__(self, asyncToken, asyncDependencies, dst, src, *, loc=None, ip=None):
    """Build a gpu.memcpy: variadic async deps followed by dst then src."""
    attributes = {}
    regions = None
    operands = list(_get_op_results_or_values(asyncDependencies))
    operands += [dst, src]
    _ods_context = _ods_get_default_loc_context(loc)
    results = [] if asyncToken is None else [asyncToken]
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def asyncDependencies(self) -> _ods_ir.OpOperandList:
    """Every operand before the trailing (dst, src) pair."""
    return self.operation.operands[0:len(self.operation.operands) - 2]

  @builtins.property
  def dst(self) -> _ods_ir.Value[_ods_ir.MemRefType]:
    """The destination memref (second-to-last operand)."""
    return self.operation.operands[len(self.operation.operands) - 2]

  @builtins.property
  def src(self) -> _ods_ir.Value[_ods_ir.MemRefType]:
    """The source memref (last operand)."""
    return self.operation.operands[len(self.operation.operands) - 1]

  @builtins.property
  def asyncToken(self) -> _Optional[_ods_ir.OpResult]:
    """The optional !gpu.async.token result, or None when absent."""
    results = self.operation.results
    return results[0] if len(results) >= 1 else None

def memcpy(async_token, async_dependencies, dst, src, *, loc=None, ip=None) -> _Union[_ods_ir.OpResult, _ods_ir.OpResultList, MemcpyOp]:
  """Create a gpu.memcpy op; returns its results, or the op when it has none."""
  op = MemcpyOp(asyncToken=async_token, asyncDependencies=async_dependencies,
                dst=dst, src=src, loc=loc, ip=ip)
  results = op.results
  if len(results) > 1:
    return results
  return results[0] if len(results) == 1 else op

@_ods_cext.register_operation(_Dialect)
class MemsetOp(_ods_ir.OpView):
  r"""
  The `gpu.memset` operation sets the content of memref to a scalar value.

  The op does not execute before all async dependencies have finished
  executing. If the `async` keyword is present, the op is executed
  asynchronously (i.e. it does not block until the execution has finished
  on the device) and returns a !gpu.async.token.

  Example:

  ```mlir
  %token = gpu.memset async [%dep] %dst, %value : memref<?xf32, 1>, f32
  ```
  """

  OPERATION_NAME = "gpu.memset"

  _ODS_REGIONS = (0, True)

  def __init__(self, asyncToken, asyncDependencies, dst, value, *, loc=None, ip=None):
    """Build a gpu.memset: variadic async deps followed by dst then value."""
    attributes = {}
    regions = None
    operands = list(_get_op_results_or_values(asyncDependencies))
    operands += [dst, value]
    _ods_context = _ods_get_default_loc_context(loc)
    results = [] if asyncToken is None else [asyncToken]
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def asyncDependencies(self) -> _ods_ir.OpOperandList:
    """Every operand before the trailing (dst, value) pair."""
    return self.operation.operands[0:len(self.operation.operands) - 2]

  @builtins.property
  def dst(self) -> _ods_ir.Value[_ods_ir.MemRefType]:
    """The destination memref (second-to-last operand)."""
    return self.operation.operands[len(self.operation.operands) - 2]

  @builtins.property
  def value(self) -> _ods_ir.Value:
    """The scalar fill value (last operand)."""
    return self.operation.operands[len(self.operation.operands) - 1]

  @builtins.property
  def asyncToken(self) -> _Optional[_ods_ir.OpResult]:
    """The optional !gpu.async.token result, or None when absent."""
    results = self.operation.results
    return results[0] if len(results) >= 1 else None

def memset(async_token, async_dependencies, dst, value, *, loc=None, ip=None) -> _Union[_ods_ir.OpResult, _ods_ir.OpResultList, MemsetOp]:
  """Create a gpu.memset op; returns its results, or the op when it has none."""
  op = MemsetOp(asyncToken=async_token, asyncDependencies=async_dependencies,
                dst=dst, value=value, loc=loc, ip=ip)
  results = op.results
  if len(results) > 1:
    return results
  return results[0] if len(results) == 1 else op

@_ods_cext.register_operation(_Dialect)
class NumSubgroupsOp(_ods_ir.OpView):
  r"""
  Returns the number of subgroups within a workgroup.

  Example:

  ```mlir
  %numSg = gpu.num_subgroups : index
  ```

  If `upper_bound` is set, executions with more than `upper_bound` subgroups
  per workgroup cause undefined behavior. There is a default upper bound of
  `kMaxDim` (currently uint32_t::max).
  """

  OPERATION_NAME = "gpu.num_subgroups"

  _ODS_REGIONS = (0, True)

  def __init__(self, *, upper_bound=None, results=None, loc=None, ip=None):
    """Build a gpu.num_subgroups op with an optional upper_bound attribute."""
    operands = []
    attributes = {}
    regions = None
    _ods_context = _ods_get_default_loc_context(loc)
    if upper_bound is not None:
      # Keep Attribute values (or values with no registered builder) as-is;
      # otherwise convert through the IndexAttr builder.
      if isinstance(upper_bound, _ods_ir.Attribute) or not _ods_ir.AttrBuilder.contains('IndexAttr'):
        attributes["upper_bound"] = upper_bound
      else:
        attributes["upper_bound"] = _ods_ir.AttrBuilder.get('IndexAttr')(upper_bound, context=_ods_context)
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def upper_bound(self) -> _Optional[_ods_ir.IntegerAttr]:
    """The optional 'upper_bound' attribute, or None when unset."""
    attrs = self.operation.attributes
    return attrs["upper_bound"] if "upper_bound" in attrs else None

  @upper_bound.setter
  def upper_bound(self, value: _Optional[_ods_ir.IntegerAttr]):
    """Set 'upper_bound'; assigning None removes the attribute if present."""
    if value is None:
      if "upper_bound" in self.operation.attributes:
        del self.operation.attributes["upper_bound"]
    else:
      self.operation.attributes["upper_bound"] = value

  @upper_bound.deleter
  def upper_bound(self):
    del self.operation.attributes["upper_bound"]

  @builtins.property
  def result(self) -> _ods_ir.OpResult[_ods_ir.IndexType]:
    """The single index result."""
    return self.operation.results[0]

def num_subgroups(*, upper_bound=None, results=None, loc=None, ip=None) -> _ods_ir.OpResult:
  """Create a gpu.num_subgroups op and return its index result."""
  op = NumSubgroupsOp(upper_bound=upper_bound, results=results, loc=loc, ip=ip)
  return op.result

@_ods_cext.register_operation(_Dialect)
class PrintfOp(_ods_ir.OpView):
  r"""
  `gpu.printf` takes a literal format string `format` and an arbitrary
  number of scalar arguments that should be printed.

  The format string is a C-style printf string, subject to any restrictions
  imposed by one's target platform.
  """

  OPERATION_NAME = "gpu.printf"

  _ODS_REGIONS = (0, True)

  def __init__(self, format, args, *, loc=None, ip=None):
    """Build a gpu.printf op from a format attribute and variadic args."""
    attributes = {}
    regions = None
    operands = list(_get_op_results_or_values(args))
    _ods_context = _ods_get_default_loc_context(loc)
    # Keep Attribute values (or values with no registered builder) as-is;
    # otherwise convert through the StrAttr builder.
    if isinstance(format, _ods_ir.Attribute) or not _ods_ir.AttrBuilder.contains('StrAttr'):
      attributes["format"] = format
    else:
      attributes["format"] = _ods_ir.AttrBuilder.get('StrAttr')(format, context=_ods_context)
    results = []
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def args(self) -> _ods_ir.OpOperandList:
    """All operands: the variadic printf arguments."""
    return self.operation.operands[0:len(self.operation.operands)]

  @builtins.property
  def format(self) -> _ods_ir.StringAttr:
    """The mandatory 'format' string attribute."""
    return self.operation.attributes["format"]

  @format.setter
  def format(self, value: _ods_ir.StringAttr):
    """Replace 'format'; rejects None since the attribute is mandatory."""
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["format"] = value

def printf(format, args, *, loc=None, ip=None) -> PrintfOp:
  """Create a gpu.printf op printing `args` according to `format`."""
  return PrintfOp(format=format, args=args, loc=loc, ip=ip)

@_ods_cext.register_operation(_Dialect)
class ReturnOp(_ods_ir.OpView):
  r"""
  A terminator operation for regions that appear in the body of `gpu.func`
  functions. The operands to the `gpu.return` are the result values returned
  by an invocation of the `gpu.func`.
  """

  OPERATION_NAME = "gpu.return"

  _ODS_REGIONS = (0, True)

  def __init__(self, operands_, *, loc=None, ip=None):
    """Build a gpu.return terminator yielding `operands_`."""
    attributes = {}
    regions = None
    operands = list(_get_op_results_or_values(operands_))
    _ods_context = _ods_get_default_loc_context(loc)
    results = []
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def operands_(self) -> _ods_ir.OpOperandList:
    """All operands: the values returned to the caller."""
    return self.operation.operands[0:len(self.operation.operands)]

def return_(operands_, *, loc=None, ip=None) -> ReturnOp:
  """Create a gpu.return terminator op."""
  return ReturnOp(operands_=operands_, loc=loc, ip=ip)

@_ods_cext.register_operation(_Dialect)
class RotateOp(_ods_ir.OpView):
  r"""
  The "rotate" op moves values across lanes in a subgroup (a.k.a., local
  invocations) within the same subgroup. The `width` attribute specifies the
  number of lanes that participate in the rotation and must be uniform across
  all of them; additionally, the first `width` lanes of the subgroup must be
  active. `width` must be a power of two, and `offset` must be in the range
  `[0, width)`.

  Return the `rotateResult` of the invocation whose id within the group is
  calculated as follows:

  ```mlir
  Invocation ID = ((LaneId + offset) & (width - 1)) + (LaneId & ~(width - 1))
  ```

  Returns the `rotateResult` and `true` if the current lane id is smaller
  than `width`, and poison value and `false` otherwise. For lane `k`, this
  returns the value from lane `(k + offset) % width`.

  example:

  ```mlir
  %1, %2 = gpu.rotate %0, 1, 16 : f32
  ```
  """

  OPERATION_NAME = "gpu.rotate"

  _ODS_REGIONS = (0, True)

  def __init__(self, value, offset, width, *, results=None, loc=None, ip=None):
    """Build a gpu.rotate op from a value and I32 offset/width attributes."""
    attributes = {}
    regions = None
    operands = [value]
    _ods_context = _ods_get_default_loc_context(loc)
    for attr_name, attr_value in (("offset", offset), ("width", width)):
      # Keep Attribute values (or values with no registered builder) as-is;
      # otherwise convert through the I32Attr builder.
      if isinstance(attr_value, _ods_ir.Attribute) or not _ods_ir.AttrBuilder.contains('I32Attr'):
        attributes[attr_name] = attr_value
      else:
        attributes[attr_name] = _ods_ir.AttrBuilder.get('I32Attr')(attr_value, context=_ods_context)
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def value(self) -> _ods_ir.Value:
    """The value to rotate across lanes (sole operand)."""
    return self.operation.operands[0]

  @builtins.property
  def offset(self) -> _ods_ir.IntegerAttr:
    """The mandatory 'offset' attribute."""
    return self.operation.attributes["offset"]

  @offset.setter
  def offset(self, value: _ods_ir.IntegerAttr):
    """Replace 'offset'; rejects None since the attribute is mandatory."""
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["offset"] = value

  @builtins.property
  def width(self) -> _ods_ir.IntegerAttr:
    """The mandatory 'width' attribute."""
    return self.operation.attributes["width"]

  @width.setter
  def width(self, value: _ods_ir.IntegerAttr):
    """Replace 'width'; rejects None since the attribute is mandatory."""
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["width"] = value

  @builtins.property
  def rotateResult(self) -> _ods_ir.OpResult:
    """The rotated value (first result)."""
    return self.operation.results[0]

  @builtins.property
  def valid(self) -> _ods_ir.OpResult[_ods_ir.IntegerType]:
    """Whether the current lane participated (second result)."""
    return self.operation.results[1]

def rotate(value, offset, width, *, results=None, loc=None, ip=None) -> _ods_ir.OpResultList:
  """Create a gpu.rotate op and return both of its results."""
  op = RotateOp(value=value, offset=offset, width=width, results=results, loc=loc, ip=ip)
  return op.results

@_ods_cext.register_operation(_Dialect)
class SDDMMBufferSizeOp(_ods_ir.OpView):
  r"""
  The `gpu.sddmm_buffer_size` operation returns the buffer size required
  to perform the SDDMM operation on the given sparse and dense matrices.
  The operation expects handles returned by previous sparse operations
  to construct an environment and the operands for SDDMM.

  If the `async` keyword is present, the op is executed asynchronously (i.e.
  it does not block until the execution has finished on the device). In
  that case, it returns a !gpu.async.token in addition to the environment.

  Example:

  ```mlir
  %buffersz, %token = gpu.sddmm_buffer_size async [%dep] %dnmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %spmatC into f32
  ```

  The matrix arguments can also be associated with one of the following
  operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default value
  is NON_TRANSPOSE.
  """

  OPERATION_NAME = "gpu.sddmm_buffer_size"

  _ODS_REGIONS = (0, True)

  def __init__(self, bufferSz, asyncToken, asyncDependencies, dnmatA, dnmatB, spmatC, computeType, *, modeA=None, modeB=None, loc=None, ip=None):
    # Generated ODS builder: variadic async dependencies first, then the
    # three fixed operands (dnmatA, dnmatB, spmatC).
    operands = []
    attributes = {}
    regions = None
    operands.extend(_get_op_results_or_values(asyncDependencies))
    operands.append(dnmatA)
    operands.append(dnmatB)
    operands.append(spmatC)
    _ods_context = _ods_get_default_loc_context(loc)
    # Attribute args pass through unchanged when already MLIR Attributes (or
    # when no builder is registered); otherwise the registered AttrBuilder
    # converts the raw Python value.
    if modeA is not None: attributes["modeA"] = (modeA if (
        isinstance(modeA, _ods_ir.Attribute) or
        not _ods_ir.AttrBuilder.contains('GPU_TransposeModeAttr')) else
          _ods_ir.AttrBuilder.get('GPU_TransposeModeAttr')(modeA, context=_ods_context))
    if modeB is not None: attributes["modeB"] = (modeB if (
        isinstance(modeB, _ods_ir.Attribute) or
        not _ods_ir.AttrBuilder.contains('GPU_TransposeModeAttr')) else
          _ods_ir.AttrBuilder.get('GPU_TransposeModeAttr')(modeB, context=_ods_context))
    attributes["computeType"] = (computeType if (
    isinstance(computeType, _ods_ir.Attribute) or
    not _ods_ir.AttrBuilder.contains('TypeAttr')) else
      _ods_ir.AttrBuilder.get('TypeAttr')(computeType, context=_ods_context))
    results = []
    results.append(bufferSz)
    # The async token result is only present for the async form of the op.
    if asyncToken is not None: results.append(asyncToken)
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def asyncDependencies(self) -> _ods_ir.OpOperandList:
    # Single variadic group: it absorbs every operand beyond the 3 fixed ones
    # (4 operand groups total).
    _ods_variadic_group_length = len(self.operation.operands) - 4 + 1
    return self.operation.operands[0:0 + _ods_variadic_group_length]

  @builtins.property
  def dnmatA(self) -> _ods_ir.Value:
    # Fixed operand; its index is shifted past the variadic group.
    _ods_variadic_group_length = len(self.operation.operands) - 4 + 1
    return self.operation.operands[1 + _ods_variadic_group_length - 1]

  @builtins.property
  def dnmatB(self) -> _ods_ir.Value:
    _ods_variadic_group_length = len(self.operation.operands) - 4 + 1
    return self.operation.operands[2 + _ods_variadic_group_length - 1]

  @builtins.property
  def spmatC(self) -> _ods_ir.Value:
    _ods_variadic_group_length = len(self.operation.operands) - 4 + 1
    return self.operation.operands[3 + _ods_variadic_group_length - 1]

  @builtins.property
  def modeA(self) -> _ods_ir.Attribute:
    return self.operation.attributes["modeA"]

  @modeA.setter
  def modeA(self, value: _ods_ir.Attribute):
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["modeA"] = value

  @builtins.property
  def modeB(self) -> _ods_ir.Attribute:
    return self.operation.attributes["modeB"]

  @modeB.setter
  def modeB(self, value: _ods_ir.Attribute):
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["modeB"] = value

  @builtins.property
  def computeType(self) -> _ods_ir.TypeAttr:
    return self.operation.attributes["computeType"]

  @computeType.setter
  def computeType(self, value: _ods_ir.TypeAttr):
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["computeType"] = value

  @builtins.property
  def bufferSz(self) -> _ods_ir.OpResult[_ods_ir.IndexType]:
    return self.operation.results[0]

  @builtins.property
  def asyncToken(self) -> _Optional[_ods_ir.OpResult]:
    # Optional result: absent when the op was built without async form.
    return None if len(self.operation.results) < 2 else self.operation.results[1]

def sddmm_buffer_size(buffer_sz, async_token, async_dependencies, dnmat_a, dnmat_b, spmat_c, compute_type, *, mode_a=None, mode_b=None, loc=None, ip=None) -> _Union[_ods_ir.OpResult, _ods_ir.OpResultList, SDDMMBufferSizeOp]:
  """Create a `gpu.sddmm_buffer_size` op; return its result(s), or the op if it has none."""
  op = SDDMMBufferSizeOp(
      bufferSz=buffer_sz,
      asyncToken=async_token,
      asyncDependencies=async_dependencies,
      dnmatA=dnmat_a,
      dnmatB=dnmat_b,
      spmatC=spmat_c,
      computeType=compute_type,
      modeA=mode_a,
      modeB=mode_b,
      loc=loc,
      ip=ip,
  )
  op_results = op.results
  if len(op_results) == 0:
    return op
  if len(op_results) == 1:
    return op_results[0]
  return op_results

@_ods_cext.register_operation(_Dialect)
class SDDMMOp(_ods_ir.OpView):
  r"""
  The `gpu.sddmm` operation performs the SDDMM operation on the given sparse and
  dense matrices, and buffer.  The operation expects handles returned by previous
  sparse operations to construct an environment and the operands for SDDMM. The
  buffer must have been allocated on the device.

  If the `async` keyword is present, the op is executed asynchronously (i.e.
  it does not block until the execution has finished on the device). In
  that case, it returns a !gpu.async.token in addition to the environment.

  Example:

  ```mlir
  %token = gpu.sddmm async [%dep] %dnmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %spmatC, %buffer into f32
  ```

  The matrix arguments can also be associated with one of the following
  operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default value
  is NON_TRANSPOSE.
  """

  OPERATION_NAME = "gpu.sddmm"

  _ODS_REGIONS = (0, True)

  def __init__(self, asyncToken, asyncDependencies, dnmatA, dnmatB, spmatC, computeType, buffer, *, modeA=None, modeB=None, loc=None, ip=None):
    # Generated ODS builder: variadic async dependencies first, then the
    # four fixed operands (dnmatA, dnmatB, spmatC, buffer).
    operands = []
    attributes = {}
    regions = None
    operands.extend(_get_op_results_or_values(asyncDependencies))
    operands.append(dnmatA)
    operands.append(dnmatB)
    operands.append(spmatC)
    operands.append(buffer)
    _ods_context = _ods_get_default_loc_context(loc)
    # Attribute args pass through unchanged when already MLIR Attributes;
    # otherwise the registered AttrBuilder converts the raw Python value.
    if modeA is not None: attributes["modeA"] = (modeA if (
        isinstance(modeA, _ods_ir.Attribute) or
        not _ods_ir.AttrBuilder.contains('GPU_TransposeModeAttr')) else
          _ods_ir.AttrBuilder.get('GPU_TransposeModeAttr')(modeA, context=_ods_context))
    if modeB is not None: attributes["modeB"] = (modeB if (
        isinstance(modeB, _ods_ir.Attribute) or
        not _ods_ir.AttrBuilder.contains('GPU_TransposeModeAttr')) else
          _ods_ir.AttrBuilder.get('GPU_TransposeModeAttr')(modeB, context=_ods_context))
    attributes["computeType"] = (computeType if (
    isinstance(computeType, _ods_ir.Attribute) or
    not _ods_ir.AttrBuilder.contains('TypeAttr')) else
      _ods_ir.AttrBuilder.get('TypeAttr')(computeType, context=_ods_context))
    results = []
    # The async token result is only present for the async form of the op.
    if asyncToken is not None: results.append(asyncToken)
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def asyncDependencies(self) -> _ods_ir.OpOperandList:
    # Single variadic group: it absorbs every operand beyond the 4 fixed ones
    # (5 operand groups total).
    _ods_variadic_group_length = len(self.operation.operands) - 5 + 1
    return self.operation.operands[0:0 + _ods_variadic_group_length]

  @builtins.property
  def dnmatA(self) -> _ods_ir.Value:
    # Fixed operand; its index is shifted past the variadic group.
    _ods_variadic_group_length = len(self.operation.operands) - 5 + 1
    return self.operation.operands[1 + _ods_variadic_group_length - 1]

  @builtins.property
  def dnmatB(self) -> _ods_ir.Value:
    _ods_variadic_group_length = len(self.operation.operands) - 5 + 1
    return self.operation.operands[2 + _ods_variadic_group_length - 1]

  @builtins.property
  def spmatC(self) -> _ods_ir.Value:
    _ods_variadic_group_length = len(self.operation.operands) - 5 + 1
    return self.operation.operands[3 + _ods_variadic_group_length - 1]

  @builtins.property
  def buffer(self) -> _ods_ir.Value[_ods_ir.MemRefType]:
    _ods_variadic_group_length = len(self.operation.operands) - 5 + 1
    return self.operation.operands[4 + _ods_variadic_group_length - 1]

  @builtins.property
  def modeA(self) -> _ods_ir.Attribute:
    return self.operation.attributes["modeA"]

  @modeA.setter
  def modeA(self, value: _ods_ir.Attribute):
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["modeA"] = value

  @builtins.property
  def modeB(self) -> _ods_ir.Attribute:
    return self.operation.attributes["modeB"]

  @modeB.setter
  def modeB(self, value: _ods_ir.Attribute):
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["modeB"] = value

  @builtins.property
  def computeType(self) -> _ods_ir.TypeAttr:
    return self.operation.attributes["computeType"]

  @computeType.setter
  def computeType(self, value: _ods_ir.TypeAttr):
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["computeType"] = value

  @builtins.property
  def asyncToken(self) -> _Optional[_ods_ir.OpResult]:
    # Optional result: absent when the op was built without async form.
    return None if len(self.operation.results) < 1 else self.operation.results[0]

def sddmm(async_token, async_dependencies, dnmat_a, dnmat_b, spmat_c, compute_type, buffer, *, mode_a=None, mode_b=None, loc=None, ip=None) -> _Union[_ods_ir.OpResult, _ods_ir.OpResultList, SDDMMOp]:
  """Create a `gpu.sddmm` op; return its result(s), or the op if it has none."""
  op = SDDMMOp(
      asyncToken=async_token,
      asyncDependencies=async_dependencies,
      dnmatA=dnmat_a,
      dnmatB=dnmat_b,
      spmatC=spmat_c,
      computeType=compute_type,
      buffer=buffer,
      modeA=mode_a,
      modeB=mode_b,
      loc=loc,
      ip=ip,
  )
  op_results = op.results
  if len(op_results) == 0:
    return op
  if len(op_results) == 1:
    return op_results[0]
  return op_results

@_ods_cext.register_operation(_Dialect)
class SetCsrPointersOp(_ods_ir.OpView):
  r"""
  The `gpu.set_csr_pointers` assigns the given positions, coordinates,
  and values buffer that reside on the device directly to the given sparse
  matrix descriptor in csr format.

  If the `async` keyword is present, the op is executed asynchronously (i.e.
  it does not block until the execution has finished on the device). In
  that case, it returns a `!gpu.async.token` in addition to the environment.

  Example:

  ```mlir
  %token = gpu.set_csr_pointers async [%dep] %positions, %coordinates, %values
        : memref<?xf32>, memref<?xindex>, memref<?xindex>
  ```
  """

  OPERATION_NAME = "gpu.set_csr_pointers"

  _ODS_REGIONS = (0, True)

  def __init__(self, asyncToken, asyncDependencies, spmat, positions, coordinates, values, *, loc=None, ip=None):
    # Generated ODS builder: variadic async dependencies first, then the
    # four fixed operands (spmat, positions, coordinates, values).
    operands = []
    attributes = {}
    regions = None
    operands.extend(_get_op_results_or_values(asyncDependencies))
    operands.append(spmat)
    operands.append(positions)
    operands.append(coordinates)
    operands.append(values)
    _ods_context = _ods_get_default_loc_context(loc)
    results = []
    # The async token result is only present for the async form of the op.
    if asyncToken is not None: results.append(asyncToken)
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def asyncDependencies(self) -> _ods_ir.OpOperandList:
    # Single variadic group: it absorbs every operand beyond the 4 fixed ones
    # (5 operand groups total).
    _ods_variadic_group_length = len(self.operation.operands) - 5 + 1
    return self.operation.operands[0:0 + _ods_variadic_group_length]

  @builtins.property
  def spmat(self) -> _ods_ir.Value:
    # Fixed operand; its index is shifted past the variadic group.
    _ods_variadic_group_length = len(self.operation.operands) - 5 + 1
    return self.operation.operands[1 + _ods_variadic_group_length - 1]

  @builtins.property
  def positions(self) -> _ods_ir.Value[_ods_ir.MemRefType]:
    _ods_variadic_group_length = len(self.operation.operands) - 5 + 1
    return self.operation.operands[2 + _ods_variadic_group_length - 1]

  @builtins.property
  def coordinates(self) -> _ods_ir.Value[_ods_ir.MemRefType]:
    _ods_variadic_group_length = len(self.operation.operands) - 5 + 1
    return self.operation.operands[3 + _ods_variadic_group_length - 1]

  @builtins.property
  def values(self) -> _ods_ir.Value[_ods_ir.MemRefType]:
    _ods_variadic_group_length = len(self.operation.operands) - 5 + 1
    return self.operation.operands[4 + _ods_variadic_group_length - 1]

  @builtins.property
  def asyncToken(self) -> _Optional[_ods_ir.OpResult]:
    # Optional result: absent when the op was built without async form.
    return None if len(self.operation.results) < 1 else self.operation.results[0]

def set_csr_pointers(async_token, async_dependencies, spmat, positions, coordinates, values, *, loc=None, ip=None) -> _Union[_ods_ir.OpResult, _ods_ir.OpResultList, SetCsrPointersOp]:
  """Create a `gpu.set_csr_pointers` op; return its result(s), or the op if it has none."""
  op = SetCsrPointersOp(
      asyncToken=async_token,
      asyncDependencies=async_dependencies,
      spmat=spmat,
      positions=positions,
      coordinates=coordinates,
      values=values,
      loc=loc,
      ip=ip,
  )
  op_results = op.results
  if len(op_results) == 0:
    return op
  if len(op_results) == 1:
    return op_results[0]
  return op_results

@_ods_cext.register_operation(_Dialect)
class SetDefaultDeviceOp(_ods_ir.OpView):
  r"""
  Operation that sets the current default GPU, using a zero-based index
  into the set of GPUs on the system. The default GPU setting may be
  thread-local.
  """

  OPERATION_NAME = "gpu.set_default_device"

  _ODS_REGIONS = (0, True)

  def __init__(self, devIndex, *, loc=None, ip=None):
    # Generated ODS builder: a single fixed operand, no attributes, no results.
    operands = []
    attributes = {}
    regions = None
    operands.append(devIndex)
    _ods_context = _ods_get_default_loc_context(loc)
    results = []
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def devIndex(self) -> _ods_ir.Value[_ods_ir.IntegerType]:
    # Sole operand: zero-based device index.
    return self.operation.operands[0]

def set_default_device(dev_index, *, loc=None, ip=None) -> SetDefaultDeviceOp:
  """Create a `gpu.set_default_device` op and return the op itself (it has no results)."""
  op = SetDefaultDeviceOp(devIndex=dev_index, loc=loc, ip=ip)
  return op

@_ods_cext.register_operation(_Dialect)
class ShuffleOp(_ods_ir.OpView):
  r"""
  The "shuffle" op moves values across lanes in a subgroup (a.k.a., local
  invocation) within the same subgroup. The `width` argument specifies the
  number of lanes that participate in the shuffle, and must be uniform
  across all lanes. Further, the first `width` lanes of the subgroup must
  be active.

  The intepretation of the `offset` arguments depends on the selected
  `mode`.

  Returns the `shuffleResult` and `true` if the current lane id is smaller
  than `width`, and an unspecified value and `false` otherwise.

  `xor` example:

  ```mlir
  %1, %2 = gpu.shuffle xor %0, %offset, %width : f32
  ```

  For lane `k`, returns the value `%0` from lane `k ^ offset`. Every lane
  trades value with exactly one other lane.

  `down` example:

  ```mlir
  %cst1 = arith.constant 1 : i32
  %3, %4 = gpu.shuffle down %0, %cst1, %width : f32
  ```

  For lane `k`, returns the value from lane `(k + cst1)`. If `(k + cst1)` is
  bigger than or equal to `width`, the value is poison and `valid` is `false`.

  `up` example:

  ```mlir
  %cst1 = arith.constant 1 : i32
  %5, %6 = gpu.shuffle up %0, %cst1, %width : f32
  ```

  For lane `k`, returns the value from lane `(k - cst1)`. If `(k - cst1)` is
  smaller than `0`, the value is poison and `valid` is `false`.

  `idx` example:

  ```mlir
  %cst0 = arith.constant 0 : i32
  %7, %8 = gpu.shuffle idx %0, %cst0, %width : f32
  ```

  Broadcasts the value from lane 0 to all lanes.
  """

  OPERATION_NAME = "gpu.shuffle"

  _ODS_REGIONS = (0, True)

  def __init__(self, value, offset, width, mode, *, results=None, loc=None, ip=None):
    # Generated ODS builder: three fixed operands plus a mandatory `mode`
    # attribute; result types are supplied by the caller via `results`.
    operands = []
    attributes = {}
    regions = None
    operands.append(value)
    operands.append(offset)
    operands.append(width)
    _ods_context = _ods_get_default_loc_context(loc)
    # `mode` passes through unchanged when already an MLIR Attribute;
    # otherwise the registered AttrBuilder converts the raw Python value.
    attributes["mode"] = (mode if (
    isinstance(mode, _ods_ir.Attribute) or
    not _ods_ir.AttrBuilder.contains('GPU_ShuffleModeAttr')) else
      _ods_ir.AttrBuilder.get('GPU_ShuffleModeAttr')(mode, context=_ods_context))
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def value(self) -> _ods_ir.Value:
    return self.operation.operands[0]

  @builtins.property
  def offset(self) -> _ods_ir.Value[_ods_ir.IntegerType]:
    return self.operation.operands[1]

  @builtins.property
  def width(self) -> _ods_ir.Value[_ods_ir.IntegerType]:
    return self.operation.operands[2]

  @builtins.property
  def mode(self) -> _ods_ir.Attribute:
    return self.operation.attributes["mode"]

  @mode.setter
  def mode(self, value: _ods_ir.Attribute):
    # Mandatory attribute: it may be replaced but never removed/None-ed.
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["mode"] = value

  @builtins.property
  def shuffleResult(self) -> _ods_ir.OpResult:
    # First result: the shuffled value.
    return self.operation.results[0]

  @builtins.property
  def valid(self) -> _ods_ir.OpResult[_ods_ir.IntegerType]:
    # Second result: integer validity flag.
    return self.operation.results[1]

def shuffle(value, offset, width, mode, *, results=None, loc=None, ip=None) -> _ods_ir.OpResultList:
  """Create a `gpu.shuffle` operation and return its result list."""
  op = ShuffleOp(value=value, offset=offset, width=width, mode=mode, results=results, loc=loc, ip=ip)
  return op.results

@_ods_cext.register_operation(_Dialect)
class SpGEMMCopyOp(_ods_ir.OpView):
  r"""
  The `gpu.spgemm_copy` operation copies the sparse matrix result of
  a SpGEMM computation.

  If the `async` keyword is present, the op is executed asynchronously (i.e.
  it does not block until the execution has finished on the device). In
  that case, it returns a `!gpu.async.token` in addition to the environment.

  Example:

  ```mlir
  gpu.spgemm_copy %spmatA, %spmatB, %spmatC, %spgemmDesc: f32
  ```

  The matrix arguments can also be associated with one of the following
  operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default value
  is NON_TRANSPOSE.
  """

  OPERATION_NAME = "gpu.spgemm_copy"

  _ODS_REGIONS = (0, True)

  def __init__(self, asyncToken, asyncDependencies, desc, spmatA, spmatB, spmatC, computeType, *, modeA=None, modeB=None, loc=None, ip=None):
    # Generated ODS builder: variadic async dependencies first, then the
    # four fixed operands (desc, spmatA, spmatB, spmatC).
    operands = []
    attributes = {}
    regions = None
    operands.extend(_get_op_results_or_values(asyncDependencies))
    operands.append(desc)
    operands.append(spmatA)
    operands.append(spmatB)
    operands.append(spmatC)
    _ods_context = _ods_get_default_loc_context(loc)
    # Attribute args pass through unchanged when already MLIR Attributes;
    # otherwise the registered AttrBuilder converts the raw Python value.
    if modeA is not None: attributes["modeA"] = (modeA if (
        isinstance(modeA, _ods_ir.Attribute) or
        not _ods_ir.AttrBuilder.contains('GPU_TransposeModeAttr')) else
          _ods_ir.AttrBuilder.get('GPU_TransposeModeAttr')(modeA, context=_ods_context))
    if modeB is not None: attributes["modeB"] = (modeB if (
        isinstance(modeB, _ods_ir.Attribute) or
        not _ods_ir.AttrBuilder.contains('GPU_TransposeModeAttr')) else
          _ods_ir.AttrBuilder.get('GPU_TransposeModeAttr')(modeB, context=_ods_context))
    attributes["computeType"] = (computeType if (
    isinstance(computeType, _ods_ir.Attribute) or
    not _ods_ir.AttrBuilder.contains('TypeAttr')) else
      _ods_ir.AttrBuilder.get('TypeAttr')(computeType, context=_ods_context))
    results = []
    # The async token result is only present for the async form of the op.
    if asyncToken is not None: results.append(asyncToken)
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def asyncDependencies(self) -> _ods_ir.OpOperandList:
    # Single variadic group: it absorbs every operand beyond the 4 fixed ones
    # (5 operand groups total).
    _ods_variadic_group_length = len(self.operation.operands) - 5 + 1
    return self.operation.operands[0:0 + _ods_variadic_group_length]

  @builtins.property
  def desc(self) -> _ods_ir.Value:
    # Fixed operand; its index is shifted past the variadic group.
    _ods_variadic_group_length = len(self.operation.operands) - 5 + 1
    return self.operation.operands[1 + _ods_variadic_group_length - 1]

  @builtins.property
  def spmatA(self) -> _ods_ir.Value:
    _ods_variadic_group_length = len(self.operation.operands) - 5 + 1
    return self.operation.operands[2 + _ods_variadic_group_length - 1]

  @builtins.property
  def spmatB(self) -> _ods_ir.Value:
    _ods_variadic_group_length = len(self.operation.operands) - 5 + 1
    return self.operation.operands[3 + _ods_variadic_group_length - 1]

  @builtins.property
  def spmatC(self) -> _ods_ir.Value:
    _ods_variadic_group_length = len(self.operation.operands) - 5 + 1
    return self.operation.operands[4 + _ods_variadic_group_length - 1]

  @builtins.property
  def modeA(self) -> _ods_ir.Attribute:
    return self.operation.attributes["modeA"]

  @modeA.setter
  def modeA(self, value: _ods_ir.Attribute):
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["modeA"] = value

  @builtins.property
  def modeB(self) -> _ods_ir.Attribute:
    return self.operation.attributes["modeB"]

  @modeB.setter
  def modeB(self, value: _ods_ir.Attribute):
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["modeB"] = value

  @builtins.property
  def computeType(self) -> _ods_ir.TypeAttr:
    return self.operation.attributes["computeType"]

  @computeType.setter
  def computeType(self, value: _ods_ir.TypeAttr):
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["computeType"] = value

  @builtins.property
  def asyncToken(self) -> _Optional[_ods_ir.OpResult]:
    # Optional result: absent when the op was built without async form.
    return None if len(self.operation.results) < 1 else self.operation.results[0]

def spgemm_copy(async_token, async_dependencies, desc, spmat_a, spmat_b, spmat_c, compute_type, *, mode_a=None, mode_b=None, loc=None, ip=None) -> _Union[_ods_ir.OpResult, _ods_ir.OpResultList, SpGEMMCopyOp]:
  """Create a `gpu.spgemm_copy` op; return its result(s), or the op if it has none."""
  op = SpGEMMCopyOp(
      asyncToken=async_token,
      asyncDependencies=async_dependencies,
      desc=desc,
      spmatA=spmat_a,
      spmatB=spmat_b,
      spmatC=spmat_c,
      computeType=compute_type,
      modeA=mode_a,
      modeB=mode_b,
      loc=loc,
      ip=ip,
  )
  op_results = op.results
  if len(op_results) == 0:
    return op
  if len(op_results) == 1:
    return op_results[0]
  return op_results

@_ods_cext.register_operation(_Dialect)
class SpGEMMCreateDescrOp(_ods_ir.OpView):
  r"""
  The `gpu.spgemm_create_descr` creates a descriptor for the SpGEMM operation.
  The descriptor describes the SpGEMM operation and stores the internal data
  throughout the computation. It needs to be passed as an argument to
  spgemm_* operations.

  If the `async` keyword is present, the op is executed asynchronously (i.e.
  it does not block until the execution has finished on the device). In
  that case, it returns a `!gpu.async.token` in addition to the environment.

  Example:

  ```mlir
  %desc, %token = gpu.spgemm_create_descr async [%dep]
  ```
  """

  OPERATION_NAME = "gpu.spgemm_create_descr"

  _ODS_REGIONS = (0, True)

  def __init__(self, desc, asyncToken, asyncDependencies, *, loc=None, ip=None):
    # Generated ODS builder: the only operands are the variadic async
    # dependencies; results are the descriptor and the optional token.
    operands = []
    attributes = {}
    regions = None
    operands.extend(_get_op_results_or_values(asyncDependencies))
    _ods_context = _ods_get_default_loc_context(loc)
    results = []
    results.append(desc)
    # The async token result is only present for the async form of the op.
    if asyncToken is not None: results.append(asyncToken)
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def asyncDependencies(self) -> _ods_ir.OpOperandList:
    # Single variadic group: it spans all operands (1 operand group total).
    _ods_variadic_group_length = len(self.operation.operands) - 1 + 1
    return self.operation.operands[0:0 + _ods_variadic_group_length]

  @builtins.property
  def desc(self) -> _ods_ir.OpResult:
    return self.operation.results[0]

  @builtins.property
  def asyncToken(self) -> _Optional[_ods_ir.OpResult]:
    # Optional result: absent when the op was built without async form.
    return None if len(self.operation.results) < 2 else self.operation.results[1]

def spgemm_create_descr(desc, async_token, async_dependencies, *, loc=None, ip=None) -> _Union[_ods_ir.OpResult, _ods_ir.OpResultList, SpGEMMCreateDescrOp]:
  """Create a `gpu.spgemm_create_descr` op; return its result(s), or the op if it has none."""
  op = SpGEMMCreateDescrOp(
      desc=desc,
      asyncToken=async_token,
      asyncDependencies=async_dependencies,
      loc=loc,
      ip=ip,
  )
  op_results = op.results
  if len(op_results) == 0:
    return op
  if len(op_results) == 1:
    return op_results[0]
  return op_results

@_ods_cext.register_operation(_Dialect)
class SpGEMMDestroyDescrOp(_ods_ir.OpView):
  r"""
  The `gpu.spgemm_destroy_descr` destroys the SpGEMM operation descriptor.

  If the `async` keyword is present, the op is executed asynchronously (i.e.
  it does not block until the execution has finished on the device). In
  that case, it returns a `!gpu.async.token` in addition to the environment.

  Example:

  ```mlir
  %token = gpu.spgemm_destroy_descr async [%dep] %desc
  ```
  """

  OPERATION_NAME = "gpu.spgemm_destroy_descr"

  _ODS_REGIONS = (0, True)

  def __init__(self, asyncToken, asyncDependencies, desc, *, loc=None, ip=None):
    # Generated ODS builder: variadic async dependencies first, then the
    # single fixed operand (desc).
    operands = []
    attributes = {}
    regions = None
    operands.extend(_get_op_results_or_values(asyncDependencies))
    operands.append(desc)
    _ods_context = _ods_get_default_loc_context(loc)
    results = []
    # The async token result is only present for the async form of the op.
    if asyncToken is not None: results.append(asyncToken)
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def asyncDependencies(self) -> _ods_ir.OpOperandList:
    # Single variadic group: it absorbs every operand beyond the 1 fixed one
    # (2 operand groups total).
    _ods_variadic_group_length = len(self.operation.operands) - 2 + 1
    return self.operation.operands[0:0 + _ods_variadic_group_length]

  @builtins.property
  def desc(self) -> _ods_ir.Value:
    # Fixed operand; its index is shifted past the variadic group.
    _ods_variadic_group_length = len(self.operation.operands) - 2 + 1
    return self.operation.operands[1 + _ods_variadic_group_length - 1]

  @builtins.property
  def asyncToken(self) -> _Optional[_ods_ir.OpResult]:
    # Optional result: absent when the op was built without async form.
    return None if len(self.operation.results) < 1 else self.operation.results[0]

def spgemm_destroy_descr(async_token, async_dependencies, desc, *, loc=None, ip=None) -> _Union[_ods_ir.OpResult, _ods_ir.OpResultList, SpGEMMDestroyDescrOp]:
  """Create a `gpu.spgemm_destroy_descr` op; return its result(s), or the op if it has none."""
  op = SpGEMMDestroyDescrOp(
      asyncToken=async_token,
      asyncDependencies=async_dependencies,
      desc=desc,
      loc=loc,
      ip=ip,
  )
  op_results = op.results
  if len(op_results) == 0:
    return op
  if len(op_results) == 1:
    return op_results[0]
  return op_results

@_ods_cext.register_operation(_Dialect)
class SpGEMMWorkEstimationOrComputeOp(_ods_ir.OpView):
  r"""
  The `gpu.spgemm_work_estimation_or_compute` is used to call
  cusparseSpGEMM_workEstimation or cusparseSpGEMM_compute. Both of them are
  for both determining the buffer size and performing the actual computation.
  The operation expects handles returned by previous sparse operations to
  construct an environment and the operands for SpGEMM.
  The buffer must have been allocated on the device.

  C' = alpha * op(A) * op(B) + beta * C

  If the `async` keyword is present, the op is executed asynchronously (i.e.
  it does not block until the execution has finished on the device). In
  that case, it returns a `!gpu.async.token` in addition to the environment.

  Example:

  ```mlir
  %bufferSz, %token = gpu.spgemm_work_estimation_or_compute async [%dep] {COMPUTE}
                        %desc, %spmatA{NON_TRANSPOSE}, %spmatB{NON_TRANSPOSE},
                        %spmatC, %spgemmDesc, %c0, %alloc: f32 into
                        memref<0xi8>
  ```

  The matrix arguments can also be associated with one of the following
  operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default value
  is NON_TRANSPOSE.
  """

  OPERATION_NAME = "gpu.spgemm_work_estimation_or_compute"

  _ODS_REGIONS = (0, True)

  def __init__(self, bufferSzNew, asyncToken, asyncDependencies, desc, spmatA, spmatB, spmatC, computeType, bufferSz, buffer, kind, *, modeA=None, modeB=None, loc=None, ip=None):
    # Generated ODS builder: variadic async dependencies first, then the six
    # fixed operands (desc, spmatA, spmatB, spmatC, bufferSz, buffer).
    operands = []
    attributes = {}
    regions = None
    operands.extend(_get_op_results_or_values(asyncDependencies))
    operands.append(desc)
    operands.append(spmatA)
    operands.append(spmatB)
    operands.append(spmatC)
    operands.append(bufferSz)
    operands.append(buffer)
    _ods_context = _ods_get_default_loc_context(loc)
    # Attribute args pass through unchanged when already MLIR Attributes;
    # otherwise the registered AttrBuilder converts the raw Python value.
    if modeA is not None: attributes["modeA"] = (modeA if (
        isinstance(modeA, _ods_ir.Attribute) or
        not _ods_ir.AttrBuilder.contains('GPU_TransposeModeAttr')) else
          _ods_ir.AttrBuilder.get('GPU_TransposeModeAttr')(modeA, context=_ods_context))
    if modeB is not None: attributes["modeB"] = (modeB if (
        isinstance(modeB, _ods_ir.Attribute) or
        not _ods_ir.AttrBuilder.contains('GPU_TransposeModeAttr')) else
          _ods_ir.AttrBuilder.get('GPU_TransposeModeAttr')(modeB, context=_ods_context))
    attributes["computeType"] = (computeType if (
    isinstance(computeType, _ods_ir.Attribute) or
    not _ods_ir.AttrBuilder.contains('TypeAttr')) else
      _ods_ir.AttrBuilder.get('TypeAttr')(computeType, context=_ods_context))
    attributes["kind"] = (kind if (
    isinstance(kind, _ods_ir.Attribute) or
    not _ods_ir.AttrBuilder.contains('GPU_SpGEMMWorkEstimationOrComputeKindAttr')) else
      _ods_ir.AttrBuilder.get('GPU_SpGEMMWorkEstimationOrComputeKindAttr')(kind, context=_ods_context))
    results = []
    results.append(bufferSzNew)
    # The async token result is only present for the async form of the op.
    if asyncToken is not None: results.append(asyncToken)
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def asyncDependencies(self) -> _ods_ir.OpOperandList:
    # Single variadic group: it absorbs every operand beyond the 6 fixed ones
    # (7 operand groups total).
    _ods_variadic_group_length = len(self.operation.operands) - 7 + 1
    return self.operation.operands[0:0 + _ods_variadic_group_length]

  @builtins.property
  def desc(self) -> _ods_ir.Value:
    # Fixed operand; its index is shifted past the variadic group.
    _ods_variadic_group_length = len(self.operation.operands) - 7 + 1
    return self.operation.operands[1 + _ods_variadic_group_length - 1]

  @builtins.property
  def spmatA(self) -> _ods_ir.Value:
    _ods_variadic_group_length = len(self.operation.operands) - 7 + 1
    return self.operation.operands[2 + _ods_variadic_group_length - 1]

  @builtins.property
  def spmatB(self) -> _ods_ir.Value:
    _ods_variadic_group_length = len(self.operation.operands) - 7 + 1
    return self.operation.operands[3 + _ods_variadic_group_length - 1]

  @builtins.property
  def spmatC(self) -> _ods_ir.Value:
    _ods_variadic_group_length = len(self.operation.operands) - 7 + 1
    return self.operation.operands[4 + _ods_variadic_group_length - 1]

  @builtins.property
  def bufferSz(self) -> _ods_ir.Value[_ods_ir.IndexType]:
    _ods_variadic_group_length = len(self.operation.operands) - 7 + 1
    return self.operation.operands[5 + _ods_variadic_group_length - 1]

  @builtins.property
  def buffer(self) -> _ods_ir.Value[_ods_ir.MemRefType]:
    _ods_variadic_group_length = len(self.operation.operands) - 7 + 1
    return self.operation.operands[6 + _ods_variadic_group_length - 1]

  @builtins.property
  def modeA(self) -> _ods_ir.Attribute:
    return self.operation.attributes["modeA"]

  @modeA.setter
  def modeA(self, value: _ods_ir.Attribute):
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["modeA"] = value

  @builtins.property
  def modeB(self) -> _ods_ir.Attribute:
    return self.operation.attributes["modeB"]

  @modeB.setter
  def modeB(self, value: _ods_ir.Attribute):
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["modeB"] = value

  @builtins.property
  def computeType(self) -> _ods_ir.TypeAttr:
    return self.operation.attributes["computeType"]

  @computeType.setter
  def computeType(self, value: _ods_ir.TypeAttr):
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["computeType"] = value

  @builtins.property
  def kind(self) -> _ods_ir.Attribute:
    return self.operation.attributes["kind"]

  @kind.setter
  def kind(self, value: _ods_ir.Attribute):
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["kind"] = value

  @builtins.property
  def bufferSzNew(self) -> _ods_ir.OpResult[_ods_ir.IndexType]:
    return self.operation.results[0]

  @builtins.property
  def asyncToken(self) -> _Optional[_ods_ir.OpResult]:
    # Optional result: absent when the op was built without async form.
    return None if len(self.operation.results) < 2 else self.operation.results[1]

def spgemm_work_estimation_or_compute(buffer_sz_new, async_token, async_dependencies, desc, spmat_a, spmat_b, spmat_c, compute_type, buffer_sz, buffer, kind, *, mode_a=None, mode_b=None, loc=None, ip=None) -> _Union[_ods_ir.OpResult, _ods_ir.OpResultList, SpGEMMWorkEstimationOrComputeOp]:
  """Create a `gpu.spgemm_work_estimation_or_compute` op.

  Returns the result list when the op has several results, the single
  result when it has exactly one, and the op itself when it has none.
  """
  op = SpGEMMWorkEstimationOrComputeOp(
      bufferSzNew=buffer_sz_new,
      asyncToken=async_token,
      asyncDependencies=async_dependencies,
      desc=desc,
      spmatA=spmat_a,
      spmatB=spmat_b,
      spmatC=spmat_c,
      computeType=compute_type,
      bufferSz=buffer_sz,
      buffer=buffer,
      kind=kind,
      modeA=mode_a,
      modeB=mode_b,
      loc=loc,
      ip=ip)
  results = op.results
  if len(results) > 1:
    return results
  if len(results) == 1:
    return results[0]
  return op

@_ods_cext.register_operation(_Dialect)
class SpMMBufferSizeOp(_ods_ir.OpView):
  r"""
  The `gpu.spmm_buffer_size` operation returns the buffer size required
  to perform the SpMM operation on the given sparse and dense matrix.
  The operation expects handles returned by previous sparse operations
  to construct an environment and the operands for SpMM.
  
  If the `async` keyword is present, the op is executed asynchronously (i.e.
  it does not block until the execution has finished on the device). In
  that case, it returns a !gpu.async.token in addition to the environment.
  
  The matrix arguments can also be associated with one of the following
  operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default value
  is NON_TRANSPOSE.
  
  Example:
  
  ```mlir
  %bufferszs, %token = gpu.spmm_buffer_size async [%dep] %spmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %dnmatC : i64 into f32
  ```
  """

  OPERATION_NAME = "gpu.spmm_buffer_size"

  # Result segments: variadic `bufferSzs` (-1), then optional `asyncToken` (0).
  _ODS_RESULT_SEGMENTS = [-1,0,]

  _ODS_REGIONS = (0, True)

  def __init__(self, bufferSzs, asyncToken, asyncDependencies, spmatA, dnmatB, dnmatC, computeType, *, modeA=None, modeB=None, loc=None, ip=None):
    """Build a `gpu.spmm_buffer_size` op.

    `bufferSzs` is the variadic list of result types and `asyncToken` the
    optional async-token result type (None for the synchronous form); the
    remaining positional arguments are operand values, with `computeType`
    a TypeAttr or a raw value convertible to one.
    """
    operands = []
    attributes = {}
    regions = None
    # Leading variadic dependency group, then the three fixed handles.
    operands.extend(_get_op_results_or_values(asyncDependencies))
    operands.append(spmatA)
    operands.append(dnmatB)
    operands.append(dnmatC)
    _ods_context = _ods_get_default_loc_context(loc)
    # Optional transpose modes: pass through if already an Attribute (or no
    # registered builder exists), otherwise build via the registered builder.
    if modeA is not None: attributes["modeA"] = (modeA if (
        isinstance(modeA, _ods_ir.Attribute) or
        not _ods_ir.AttrBuilder.contains('GPU_TransposeModeAttr')) else
          _ods_ir.AttrBuilder.get('GPU_TransposeModeAttr')(modeA, context=_ods_context))
    if modeB is not None: attributes["modeB"] = (modeB if (
        isinstance(modeB, _ods_ir.Attribute) or
        not _ods_ir.AttrBuilder.contains('GPU_TransposeModeAttr')) else
          _ods_ir.AttrBuilder.get('GPU_TransposeModeAttr')(modeB, context=_ods_context))
    attributes["computeType"] = (computeType if (
    isinstance(computeType, _ods_ir.Attribute) or
    not _ods_ir.AttrBuilder.contains('TypeAttr')) else
      _ods_ir.AttrBuilder.get('TypeAttr')(computeType, context=_ods_context))
    results = []
    # Variadic result segment first, then the optional async token.
    results.append(bufferSzs)
    if asyncToken is not None: results.append(asyncToken)
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def asyncDependencies(self) -> _ods_ir.OpOperandList:
    # Length of the leading variadic group: total operands minus the 3 fixed.
    _ods_variadic_group_length = len(self.operation.operands) - 4 + 1
    return self.operation.operands[0:0 + _ods_variadic_group_length]

  @builtins.property
  def spmatA(self) -> _ods_ir.Value:
    _ods_variadic_group_length = len(self.operation.operands) - 4 + 1
    return self.operation.operands[1 + _ods_variadic_group_length - 1]

  @builtins.property
  def dnmatB(self) -> _ods_ir.Value:
    _ods_variadic_group_length = len(self.operation.operands) - 4 + 1
    return self.operation.operands[2 + _ods_variadic_group_length - 1]

  @builtins.property
  def dnmatC(self) -> _ods_ir.Value:
    _ods_variadic_group_length = len(self.operation.operands) - 4 + 1
    return self.operation.operands[3 + _ods_variadic_group_length - 1]

  @builtins.property
  def modeA(self) -> _ods_ir.Attribute:
    return self.operation.attributes["modeA"]

  @modeA.setter
  def modeA(self, value: _ods_ir.Attribute):
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["modeA"] = value

  @builtins.property
  def modeB(self) -> _ods_ir.Attribute:
    return self.operation.attributes["modeB"]

  @modeB.setter
  def modeB(self, value: _ods_ir.Attribute):
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["modeB"] = value

  @builtins.property
  def computeType(self) -> _ods_ir.TypeAttr:
    return self.operation.attributes["computeType"]

  @computeType.setter
  def computeType(self, value: _ods_ir.TypeAttr):
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["computeType"] = value

  @builtins.property
  def bufferSzs(self) -> _ods_ir.OpResultList:
    # Segment 0 of the segmented results: the variadic buffer sizes.
    result_range = _ods_segmented_accessor(
         self.operation.results,
         self.operation.attributes["resultSegmentSizes"], 0)
    return result_range

  @builtins.property
  def asyncToken(self) -> _Optional[_ods_ir.OpResult]:
    # Segment 1 is empty in the synchronous form, hence the None fallback.
    result_range = _ods_segmented_accessor(
         self.operation.results,
         self.operation.attributes["resultSegmentSizes"], 1)
    return result_range[0] if len(result_range) > 0 else None

def spmm_buffer_size(buffer_szs, async_token, async_dependencies, spmat_a, dnmat_b, dnmat_c, compute_type, *, mode_a=None, mode_b=None, loc=None, ip=None) -> _Union[_ods_ir.OpResult, _ods_ir.OpResultList, SpMMBufferSizeOp]:
  """Create a `gpu.spmm_buffer_size` op.

  Returns the result list when the op has several results, the single
  result when it has exactly one, and the op itself when it has none.
  """
  op = SpMMBufferSizeOp(
      bufferSzs=buffer_szs,
      asyncToken=async_token,
      asyncDependencies=async_dependencies,
      spmatA=spmat_a,
      dnmatB=dnmat_b,
      dnmatC=dnmat_c,
      computeType=compute_type,
      modeA=mode_a,
      modeB=mode_b,
      loc=loc,
      ip=ip)
  results = op.results
  if len(results) > 1:
    return results
  if len(results) == 1:
    return results[0]
  return op

@_ods_cext.register_operation(_Dialect)
class SpMMOp(_ods_ir.OpView):
  r"""
  The `gpu.spmm` operation performs the SpMM operation on the given sparse and
  dense matrix, and buffer.  The operation expects handles returned by previous
  sparse operations to construct an environment and the operands for SpMM. The
  buffer must have been allocated on the device.
  
  If the `async` keyword is present, the op is executed asynchronously (i.e.
  it does not block until the execution has finished on the device). In
  that case, it returns a !gpu.async.token in addition to the environment.
  
  The matrix arguments can also be associated with one of the following
  operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default value
  is NON_TRANSPOSE.
  
  Example:
  
  ```mlir
  %token = gpu.spmm async [%dep] %spmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %dnmatC, %buffers : type($buffers) into f32
  ```
  """

  OPERATION_NAME = "gpu.spmm"

  # Operand segments: variadic asyncDependencies, three single handles
  # (spmatA, dnmatB, dnmatC), then variadic buffers.
  _ODS_OPERAND_SEGMENTS = [-1,1,1,1,-1,]

  _ODS_REGIONS = (0, True)

  def __init__(self, asyncToken, asyncDependencies, spmatA, dnmatB, dnmatC, computeType, buffers, *, modeA=None, modeB=None, loc=None, ip=None):
    """Build a `gpu.spmm` op.

    `asyncToken` is the optional async-token result type (None for the
    synchronous form); the remaining positional arguments are operand
    values, with `computeType` a TypeAttr or a raw value convertible to one.
    """
    operands = []
    attributes = {}
    regions = None
    # With operand segments, each variadic group is appended as a nested
    # list (not extended), so segment sizes can be derived per group.
    operands.append(_get_op_results_or_values(asyncDependencies))
    operands.append(spmatA)
    operands.append(dnmatB)
    operands.append(dnmatC)
    operands.append(_get_op_results_or_values(buffers))
    _ods_context = _ods_get_default_loc_context(loc)
    # Optional transpose modes: pass through if already an Attribute (or no
    # registered builder exists), otherwise build via the registered builder.
    if modeA is not None: attributes["modeA"] = (modeA if (
        isinstance(modeA, _ods_ir.Attribute) or
        not _ods_ir.AttrBuilder.contains('GPU_TransposeModeAttr')) else
          _ods_ir.AttrBuilder.get('GPU_TransposeModeAttr')(modeA, context=_ods_context))
    if modeB is not None: attributes["modeB"] = (modeB if (
        isinstance(modeB, _ods_ir.Attribute) or
        not _ods_ir.AttrBuilder.contains('GPU_TransposeModeAttr')) else
          _ods_ir.AttrBuilder.get('GPU_TransposeModeAttr')(modeB, context=_ods_context))
    attributes["computeType"] = (computeType if (
    isinstance(computeType, _ods_ir.Attribute) or
    not _ods_ir.AttrBuilder.contains('TypeAttr')) else
      _ods_ir.AttrBuilder.get('TypeAttr')(computeType, context=_ods_context))
    results = []
    if asyncToken is not None: results.append(asyncToken)
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def asyncDependencies(self) -> _ods_ir.OpOperandList:
    # Operand segment 0: the variadic async dependencies.
    operand_range = _ods_segmented_accessor(
         self.operation.operands,
         self.operation.attributes["operandSegmentSizes"], 0)
    return operand_range

  @builtins.property
  def spmatA(self) -> _ods_ir.Value:
    operand_range = _ods_segmented_accessor(
         self.operation.operands,
         self.operation.attributes["operandSegmentSizes"], 1)
    return operand_range[0]

  @builtins.property
  def dnmatB(self) -> _ods_ir.Value:
    operand_range = _ods_segmented_accessor(
         self.operation.operands,
         self.operation.attributes["operandSegmentSizes"], 2)
    return operand_range[0]

  @builtins.property
  def dnmatC(self) -> _ods_ir.Value:
    operand_range = _ods_segmented_accessor(
         self.operation.operands,
         self.operation.attributes["operandSegmentSizes"], 3)
    return operand_range[0]

  @builtins.property
  def buffers(self) -> _ods_ir.OpOperandList:
    # Operand segment 4: the variadic device buffers.
    operand_range = _ods_segmented_accessor(
         self.operation.operands,
         self.operation.attributes["operandSegmentSizes"], 4)
    return operand_range

  @builtins.property
  def modeA(self) -> _ods_ir.Attribute:
    return self.operation.attributes["modeA"]

  @modeA.setter
  def modeA(self, value: _ods_ir.Attribute):
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["modeA"] = value

  @builtins.property
  def modeB(self) -> _ods_ir.Attribute:
    return self.operation.attributes["modeB"]

  @modeB.setter
  def modeB(self, value: _ods_ir.Attribute):
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["modeB"] = value

  @builtins.property
  def computeType(self) -> _ods_ir.TypeAttr:
    return self.operation.attributes["computeType"]

  @computeType.setter
  def computeType(self, value: _ods_ir.TypeAttr):
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["computeType"] = value

  @builtins.property
  def asyncToken(self) -> _Optional[_ods_ir.OpResult]:
    # Only result, present solely in the async form.
    return None if len(self.operation.results) < 1 else self.operation.results[0]

def spmm(async_token, async_dependencies, spmat_a, dnmat_b, dnmat_c, compute_type, buffers, *, mode_a=None, mode_b=None, loc=None, ip=None) -> _Union[_ods_ir.OpResult, _ods_ir.OpResultList, SpMMOp]:
  """Create a `gpu.spmm` op.

  Returns the result list when the op has several results, the single
  result when it has exactly one, and the op itself when it has none.
  """
  op = SpMMOp(
      asyncToken=async_token,
      asyncDependencies=async_dependencies,
      spmatA=spmat_a,
      dnmatB=dnmat_b,
      dnmatC=dnmat_c,
      computeType=compute_type,
      buffers=buffers,
      modeA=mode_a,
      modeB=mode_b,
      loc=loc,
      ip=ip)
  results = op.results
  if len(results) > 1:
    return results
  if len(results) == 1:
    return results[0]
  return op

@_ods_cext.register_operation(_Dialect)
class SpMVBufferSizeOp(_ods_ir.OpView):
  r"""
  The `gpu.spmv_buffer_size` operation returns the buffer size required
  to perform the SpMV operation on the given sparse matrix and dense vectors.
  The operation expects handles returned by previous sparse operations
  to construct an environment and the operands for SpMV.
  
  If the `async` keyword is present, the op is executed asynchronously (i.e.
  it does not block until the execution has finished on the device). In
  that case, it returns a !gpu.async.token in addition to the environment.
  
  The matrix arguments can also be associated with one of the following
  operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default value
  is NON_TRANSPOSE.
  
  Example:
  
  ```mlir
  %buffersz, %token = gpu.spmv_buffer_size async [%dep] %spmatA{TRANSPOSE}, %dnX, %dnY into f32
  ```
  """

  OPERATION_NAME = "gpu.spmv_buffer_size"

  _ODS_REGIONS = (0, True)

  def __init__(self, bufferSz, asyncToken, asyncDependencies, spmatA, dnX, dnY, computeType, *, modeA=None, loc=None, ip=None):
    """Build a `gpu.spmv_buffer_size` op.

    `bufferSz` is the index result type and `asyncToken` the optional
    async-token result type (None for the synchronous form); the remaining
    positional arguments are operand values, with `computeType` a TypeAttr
    or a raw value convertible to one.
    """
    operands = []
    attributes = {}
    regions = None
    # Leading variadic dependency group, then the three fixed handles.
    operands.extend(_get_op_results_or_values(asyncDependencies))
    operands.append(spmatA)
    operands.append(dnX)
    operands.append(dnY)
    _ods_context = _ods_get_default_loc_context(loc)
    # Optional transpose mode: pass through if already an Attribute (or no
    # registered builder exists), otherwise build via the registered builder.
    if modeA is not None: attributes["modeA"] = (modeA if (
        isinstance(modeA, _ods_ir.Attribute) or
        not _ods_ir.AttrBuilder.contains('GPU_TransposeModeAttr')) else
          _ods_ir.AttrBuilder.get('GPU_TransposeModeAttr')(modeA, context=_ods_context))
    attributes["computeType"] = (computeType if (
    isinstance(computeType, _ods_ir.Attribute) or
    not _ods_ir.AttrBuilder.contains('TypeAttr')) else
      _ods_ir.AttrBuilder.get('TypeAttr')(computeType, context=_ods_context))
    results = []
    results.append(bufferSz)
    if asyncToken is not None: results.append(asyncToken)
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def asyncDependencies(self) -> _ods_ir.OpOperandList:
    # Length of the leading variadic group: total operands minus the 3 fixed.
    _ods_variadic_group_length = len(self.operation.operands) - 4 + 1
    return self.operation.operands[0:0 + _ods_variadic_group_length]

  @builtins.property
  def spmatA(self) -> _ods_ir.Value:
    _ods_variadic_group_length = len(self.operation.operands) - 4 + 1
    return self.operation.operands[1 + _ods_variadic_group_length - 1]

  @builtins.property
  def dnX(self) -> _ods_ir.Value:
    _ods_variadic_group_length = len(self.operation.operands) - 4 + 1
    return self.operation.operands[2 + _ods_variadic_group_length - 1]

  @builtins.property
  def dnY(self) -> _ods_ir.Value:
    _ods_variadic_group_length = len(self.operation.operands) - 4 + 1
    return self.operation.operands[3 + _ods_variadic_group_length - 1]

  @builtins.property
  def modeA(self) -> _ods_ir.Attribute:
    return self.operation.attributes["modeA"]

  @modeA.setter
  def modeA(self, value: _ods_ir.Attribute):
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["modeA"] = value

  @builtins.property
  def computeType(self) -> _ods_ir.TypeAttr:
    return self.operation.attributes["computeType"]

  @computeType.setter
  def computeType(self, value: _ods_ir.TypeAttr):
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["computeType"] = value

  @builtins.property
  def bufferSz(self) -> _ods_ir.OpResult[_ods_ir.IndexType]:
    return self.operation.results[0]

  @builtins.property
  def asyncToken(self) -> _Optional[_ods_ir.OpResult]:
    # Second result, present solely in the async form.
    return None if len(self.operation.results) < 2 else self.operation.results[1]

def spmv_buffer_size(buffer_sz, async_token, async_dependencies, spmat_a, dn_x, dn_y, compute_type, *, mode_a=None, loc=None, ip=None) -> _Union[_ods_ir.OpResult, _ods_ir.OpResultList, SpMVBufferSizeOp]:
  """Create a `gpu.spmv_buffer_size` op.

  Returns the result list when the op has several results, the single
  result when it has exactly one, and the op itself when it has none.
  """
  op = SpMVBufferSizeOp(
      bufferSz=buffer_sz,
      asyncToken=async_token,
      asyncDependencies=async_dependencies,
      spmatA=spmat_a,
      dnX=dn_x,
      dnY=dn_y,
      computeType=compute_type,
      modeA=mode_a,
      loc=loc,
      ip=ip)
  results = op.results
  if len(results) > 1:
    return results
  if len(results) == 1:
    return results[0]
  return op

@_ods_cext.register_operation(_Dialect)
class SpMVOp(_ods_ir.OpView):
  r"""
  The `gpu.spmv` operation performs the SpMV operation on the given sparse matrix,
  dense vectors, and buffer.  The operation expects handles returned by previous
  sparse operations to construct an environment and the operands for SpMV. The
  buffer must have been allocated on the device.
  
  If the `async` keyword is present, the op is executed asynchronously (i.e.
  it does not block until the execution has finished on the device). In
  that case, it returns a !gpu.async.token in addition to the environment.
  
  The matrix arguments can also be associated with one of the following
  operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default value
  is NON_TRANSPOSE.
  
  Example:
  
  ```mlir
  %token = gpu.spmv async [%dep] %spmatA{TRANSPOSE}, %dnX, %dnY : memref<?xf64> into bf16
  ```
  """

  OPERATION_NAME = "gpu.spmv"

  _ODS_REGIONS = (0, True)

  def __init__(self, asyncToken, asyncDependencies, spmatA, dnX, dnY, computeType, buffer, *, modeA=None, loc=None, ip=None):
    """Build a `gpu.spmv` op.

    `asyncToken` is the optional async-token result type (None for the
    synchronous form); the remaining positional arguments are operand
    values, with `computeType` a TypeAttr or a raw value convertible to one.
    """
    operands = []
    attributes = {}
    regions = None
    # Leading variadic dependency group, then the four fixed operands.
    operands.extend(_get_op_results_or_values(asyncDependencies))
    operands.append(spmatA)
    operands.append(dnX)
    operands.append(dnY)
    operands.append(buffer)
    _ods_context = _ods_get_default_loc_context(loc)
    # Optional transpose mode: pass through if already an Attribute (or no
    # registered builder exists), otherwise build via the registered builder.
    if modeA is not None: attributes["modeA"] = (modeA if (
        isinstance(modeA, _ods_ir.Attribute) or
        not _ods_ir.AttrBuilder.contains('GPU_TransposeModeAttr')) else
          _ods_ir.AttrBuilder.get('GPU_TransposeModeAttr')(modeA, context=_ods_context))
    attributes["computeType"] = (computeType if (
    isinstance(computeType, _ods_ir.Attribute) or
    not _ods_ir.AttrBuilder.contains('TypeAttr')) else
      _ods_ir.AttrBuilder.get('TypeAttr')(computeType, context=_ods_context))
    results = []
    if asyncToken is not None: results.append(asyncToken)
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def asyncDependencies(self) -> _ods_ir.OpOperandList:
    # Length of the leading variadic group: total operands minus the 4 fixed.
    _ods_variadic_group_length = len(self.operation.operands) - 5 + 1
    return self.operation.operands[0:0 + _ods_variadic_group_length]

  @builtins.property
  def spmatA(self) -> _ods_ir.Value:
    _ods_variadic_group_length = len(self.operation.operands) - 5 + 1
    return self.operation.operands[1 + _ods_variadic_group_length - 1]

  @builtins.property
  def dnX(self) -> _ods_ir.Value:
    _ods_variadic_group_length = len(self.operation.operands) - 5 + 1
    return self.operation.operands[2 + _ods_variadic_group_length - 1]

  @builtins.property
  def dnY(self) -> _ods_ir.Value:
    _ods_variadic_group_length = len(self.operation.operands) - 5 + 1
    return self.operation.operands[3 + _ods_variadic_group_length - 1]

  @builtins.property
  def buffer(self) -> _ods_ir.Value[_ods_ir.MemRefType]:
    _ods_variadic_group_length = len(self.operation.operands) - 5 + 1
    return self.operation.operands[4 + _ods_variadic_group_length - 1]

  @builtins.property
  def modeA(self) -> _ods_ir.Attribute:
    return self.operation.attributes["modeA"]

  @modeA.setter
  def modeA(self, value: _ods_ir.Attribute):
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["modeA"] = value

  @builtins.property
  def computeType(self) -> _ods_ir.TypeAttr:
    return self.operation.attributes["computeType"]

  @computeType.setter
  def computeType(self, value: _ods_ir.TypeAttr):
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["computeType"] = value

  @builtins.property
  def asyncToken(self) -> _Optional[_ods_ir.OpResult]:
    # Only result, present solely in the async form.
    return None if len(self.operation.results) < 1 else self.operation.results[0]

def spmv(async_token, async_dependencies, spmat_a, dn_x, dn_y, compute_type, buffer, *, mode_a=None, loc=None, ip=None) -> _Union[_ods_ir.OpResult, _ods_ir.OpResultList, SpMVOp]:
  """Create a `gpu.spmv` op.

  Returns the result list when the op has several results, the single
  result when it has exactly one, and the op itself when it has none.
  """
  op = SpMVOp(
      asyncToken=async_token,
      asyncDependencies=async_dependencies,
      spmatA=spmat_a,
      dnX=dn_x,
      dnY=dn_y,
      computeType=compute_type,
      buffer=buffer,
      modeA=mode_a,
      loc=loc,
      ip=ip)
  results = op.results
  if len(results) > 1:
    return results
  if len(results) == 1:
    return results[0]
  return op

@_ods_cext.register_operation(_Dialect)
class SpMatGetSizeOp(_ods_ir.OpView):
  r"""
  The `gpu.spmat_get_size` operation retrieves the number of rows, number of
  columns, and number of non-zero elements of a sparse matrix.
  
  If the `async` keyword is present, the op is executed asynchronously (i.e.
  it does not block until the execution has finished on the device). In
  that case, it returns a `!gpu.async.token` in addition to the environment.
  
  Example:
  
  ```mlir
  %rows, %cols, %nnz, %token = gpu.spmat_get_size async [%dep] %spmatC
  ```
  """

  OPERATION_NAME = "gpu.spmat_get_size"

  _ODS_REGIONS = (0, True)

  def __init__(self, rows, cols, nnz, asyncToken, asyncDependencies, spmat, *, loc=None, ip=None):
    """Build a `gpu.spmat_get_size` op.

    `rows`, `cols` and `nnz` are the three index result types; `asyncToken`
    is the optional async-token result type (None for the synchronous form).
    """
    operands = []
    attributes = {}
    regions = None
    # Leading variadic dependency group, then the sparse-matrix handle.
    operands.extend(_get_op_results_or_values(asyncDependencies))
    operands.append(spmat)
    _ods_context = _ods_get_default_loc_context(loc)
    results = []
    results.append(rows)
    results.append(cols)
    results.append(nnz)
    if asyncToken is not None: results.append(asyncToken)
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def asyncDependencies(self) -> _ods_ir.OpOperandList:
    # Length of the leading variadic group: total operands minus the 1 fixed.
    _ods_variadic_group_length = len(self.operation.operands) - 2 + 1
    return self.operation.operands[0:0 + _ods_variadic_group_length]

  @builtins.property
  def spmat(self) -> _ods_ir.Value:
    _ods_variadic_group_length = len(self.operation.operands) - 2 + 1
    return self.operation.operands[1 + _ods_variadic_group_length - 1]

  @builtins.property
  def rows(self) -> _ods_ir.OpResult[_ods_ir.IndexType]:
    return self.operation.results[0]

  @builtins.property
  def cols(self) -> _ods_ir.OpResult[_ods_ir.IndexType]:
    return self.operation.results[1]

  @builtins.property
  def nnz(self) -> _ods_ir.OpResult[_ods_ir.IndexType]:
    return self.operation.results[2]

  @builtins.property
  def asyncToken(self) -> _Optional[_ods_ir.OpResult]:
    # Fourth result, present solely in the async form.
    return None if len(self.operation.results) < 4 else self.operation.results[3]

def spmat_get_size(rows, cols, nnz, async_token, async_dependencies, spmat, *, loc=None, ip=None) -> _Union[_ods_ir.OpResult, _ods_ir.OpResultList, SpMatGetSizeOp]:
  """Create a `gpu.spmat_get_size` op.

  Returns the result list when the op has several results, the single
  result when it has exactly one, and the op itself when it has none.
  """
  op = SpMatGetSizeOp(
      rows=rows,
      cols=cols,
      nnz=nnz,
      asyncToken=async_token,
      asyncDependencies=async_dependencies,
      spmat=spmat,
      loc=loc,
      ip=ip)
  results = op.results
  if len(results) > 1:
    return results
  if len(results) == 1:
    return results[0]
  return op

@_ods_cext.register_operation(_Dialect)
class SubgroupBroadcastOp(_ods_ir.OpView):
  r"""
  Broadcasts a value from one lane to all active lanes in a subgroup. The
  result is guaranteed to be uniform across the active lanes in subgroup.
  
  The possible broadcast types are:
  
  * `first_active_lane` - broadcasts the value from the first active lane
  in the subgroup.
  * `specific_lane` - broadcasts from the specified lane. The lane index
  must be uniform and within the subgroup size. The result is poison if the
  lane index is invalid, non subgroup-uniform, or if the source lane is not
  active.
  """

  OPERATION_NAME = "gpu.subgroup_broadcast"

  _ODS_REGIONS = (0, True)

  def __init__(self, src, broadcast_type, *, lane=None, results=None, loc=None, ip=None):
    """Build a `gpu.subgroup_broadcast` op.

    `src` is the value to broadcast, `broadcast_type` the mandatory enum
    attribute (or a raw value convertible to one); `lane` is the optional
    trailing lane-index operand.
    """
    operands = []
    attributes = {}
    regions = None
    operands.append(src)
    # Optional trailing operand: appended only when provided.
    if lane is not None: operands.append(lane)
    _ods_context = _ods_get_default_loc_context(loc)
    # Mandatory enum attribute: pass through if already an Attribute (or no
    # registered builder exists), otherwise build via the registered builder.
    attributes["broadcast_type"] = (broadcast_type if (
    isinstance(broadcast_type, _ods_ir.Attribute) or
    not _ods_ir.AttrBuilder.contains('GPU_BroadcastTypeAttr')) else
      _ods_ir.AttrBuilder.get('GPU_BroadcastTypeAttr')(broadcast_type, context=_ods_context))
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def src(self) -> _ods_ir.Value:
    return self.operation.operands[0]

  @builtins.property
  def lane(self) -> _Optional[_ods_ir.Value[_ods_ir.IntegerType]]:
    # Optional second operand; None when the op was built without a lane.
    return None if len(self.operation.operands) < 2 else self.operation.operands[1]

  @builtins.property
  def broadcast_type(self) -> _ods_ir.Attribute:
    return self.operation.attributes["broadcast_type"]

  @broadcast_type.setter
  def broadcast_type(self, value: _ods_ir.Attribute):
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["broadcast_type"] = value

  @builtins.property
  def result(self) -> _ods_ir.OpResult:
    return self.operation.results[0]

def subgroup_broadcast(src, broadcast_type, *, lane=None, results=None, loc=None, ip=None) -> _ods_ir.OpResult:
  """Create a `gpu.subgroup_broadcast` op and return its single result."""
  op = SubgroupBroadcastOp(
      src=src,
      broadcast_type=broadcast_type,
      lane=lane,
      results=results,
      loc=loc,
      ip=ip)
  return op.result

@_ods_cext.register_operation(_Dialect)
class SubgroupIdOp(_ods_ir.OpView):
  r"""
  Returns the subgroup id, i.e., the index of the current subgroup within the
  workgroup.
  
  Example:
  
  ```mlir
  %sgId = gpu.subgroup_id : index
  ```
  
  Executions where there are more than `upper_bound` subgroups per workgroup
  cause undefined behavior. There is an implicit upper bound of `kMaxDim`
  (currently uint32_t::max).
  """

  OPERATION_NAME = "gpu.subgroup_id"

  _ODS_REGIONS = (0, True)

  def __init__(self, *, upper_bound=None, results=None, loc=None, ip=None):
    """Build a `gpu.subgroup_id` op with an optional `upper_bound` attribute."""
    operands = []
    attributes = {}
    regions = None
    _ods_context = _ods_get_default_loc_context(loc)
    # Optional attribute: pass through if already an Attribute (or no
    # registered builder exists), otherwise build via the registered builder.
    if upper_bound is not None: attributes["upper_bound"] = (upper_bound if (
        isinstance(upper_bound, _ods_ir.Attribute) or
        not _ods_ir.AttrBuilder.contains('IndexAttr')) else
          _ods_ir.AttrBuilder.get('IndexAttr')(upper_bound, context=_ods_context))
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def upper_bound(self) -> _Optional[_ods_ir.IntegerAttr]:
    # Optional attribute: absent means no explicit bound.
    if "upper_bound" not in self.operation.attributes:
      return None
    return self.operation.attributes["upper_bound"]

  @upper_bound.setter
  def upper_bound(self, value: _Optional[_ods_ir.IntegerAttr]):
    # Setting None removes the attribute (optional-attribute semantics).
    if value is not None:
      self.operation.attributes["upper_bound"] = value
    elif "upper_bound" in self.operation.attributes:
      del self.operation.attributes["upper_bound"]

  @upper_bound.deleter
  def upper_bound(self):
    del self.operation.attributes["upper_bound"]

  @builtins.property
  def result(self) -> _ods_ir.OpResult[_ods_ir.IndexType]:
    return self.operation.results[0]

def subgroup_id(*, upper_bound=None, results=None, loc=None, ip=None) -> _ods_ir.OpResult:
  """Create a `gpu.subgroup_id` op and return its single index result."""
  op = SubgroupIdOp(
      upper_bound=upper_bound,
      results=results,
      loc=loc,
      ip=ip)
  return op.result

@_ods_cext.register_operation(_Dialect)
class SubgroupMmaComputeOp(_ods_ir.OpView):
  r"""
  The `gpu.subgroup_mma_compute` operation performs a matrix-multiply accumulate (mma)
  operation using all the threads in a subgroup.
  
  This operation takes three `!gpu.mma_matrix`s as arguments: these hold `A`,
  `B` and `C`operands for the mma operation. The operation performed is represented
  as `C += A * B`. The op returns a `!gpu.mma_matrix` which contains the result of
  the operation held by all threads in a subgroup. `a_transpose` or
  `b_transpose` if present, signify that the respective operand was loaded in a
  transposed manner. The transpose operands are required to map to correct
  underlying intrisics but they currently do not seem to affect correctness
  even if they are absent given that the operands were loaded correctly using
  the `transpose` attribute in `gpu.subgroup_mma_load_matrix` op.
  
  For integer types, the `A` and `B` matrices carry their signedness with their
  types. The accumulator type is expected to be signless and imply a signed integer
  with a greater width than the other two operands.
  
  This op is meant to be used along with `gpu.subgroup_mma_store_matrix` and
  `gpu.subgroup_mma_load_matrix` ops.
  
  Example:
  
  ```mlir
  %D = gpu.subgroup_mma_compute_matrix %A, %B, %C :
    !gpu.mma_matrix<16x16xf16, "AOp">, !gpu.mma_matrix<16x16xf16, "BOp">>
    -> !gpu.mma_matrix<16x16xf16, "COp">
  ```
  """

  OPERATION_NAME = "gpu.subgroup_mma_compute"

  _ODS_REGIONS = (0, True)

  def __init__(self, opA, opB, opC, *, a_transpose=None, b_transpose=None, results=None, loc=None, ip=None):
    """Build a `gpu.subgroup_mma_compute` op.

    `a_transpose`/`b_transpose` are unit attributes: any truthy value sets
    them, falsy/None leaves them absent.
    """
    operands = []
    attributes = {}
    regions = None
    operands.append(opA)
    operands.append(opB)
    operands.append(opC)
    _ods_context = _ods_get_default_loc_context(loc)
    # Unit attributes encode presence only; set them iff the flag is truthy.
    if bool(a_transpose): attributes["a_transpose"] = _ods_ir.UnitAttr.get(
      _ods_get_default_loc_context(loc))
    if bool(b_transpose): attributes["b_transpose"] = _ods_ir.UnitAttr.get(
      _ods_get_default_loc_context(loc))
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def opA(self) -> _ods_ir.Value:
    return self.operation.operands[0]

  @builtins.property
  def opB(self) -> _ods_ir.Value:
    return self.operation.operands[1]

  @builtins.property
  def opC(self) -> _ods_ir.Value:
    return self.operation.operands[2]

  @builtins.property
  def a_transpose(self) -> bool:
    # Unit attribute: truth is its presence on the operation.
    return "a_transpose" in self.operation.attributes

  @a_transpose.setter
  def a_transpose(self, value):
    if bool(value):
      self.operation.attributes["a_transpose"] = _ods_ir.UnitAttr.get()
    elif "a_transpose" in self.operation.attributes:
      del self.operation.attributes["a_transpose"]

  @a_transpose.deleter
  def a_transpose(self):
    del self.operation.attributes["a_transpose"]

  @builtins.property
  def b_transpose(self) -> bool:
    # Unit attribute: truth is its presence on the operation.
    return "b_transpose" in self.operation.attributes

  @b_transpose.setter
  def b_transpose(self, value):
    if bool(value):
      self.operation.attributes["b_transpose"] = _ods_ir.UnitAttr.get()
    elif "b_transpose" in self.operation.attributes:
      del self.operation.attributes["b_transpose"]

  @b_transpose.deleter
  def b_transpose(self):
    del self.operation.attributes["b_transpose"]

  @builtins.property
  def res(self) -> _ods_ir.OpResult:
    return self.operation.results[0]

def subgroup_mma_compute(op_a, op_b, op_c, *, a_transpose=None, b_transpose=None, results=None, loc=None, ip=None) -> _ods_ir.OpResult:
  """Create a `gpu.subgroup_mma_compute` op and return its single result."""
  op = SubgroupMmaComputeOp(
      opA=op_a,
      opB=op_b,
      opC=op_c,
      a_transpose=a_transpose,
      b_transpose=b_transpose,
      results=results,
      loc=loc,
      ip=ip)
  return op.result

@_ods_cext.register_operation(_Dialect)
class SubgroupMmaConstantMatrixOp(_ods_ir.OpView):
  r"""
  The `gpu.subgroup_mma_constant_matrix` creates a `!gpu.mma_matrix` with
  constant elements.
  
  The operation takes a scalar input and return a `!gpu.mma_matrix` where
  each element of is equal to the operand constant. The destination
  mma_matrix type must have elememt type equal to the constant type. Since
  the layout of `!gpu.mma_matrix` is opaque this only support setting all the
  elements to the same value.
  
  This op is meant to be used along with `gpu.subgroup_mma_compute`.
  
  Example:
  
  ```mlir
   %0 = gpu.subgroup_mma_constant_matrix %a :
     !gpu.mma_matrix<16x16xf16, "AOp">
   %1 = gpu.subgroup_mma_constant_matrix %b :
     !gpu.mma_matrix<16x16xf32, "COp">
  ```
  """

  OPERATION_NAME = "gpu.subgroup_mma_constant_matrix"

  _ODS_REGIONS = (0, True)

  def __init__(self, res, value, *, loc=None, ip=None):
    """Build the op from its result type `res` and scalar operand `value`."""
    # No attributes, regions, or successors: just one operand and one result.
    _ods_context = _ods_get_default_loc_context(loc)
    super().__init__(
        self.OPERATION_NAME,
        self._ODS_REGIONS,
        self._ODS_OPERAND_SEGMENTS,
        self._ODS_RESULT_SEGMENTS,
        attributes={},
        results=[res],
        operands=[value],
        successors=None,
        regions=None,
        loc=loc,
        ip=ip)

  @builtins.property
  def value(self) -> _ods_ir.Value:
    """The scalar operand that fills every matrix element."""
    operands = self.operation.operands
    return operands[0]

  @builtins.property
  def res(self) -> _ods_ir.OpResult:
    """The resulting `!gpu.mma_matrix` value."""
    results = self.operation.results
    return results[0]

def subgroup_mma_constant_matrix(res, value, *, loc=None, ip=None) -> _ods_ir.OpResult:
  """Convenience builder for `gpu.subgroup_mma_constant_matrix`; returns its result."""
  op = SubgroupMmaConstantMatrixOp(res=res, value=value, loc=loc, ip=ip)
  return op.result

@_ods_cext.register_operation(_Dialect)
class SubgroupMmaElementwiseOp(_ods_ir.OpView):
  r"""
  The `gpu.subgroup_mma_elementwise` op takes `!gpu.mma_matrix` inputs and
  computes a new `!gpu.mma_matrix` by applying an elementwise operation to
  each element.

  Since the operation is elementwise and the matrix types must match, the
  matrix elements are processed independently of the matrix layout.

  This op is meant to be used along with `gpu.subgroup_mma_compute`.

  Example:

  ```mlir
   %0 = gpu.subgroup_mma_elementwise %A, %B { opType = "ADD" } :
    (!gpu.mma_matrix<16x16xf16, "COp">, !gpu.mma_matrix<16x16xf16, "COp">)
    -> !gpu.mma_matrix<16x16xf16, "COp">
  ```
  """

  OPERATION_NAME = "gpu.subgroup_mma_elementwise"

  _ODS_REGIONS = (0, True)

  def __init__(self, res, args, opType, *, loc=None, ip=None):
    """Build the op from result type `res`, matrix operands `args` and the
    elementwise operation kind `opType`."""
    operands = list(_get_op_results_or_values(args))
    results = [res]
    attributes = {}
    _ods_context = _ods_get_default_loc_context(loc)
    # Accept a ready-made Attribute, otherwise convert via the registered
    # MMAElementWiseAttr builder when one is available.
    if isinstance(opType, _ods_ir.Attribute) or not _ods_ir.AttrBuilder.contains('MMAElementWiseAttr'):
      attributes["opType"] = opType
    else:
      attributes["opType"] = _ods_ir.AttrBuilder.get('MMAElementWiseAttr')(opType, context=_ods_context)
    super().__init__(
        self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS,
        self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results,
        operands=operands, successors=None, regions=None, loc=loc, ip=ip)

  @builtins.property
  def args(self) -> _ods_ir.OpOperandList:
    """The variadic matrix operands (the single operand group covers all)."""
    total = len(self.operation.operands)
    return self.operation.operands[0:total]

  @builtins.property
  def opType(self) -> _ods_ir.Attribute:
    """The mandatory elementwise operation kind attribute."""
    return self.operation.attributes["opType"]

  @opType.setter
  def opType(self, value: _ods_ir.Attribute):
    if value is not None:
      self.operation.attributes["opType"] = value
    else:
      raise ValueError("'None' not allowed as value for mandatory attributes")

  @builtins.property
  def res(self) -> _ods_ir.OpResult:
    """The resulting `!gpu.mma_matrix` value."""
    all_results = self.operation.results
    return all_results[0]

def subgroup_mma_elementwise(res, args, op_type, *, loc=None, ip=None) -> _ods_ir.OpResult:
  """Convenience builder for `gpu.subgroup_mma_elementwise`; returns its result."""
  op = SubgroupMmaElementwiseOp(res=res, args=args, opType=op_type, loc=loc, ip=ip)
  return op.result

@_ods_cext.register_operation(_Dialect)
class SubgroupMmaExtractThreadLocalOp(_ods_ir.OpView):
  r"""
  The `gpu.subgroup_mma_extract_thread_local` op extracts a value from a
  `!gpu.mma_matrix` that is stored at subgroup level.

  The first operand is a `!gpu.mma_matrix`: the source matrix across a
  subgroup. The op returns a scalar value stored in the invocation in the
  subgroup.

  Since `matrix` is packed into the threads within a subgroup, `indices` are
  the indices into the values stored by each thread. That is, an index of 0
  (or [0, 0]) does not necessarily refer to the first element of the matrix,
  but the first element that a particular thread holds.

  The mapping of matrix elements to threads is not defined by this operation
  and may not be defined by some lowerings (such as the lowering to SPIR-V).
  However, if the size of the subgroup is S, then
  `subgroup_mma_extract_thread_local` at each index in `[0, (M * N) / S)`
  will have the entire matrix extracted across the subgroup.

  Example:

  ```mlir
  %c0 = arith.constant 0 : index
  %val = gpu.subgroup_mma_extract_thread_local %m[%c0] : !gpu.mma_matrix<16x16xf32, "AOp"> -> f32
  ```
  """

  OPERATION_NAME = "gpu.subgroup_mma_extract_thread_local"

  _ODS_REGIONS = (0, True)

  def __init__(self, matrix, indices, *, results=None, loc=None, ip=None):
    """Build the op from the source `matrix` and thread-local `indices`."""
    operands = [matrix, *_get_op_results_or_values(indices)]
    # No attributes on this op; the context is resolved from `loc` as usual.
    _ods_context = _ods_get_default_loc_context(loc)
    super().__init__(
        self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS,
        self._ODS_RESULT_SEGMENTS, attributes={}, results=results,
        operands=operands, successors=None, regions=None, loc=loc, ip=ip)

  @builtins.property
  def matrix(self) -> _ods_ir.Value:
    """The source `!gpu.mma_matrix` operand."""
    all_operands = self.operation.operands
    return all_operands[0]

  @builtins.property
  def indices(self) -> _ods_ir.OpOperandList:
    """The variadic index operands (everything after the matrix)."""
    total = len(self.operation.operands)
    return self.operation.operands[1:total]

  @builtins.property
  def res(self) -> _ods_ir.OpResult:
    """The extracted scalar result."""
    all_results = self.operation.results
    return all_results[0]

def subgroup_mma_extract_thread_local(matrix, indices, *, results=None, loc=None, ip=None) -> _ods_ir.OpResult:
  """Convenience builder for `gpu.subgroup_mma_extract_thread_local`; returns its result."""
  op = SubgroupMmaExtractThreadLocalOp(matrix=matrix, indices=indices,
                                       results=results, loc=loc, ip=ip)
  return op.result

@_ods_cext.register_operation(_Dialect)
class SubgroupMmaInsertThreadLocalOp(_ods_ir.OpView):
  r"""
  The `gpu.subgroup_mma_insert_thread_local` op inserts a value into a
  `!gpu.mma_matrix` that is stored at subgroup level.

  The first operand is a scalar value and the second is a `!gpu.mma_matrix`.
  The op inserts the scalar value into the matrix.

  Since `matrix` is packed into the threads within a subgroup, `indices` are
  the indices into the values stored by each thread. That is, an index of 0
  (or [0, 0]) does not necessarily refer to the first element of the matrix,
  but the first element that a particular thread holds.

  The mapping of matrix elements to threads is not defined by this operation
  and may not be defined by some lowerings (such as the lowering to SPIR-V).
  However, if the size of the subgroup is S, then
  `subgroup_mma_insert_thread_local` at each index in `[0, (M * N) / S)`
  will have the entire matrix inserted across the subgroup.

  The op returns a `!gpu.mma_matrix` with the updated value.

  Example:

  ```mlir
  %c0 = arith.constant 0 : index
  %s0 = gpu.subgroup_mma_insert_thread_local %val, %m[%c0] : f16, !gpu.mma_matrix<16x16xf16, "COp">
          -> !gpu.mma_matrix<16x16xf16, "COp">
  ```
  """

  OPERATION_NAME = "gpu.subgroup_mma_insert_thread_local"

  _ODS_REGIONS = (0, True)

  def __init__(self, res, value, matrix, indices, *, loc=None, ip=None):
    """Build the op from result type `res`, scalar `value`, target `matrix`
    and thread-local `indices`."""
    operands = [value, matrix, *_get_op_results_or_values(indices)]
    results = [res]
    # No attributes on this op; the context is resolved from `loc` as usual.
    _ods_context = _ods_get_default_loc_context(loc)
    super().__init__(
        self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS,
        self._ODS_RESULT_SEGMENTS, attributes={}, results=results,
        operands=operands, successors=None, regions=None, loc=loc, ip=ip)

  @builtins.property
  def value(self) -> _ods_ir.Value:
    """The scalar value being inserted."""
    all_operands = self.operation.operands
    return all_operands[0]

  @builtins.property
  def matrix(self) -> _ods_ir.Value:
    """The target `!gpu.mma_matrix` operand."""
    all_operands = self.operation.operands
    return all_operands[1]

  @builtins.property
  def indices(self) -> _ods_ir.OpOperandList:
    """The variadic index operands (everything after value and matrix)."""
    total = len(self.operation.operands)
    return self.operation.operands[2:total]

  @builtins.property
  def res(self) -> _ods_ir.OpResult:
    """The updated `!gpu.mma_matrix` result."""
    all_results = self.operation.results
    return all_results[0]

def subgroup_mma_insert_thread_local(res, value, matrix, indices, *, loc=None, ip=None) -> _ods_ir.OpResult:
  """Convenience builder for `gpu.subgroup_mma_insert_thread_local`; returns its result."""
  op = SubgroupMmaInsertThreadLocalOp(res=res, value=value, matrix=matrix,
                                      indices=indices, loc=loc, ip=ip)
  return op.result

@_ods_cext.register_operation(_Dialect)
class SubgroupMmaLoadMatrixOp(_ods_ir.OpView):
  r"""
  The `gpu.subgroup_mma_load_matrix` op loads a matrix collectively using all
  the threads in a subgroup.

  The first operand is a memref: the source matrix from which data is to be
  loaded. The op returns a `!gpu.mma_matrix`. The source memref can be in
  global memory or shared memory. The load address is determined using
  `indices`. The matrix being loaded into is the result. The `leadDimension`
  attribute specifies the leading dimension size of the source matrix, which
  eventually allows the lowering to determine the size of each row. If the
  `transpose` attribute is present then the op does a transposed load.

  For integer types, the resulting `!gpu.mma_matrix` type needs to specify
  the signedness of the data if the matrix type is an `A` or `B` operand for
  `gpu.subgroup_mma_compute`.

  This op is often meant to be used along with
  `gpu.subgroup_mma_store_matrix` and `gpu.subgroup_mma_compute`.

  Example:

  ```mlir
   %0 = gpu.subgroup_mma_load_matrix src[%i,%j] : {leadDimension = 32 : i32}
        : memref<32x32xf16, 3>, !gpu.mma_matrix<16x16xf16, "AOp">
  ```
  """

  OPERATION_NAME = "gpu.subgroup_mma_load_matrix"

  _ODS_REGIONS = (0, True)

  def __init__(self, res, srcMemref, indices, leadDimension, *, transpose=None, loc=None, ip=None):
    """Build the op from result type `res`, source memref, load `indices`,
    the mandatory `leadDimension` attribute and the optional `transpose`
    unit attribute."""
    operands = [srcMemref, *_get_op_results_or_values(indices)]
    results = [res]
    attributes = {}
    _ods_context = _ods_get_default_loc_context(loc)
    # Accept a ready-made Attribute, otherwise convert via the registered
    # IndexAttr builder when one is available.
    if isinstance(leadDimension, _ods_ir.Attribute) or not _ods_ir.AttrBuilder.contains('IndexAttr'):
      attributes["leadDimension"] = leadDimension
    else:
      attributes["leadDimension"] = _ods_ir.AttrBuilder.get('IndexAttr')(leadDimension, context=_ods_context)
    # Unit attribute: only materialized when truthy.
    if transpose:
      attributes["transpose"] = _ods_ir.UnitAttr.get(
          _ods_get_default_loc_context(loc))
    super().__init__(
        self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS,
        self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results,
        operands=operands, successors=None, regions=None, loc=loc, ip=ip)

  @builtins.property
  def srcMemref(self) -> _ods_ir.Value[_ods_ir.MemRefType]:
    """The source memref operand."""
    all_operands = self.operation.operands
    return all_operands[0]

  @builtins.property
  def indices(self) -> _ods_ir.OpOperandList:
    """The variadic index operands (everything after the memref)."""
    total = len(self.operation.operands)
    return self.operation.operands[1:total]

  @builtins.property
  def leadDimension(self) -> _ods_ir.IntegerAttr:
    """The mandatory leading-dimension attribute of the source matrix."""
    return self.operation.attributes["leadDimension"]

  @leadDimension.setter
  def leadDimension(self, value: _ods_ir.IntegerAttr):
    if value is not None:
      self.operation.attributes["leadDimension"] = value
    else:
      raise ValueError("'None' not allowed as value for mandatory attributes")

  @builtins.property
  def transpose(self) -> bool:
    """True when the `transpose` unit attribute is present on the op."""
    return "transpose" in self.operation.attributes

  @transpose.setter
  def transpose(self, value):
    # Unit attribute: presence encodes True, absence encodes False.
    attrs = self.operation.attributes
    if not value:
      if "transpose" in attrs:
        del attrs["transpose"]
      return
    attrs["transpose"] = _ods_ir.UnitAttr.get()

  @transpose.deleter
  def transpose(self):
    del self.operation.attributes["transpose"]

  @builtins.property
  def res(self) -> _ods_ir.OpResult:
    """The loaded `!gpu.mma_matrix` result."""
    all_results = self.operation.results
    return all_results[0]

def subgroup_mma_load_matrix(res, src_memref, indices, lead_dimension, *, transpose=None, loc=None, ip=None) -> _ods_ir.OpResult:
  """Convenience builder for `gpu.subgroup_mma_load_matrix`; returns its result."""
  op = SubgroupMmaLoadMatrixOp(res=res, srcMemref=src_memref, indices=indices,
                               leadDimension=lead_dimension,
                               transpose=transpose, loc=loc, ip=ip)
  return op.result

@_ods_cext.register_operation(_Dialect)
class SubgroupMmaStoreMatrixOp(_ods_ir.OpView):
  r"""
  The `gpu.subgroup_mma_store_matrix` op stores a matrix collectively using
  all the threads in a subgroup.

  The op takes a `!gpu.mma_matrix` and a memref as operands. The
  `!gpu.mma_matrix` is the source value containing the data to be stored
  into the destination memref, which can be in global or shared memory. The
  store address is determined using the indices provided. The
  `leadDimension` attribute specifies the leading dimension of the
  destination matrix. If the `transpose` attribute is present then the op
  does a transposed store.

  This op is often meant to be used along with
  `gpu.subgroup_mma_load_matrix` and `gpu.subgroup_mma_compute`.

  Example:

  ```mlir
  gpu.subgroup_mma_store_matrix %D, %sg[%i,%j] : { leadDimension = 32 : i32}
                  : !gpu.mma_matrix<16x16xf16, "COp">, memref<32x32xf16, 3>
  ```
  """

  OPERATION_NAME = "gpu.subgroup_mma_store_matrix"

  _ODS_REGIONS = (0, True)

  def __init__(self, src, dstMemref, indices, leadDimension, *, transpose=None, loc=None, ip=None):
    """Build the op from the source matrix `src`, destination memref, store
    `indices`, the mandatory `leadDimension` attribute and the optional
    `transpose` unit attribute. The op produces no results."""
    operands = [src, dstMemref, *_get_op_results_or_values(indices)]
    attributes = {}
    _ods_context = _ods_get_default_loc_context(loc)
    # Accept a ready-made Attribute, otherwise convert via the registered
    # IndexAttr builder when one is available.
    if isinstance(leadDimension, _ods_ir.Attribute) or not _ods_ir.AttrBuilder.contains('IndexAttr'):
      attributes["leadDimension"] = leadDimension
    else:
      attributes["leadDimension"] = _ods_ir.AttrBuilder.get('IndexAttr')(leadDimension, context=_ods_context)
    # Unit attribute: only materialized when truthy.
    if transpose:
      attributes["transpose"] = _ods_ir.UnitAttr.get(
          _ods_get_default_loc_context(loc))
    super().__init__(
        self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS,
        self._ODS_RESULT_SEGMENTS, attributes=attributes, results=[],
        operands=operands, successors=None, regions=None, loc=loc, ip=ip)

  @builtins.property
  def src(self) -> _ods_ir.Value:
    """The source `!gpu.mma_matrix` operand."""
    all_operands = self.operation.operands
    return all_operands[0]

  @builtins.property
  def dstMemref(self) -> _ods_ir.Value[_ods_ir.MemRefType]:
    """The destination memref operand."""
    all_operands = self.operation.operands
    return all_operands[1]

  @builtins.property
  def indices(self) -> _ods_ir.OpOperandList:
    """The variadic index operands (everything after src and dstMemref)."""
    total = len(self.operation.operands)
    return self.operation.operands[2:total]

  @builtins.property
  def leadDimension(self) -> _ods_ir.IntegerAttr:
    """The mandatory leading-dimension attribute of the destination matrix."""
    return self.operation.attributes["leadDimension"]

  @leadDimension.setter
  def leadDimension(self, value: _ods_ir.IntegerAttr):
    if value is not None:
      self.operation.attributes["leadDimension"] = value
    else:
      raise ValueError("'None' not allowed as value for mandatory attributes")

  @builtins.property
  def transpose(self) -> bool:
    """True when the `transpose` unit attribute is present on the op."""
    return "transpose" in self.operation.attributes

  @transpose.setter
  def transpose(self, value):
    # Unit attribute: presence encodes True, absence encodes False.
    attrs = self.operation.attributes
    if not value:
      if "transpose" in attrs:
        del attrs["transpose"]
      return
    attrs["transpose"] = _ods_ir.UnitAttr.get()

  @transpose.deleter
  def transpose(self):
    del self.operation.attributes["transpose"]

def subgroup_mma_store_matrix(src, dst_memref, indices, lead_dimension, *, transpose=None, loc=None, ip=None) -> SubgroupMmaStoreMatrixOp:
  """Convenience builder for `gpu.subgroup_mma_store_matrix`; returns the op
  itself since it produces no results."""
  return SubgroupMmaStoreMatrixOp(src=src, dstMemref=dst_memref,
                                  indices=indices,
                                  leadDimension=lead_dimension,
                                  transpose=transpose, loc=loc, ip=ip)

@_ods_cext.register_operation(_Dialect)
class SubgroupReduceOp(_ods_ir.OpView):
  r"""
  The `subgroup_reduce` op reduces the values of lanes (work items) across a
  subgroup.
  
  The subgroup is divided into clusters starting at lane index 0. Within each
  cluster, there are `size` lanes, and the lane index advances by `stride`.
  A reduction is done for each cluster in parallel: every lane in the cluster
  is reduced, and the result is equal for all lanes in the cluster. If `size`
  is omitted, there is a single cluster covering the entire subgroup. If
  `stride` is omitted, the stride is 1 (the cluster's lanes are contiguous).
  
  When the reduced value is of a vector type, each vector element is reduced
  independently. Only 1-d vector types are allowed.
  
  Example:
  
  ```mlir
  %1 = gpu.subgroup_reduce add %a : (f32) -> f32
  %2 = gpu.subgroup_reduce add %b : (vector<4xf16>) -> vector<4xf16>
  %3 = gpu.subgroup_reduce add %c cluster(size = 4) : (f32) -> f32
  %4 = gpu.subgroup_reduce add %c cluster(size = 4, stride = 2) : (f32) -> f32
  ```
  
  If `uniform` flag is set either none or all lanes of a subgroup need to execute
  this op in convergence.
  
  The reduction operation must be one of:
  *  Integer types: `add`, `mul`, `minui`, `minsi`, `maxui`, `maxsi`, `and`,
     `or`, `xor`
  *  Floating point types: `add`, `mul`, `minnumf`, `maxnumf`, `minimumf`,
     `maximumf`
  """

  OPERATION_NAME = "gpu.subgroup_reduce"

  _ODS_REGIONS = (0, True)

  def __init__(self, value, op, *, uniform=None, cluster_size=None, cluster_stride=None, results=None, loc=None, ip=None):
    """Build the op from operand `value` and reduction kind `op`, with the
    optional `uniform` unit attribute and optional `cluster_size` /
    `cluster_stride` attributes. When `results` is omitted, the single
    result type defaults to the operand's type."""
    operands = []
    attributes = {}
    regions = None
    operands.append(value)
    _ods_context = _ods_get_default_loc_context(loc)
    attributes["op"] = (op if (
    isinstance(op, _ods_ir.Attribute) or
    not _ods_ir.AttrBuilder.contains('GPU_AllReduceOperationAttr')) else
      _ods_ir.AttrBuilder.get('GPU_AllReduceOperationAttr')(op, context=_ods_context))
    # `uniform` is a unit attribute: only materialized when truthy.
    if bool(uniform): attributes["uniform"] = _ods_ir.UnitAttr.get(
      _ods_get_default_loc_context(loc))
    if cluster_size is not None: attributes["cluster_size"] = (cluster_size if (
        isinstance(cluster_size, _ods_ir.Attribute) or
        not _ods_ir.AttrBuilder.contains('I32Attr')) else
          _ods_ir.AttrBuilder.get('I32Attr')(cluster_size, context=_ods_context))
    if cluster_stride is not None: attributes["cluster_stride"] = (cluster_stride if (
        isinstance(cluster_stride, _ods_ir.Attribute) or
        not _ods_ir.AttrBuilder.contains('I32Attr')) else
          _ods_ir.AttrBuilder.get('I32Attr')(cluster_stride, context=_ods_context))
    # Infer the result type from the operand when not given explicitly.
    if results is None: results = [operands[0].type] * 1
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def value(self) -> _ods_ir.Value:
    """The value being reduced across the subgroup."""
    return self.operation.operands[0]

  @builtins.property
  def op(self) -> _ods_ir.Attribute:
    """The mandatory reduction-kind attribute."""
    return self.operation.attributes["op"]

  @op.setter
  def op(self, value: _ods_ir.Attribute):
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["op"] = value

  @builtins.property
  def uniform(self) -> bool:
    """True when the `uniform` unit attribute is present on the op."""
    return "uniform" in self.operation.attributes

  @uniform.setter
  def uniform(self, value):
    if bool(value):
      self.operation.attributes["uniform"] = _ods_ir.UnitAttr.get()
    elif "uniform" in self.operation.attributes:
      del self.operation.attributes["uniform"]

  @uniform.deleter
  def uniform(self):
    del self.operation.attributes["uniform"]

  @builtins.property
  def cluster_size(self) -> _Optional[_ods_ir.IntegerAttr]:
    """The optional `cluster_size` attribute, or None when absent."""
    if "cluster_size" not in self.operation.attributes:
      return None
    return self.operation.attributes["cluster_size"]

  @cluster_size.setter
  def cluster_size(self, value: _Optional[_ods_ir.IntegerAttr]):
    if value is not None:
      self.operation.attributes["cluster_size"] = value
    elif "cluster_size" in self.operation.attributes:
      del self.operation.attributes["cluster_size"]

  @cluster_size.deleter
  def cluster_size(self):
    del self.operation.attributes["cluster_size"]

  @builtins.property
  def cluster_stride(self) -> _Optional[_ods_ir.IntegerAttr]:
    """The optional `cluster_stride` attribute, or None when absent.

    NOTE(review): `cluster_stride` is built optionally in `__init__` and the
    op docs say the stride may be omitted, so this accessor mirrors the
    optional pattern used by `cluster_size` rather than the mandatory one
    (the previous getter raised KeyError when the attribute was elided).
    """
    if "cluster_stride" not in self.operation.attributes:
      return None
    return self.operation.attributes["cluster_stride"]

  @cluster_stride.setter
  def cluster_stride(self, value: _Optional[_ods_ir.IntegerAttr]):
    if value is not None:
      self.operation.attributes["cluster_stride"] = value
    elif "cluster_stride" in self.operation.attributes:
      del self.operation.attributes["cluster_stride"]

  @cluster_stride.deleter
  def cluster_stride(self):
    del self.operation.attributes["cluster_stride"]

  @builtins.property
  def result(self) -> _ods_ir.OpResult:
    """The reduced value (same for all lanes of a cluster)."""
    return self.operation.results[0]

def subgroup_reduce(value, op, *, uniform=None, cluster_size=None, cluster_stride=None, results=None, loc=None, ip=None) -> _ods_ir.OpResult:
  """Convenience builder for `gpu.subgroup_reduce`; returns its result."""
  reduce_op = SubgroupReduceOp(value=value, op=op, uniform=uniform,
                               cluster_size=cluster_size,
                               cluster_stride=cluster_stride,
                               results=results, loc=loc, ip=ip)
  return reduce_op.result

@_ods_cext.register_operation(_Dialect)
class SubgroupSizeOp(_ods_ir.OpView):
  r"""
  Returns the number of threads within a subgroup.

  Example:

  ```mlir
  %sgSz = gpu.subgroup_size : index
  ```

  Executions where the number of threads per subgroup exceed `upper_bound`
  cause undefined behavior. When no `upper_bound` is specified, range
  analyses and similar machinery assume the default bound of
  `kMaxSubgroupSize`, currently 128.
  """

  OPERATION_NAME = "gpu.subgroup_size"

  _ODS_REGIONS = (0, True)

  def __init__(self, *, upper_bound=None, results=None, loc=None, ip=None):
    """Build the op with an optional `upper_bound` attribute."""
    attributes = {}
    _ods_context = _ods_get_default_loc_context(loc)
    if upper_bound is not None:
      # Accept a ready-made Attribute, otherwise convert via the registered
      # IndexAttr builder when one is available.
      if isinstance(upper_bound, _ods_ir.Attribute) or not _ods_ir.AttrBuilder.contains('IndexAttr'):
        attributes["upper_bound"] = upper_bound
      else:
        attributes["upper_bound"] = _ods_ir.AttrBuilder.get('IndexAttr')(upper_bound, context=_ods_context)
    super().__init__(
        self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS,
        self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results,
        operands=[], successors=None, regions=None, loc=loc, ip=ip)

  @builtins.property
  def upper_bound(self) -> _Optional[_ods_ir.IntegerAttr]:
    """The optional `upper_bound` attribute, or None when absent."""
    attrs = self.operation.attributes
    return attrs["upper_bound"] if "upper_bound" in attrs else None

  @upper_bound.setter
  def upper_bound(self, value: _Optional[_ods_ir.IntegerAttr]):
    attrs = self.operation.attributes
    if value is None:
      if "upper_bound" in attrs:
        del attrs["upper_bound"]
    else:
      attrs["upper_bound"] = value

  @upper_bound.deleter
  def upper_bound(self):
    del self.operation.attributes["upper_bound"]

  @builtins.property
  def result(self) -> _ods_ir.OpResult[_ods_ir.IndexType]:
    """The subgroup size as an `index`-typed result."""
    all_results = self.operation.results
    return all_results[0]

def subgroup_size(*, upper_bound=None, results=None, loc=None, ip=None) -> _ods_ir.OpResult:
  """Convenience builder for `gpu.subgroup_size`; returns its result."""
  op = SubgroupSizeOp(upper_bound=upper_bound, results=results, loc=loc, ip=ip)
  return op.result

@_ods_cext.register_operation(_Dialect)
class TerminatorOp(_ods_ir.OpView):
  r"""
  A terminator operation for regions that appear in the body of the
  `gpu.launch` operation. These regions are not expected to return any
  value, so the terminator takes no operands.
  """

  OPERATION_NAME = "gpu.terminator"

  _ODS_REGIONS = (0, True)

  def __init__(self, *, loc=None, ip=None):
    """Build a `gpu.terminator` op; it has no operands, results or attributes."""
    _ods_context = _ods_get_default_loc_context(loc)
    super().__init__(
        self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS,
        self._ODS_RESULT_SEGMENTS, attributes={}, results=[], operands=[],
        successors=None, regions=None, loc=loc, ip=ip)

def terminator(*, loc=None, ip=None) -> TerminatorOp:
  """Convenience builder for `gpu.terminator`; returns the op itself."""
  return TerminatorOp(loc=loc, ip=ip)

@_ods_cext.register_operation(_Dialect)
class ThreadIdOp(_ods_ir.OpView):
  r"""
  Returns the thread id, i.e. the index of the current thread within the
  block along the x, y, or z `dimension`.

  Example:

  ```mlir
  %tIdX = gpu.thread_id x
  ```

  If `upper_bound` is set, or if one can be inferred from
  `known_block_size`-type annotations in context, executions where the
  thread index would be greater than or equal to that bound cause undefined
  behavior.

  There is an implicit upper bound of `kMaxDim` (currently uint32_t::max).
  """

  OPERATION_NAME = "gpu.thread_id"

  _ODS_REGIONS = (0, True)

  def __init__(self, dimension, *, upper_bound=None, results=None, loc=None, ip=None):
    """Build the op from the mandatory `dimension` attribute and the
    optional `upper_bound` attribute."""
    attributes = {}
    _ods_context = _ods_get_default_loc_context(loc)
    # Accept ready-made Attributes, otherwise convert via the registered
    # attribute builders when available.
    if isinstance(dimension, _ods_ir.Attribute) or not _ods_ir.AttrBuilder.contains('GPU_DimensionAttr'):
      attributes["dimension"] = dimension
    else:
      attributes["dimension"] = _ods_ir.AttrBuilder.get('GPU_DimensionAttr')(dimension, context=_ods_context)
    if upper_bound is not None:
      if isinstance(upper_bound, _ods_ir.Attribute) or not _ods_ir.AttrBuilder.contains('IndexAttr'):
        attributes["upper_bound"] = upper_bound
      else:
        attributes["upper_bound"] = _ods_ir.AttrBuilder.get('IndexAttr')(upper_bound, context=_ods_context)
    super().__init__(
        self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS,
        self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results,
        operands=[], successors=None, regions=None, loc=loc, ip=ip)

  @builtins.property
  def dimension(self) -> _ods_ir.Attribute:
    """The mandatory dimension (x, y or z) attribute."""
    return self.operation.attributes["dimension"]

  @dimension.setter
  def dimension(self, value: _ods_ir.Attribute):
    if value is not None:
      self.operation.attributes["dimension"] = value
    else:
      raise ValueError("'None' not allowed as value for mandatory attributes")

  @builtins.property
  def upper_bound(self) -> _Optional[_ods_ir.IntegerAttr]:
    """The optional `upper_bound` attribute, or None when absent."""
    attrs = self.operation.attributes
    return attrs["upper_bound"] if "upper_bound" in attrs else None

  @upper_bound.setter
  def upper_bound(self, value: _Optional[_ods_ir.IntegerAttr]):
    attrs = self.operation.attributes
    if value is None:
      if "upper_bound" in attrs:
        del attrs["upper_bound"]
    else:
      attrs["upper_bound"] = value

  @upper_bound.deleter
  def upper_bound(self):
    del self.operation.attributes["upper_bound"]

def thread_id(dimension, *, upper_bound=None, results=None, loc=None, ip=None) -> _ods_ir.OpResult:
  """Convenience builder for `gpu.thread_id`; returns its result."""
  op = ThreadIdOp(dimension=dimension, upper_bound=upper_bound,
                  results=results, loc=loc, ip=ip)
  return op.result

@_ods_cext.register_operation(_Dialect)
class WaitOp(_ods_ir.OpView):
  r"""
  This op synchronizes the host or the device with a list of dependent ops.

  If the op contains the `async` keyword, it returns a new async token which
  is synchronized with the op arguments. This new token is merely a shortcut
  to the argument list, and one could replace the uses of the result with
  the arguments for the same effect. The async version of this op is
  primarily used to make each async token have a single use during lowering
  and thereby make forks in async execution explicit. Example usage:

  ```mlir
  %t0 = gpu.foo async : !gpu.async.token
  %t1 = gpu.bar async : !gpu.async.token
  %t2 = gpu.wait async [%t0, %t1]
  // gpu.baz doesn't run until gpu.foo and gpu.bar have both completed, just
  // as if the async dependencies were [%t0, %t1].
  %t3 = gpu.baz async [%t2]
  ```

  If the op does not contain the `async` keyword, it does not return a new
  async token but blocks until all ops producing the async dependency tokens
  finished execution. All dependent memory operations are visible to the
  host once this op completes. Example usage:

  ```mlir
  %t0 = gpu.foo async : !gpu.async.token
  %t1 = gpu.bar async : !gpu.async.token
  // The gpu.wait op blocks until gpu.foo and gpu.bar have completed.
  gpu.wait [%t0, %t1]
  ```
  """

  OPERATION_NAME = "gpu.wait"

  _ODS_REGIONS = (0, True)

  def __init__(self, asyncToken, asyncDependencies, *, loc=None, ip=None):
    """Build the op from the optional `asyncToken` result type (None for the
    blocking form) and the `asyncDependencies` token operands."""
    operands = list(_get_op_results_or_values(asyncDependencies))
    # The async form carries one token result; the blocking form has none.
    results = [] if asyncToken is None else [asyncToken]
    _ods_context = _ods_get_default_loc_context(loc)
    super().__init__(
        self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS,
        self._ODS_RESULT_SEGMENTS, attributes={}, results=results,
        operands=operands, successors=None, regions=None, loc=loc, ip=ip)

  @builtins.property
  def asyncDependencies(self) -> _ods_ir.OpOperandList:
    """The variadic async token operands (the single group covers all)."""
    total = len(self.operation.operands)
    return self.operation.operands[0:total]

  @builtins.property
  def asyncToken(self) -> _Optional[_ods_ir.OpResult]:
    """The optional async token result, or None for the blocking form."""
    all_results = self.operation.results
    return all_results[0] if len(all_results) >= 1 else None

def wait(async_token, async_dependencies, *, loc=None, ip=None) -> _Union[_ods_ir.OpResult, _ods_ir.OpResultList, WaitOp]:
  """Convenience builder for `gpu.wait`; returns the single result, the
  result list when there are several, or the op itself when it has none."""
  op = WaitOp(asyncToken=async_token, asyncDependencies=async_dependencies,
              loc=loc, ip=ip)
  results = op.results
  if len(results) > 1:
    return results
  if len(results) == 1:
    return results[0]
  return op

@_ods_cext.register_operation(_Dialect)
class WarpExecuteOnLane0Op(_ods_ir.OpView):
  r"""
  `warp_execute_on_lane_0` is an operation used to bridge the gap between
  vector programming and SPMD programming model like GPU SIMT. It allows to
  trivially convert a region of vector code meant to run on a multiple threads
  into a valid SPMD region and then allows incremental transformation to
  distribute vector operations on the threads.
  
  Any code present in the region would only be executed on first thread/lane
  based on the `laneid` operand. The `laneid` operand is an integer ID between
  [0, `warp_size`). The `warp_size` attribute indicates the number of lanes in
  a warp.
  
  Operands are vector values distributed on all lanes that may be used by
  the single lane execution. The matching region argument is a vector of all
  the values of those lanes available to the single active lane. The
  distributed dimension is implicit based on the shape of the operand and
  argument. The properties of the distribution may be described by extra
  attributes (e.g. affine map).
  
  Return values are distributed on all lanes using laneId as index. The
  vector is distributed based on the shape ratio between the vector type of
  the yield and the result type.
  If the shapes are the same this means the value is broadcasted to all lanes.
  In the future the distribution can be made more explicit using affine_maps
  and will support having multiple Ids.
  
  Therefore the `warp_execute_on_lane_0` operations allow to implicitly copy
  between lane0 and the lanes of the warp. When distributing a vector
  from lane0 to all the lanes, the data are distributed in a block cyclic way.
  For example `vector<64xf32>` gets distributed on 32 threads and map to
  `vector<2xf32>` where thread 0 contains vector[0] and vector[1].
  
  During lowering values passed as operands and return value need to be
  visible to different lanes within the warp. This would usually be done by
  going through memory.
  
  The region is *not* isolated from above. For values coming from the parent
  region not going through operands only the lane 0 value will be accessible so
  it generally only makes sense for uniform values.
  
  Example:
  ```
  // Execute in parallel on all threads/lanes.
  gpu.warp_execute_on_lane_0 (%laneid)[32] {
    // Serial code running only on thread/lane 0.
    ...
  }
  // Execute in parallel on all threads/lanes.
  ```
  
  This may be lowered to an scf.if region as below:
  ```
    // Execute in parallel on all threads/lanes.
    %cnd = arith.cmpi eq, %laneid, %c0 : index
    scf.if %cnd {
      // Serial code running only on thread/lane 0.
      ...
    }
    // Execute in parallel on all threads/lanes.
  ```
  
  When the region has operands and/or return values:
  ```
  // Execute in parallel on all threads/lanes.
  %0 = gpu.warp_execute_on_lane_0(%laneid)[32]
  args(%v0 : vector<4xi32>) -> (vector<1xf32>) {
  ^bb0(%arg0 : vector<128xi32>) :
    // Serial code running only on thread/lane 0.
    ...
    gpu.yield %1 : vector<32xf32>
  }
  // Execute in parallel on all threads/lanes.
  ```
  
  values at the region boundary would go through memory:
  ```
  // Execute in parallel on all threads/lanes.
  ...
  // Store the data from each thread into memory and Synchronization.
  %tmp0 = memref.alloc() : memref<128xf32>
  %tmp1 = memref.alloc() : memref<32xf32>
  %cnd = arith.cmpi eq, %laneid, %c0 : index
  vector.store %v0, %tmp0[%laneid] : memref<128xf32>, vector<4xf32>
  some_synchronization_primitive
  scf.if %cnd {
    // Serialized code running only on thread 0.
    // Load the data from all the threads into a register from thread 0. This
    // allows thread 0 to access data from all the threads.
    %arg0 = vector.load %tmp0[%c0] : memref<128xf32>, vector<128xf32>
    ...
    // Store the data from thread 0 into memory.
    vector.store %1, %tmp1[%c0] : memref<32xf32>, vector<32xf32>
  }
  // Synchronization and load the data in a block cyclic way so that the
  // vector is distributed on all threads.
  some_synchronization_primitive
  %0 = vector.load %tmp1[%laneid] : memref<32xf32>, vector<32xf32>
  // Execute in parallel on all threads/lanes.
  ```
  
  """

  OPERATION_NAME = "gpu.warp_execute_on_lane_0"

  # One region (the warp region); `True` marks the region count as exact.
  _ODS_REGIONS = (1, True)

  def __init__(self, results_, laneid, warp_size, args, *, loc=None, ip=None):
    """Build a `gpu.warp_execute_on_lane_0` op.

    results_: result types of the op (variadic).
    laneid: index-typed operand selecting the active lane.
    warp_size: I64 attribute (or plain int to be wrapped) giving the warp width.
    args: variadic vector operands forwarded into the warp region.
    """
    operands = []
    attributes = {}
    regions = None
    operands.append(laneid)
    operands.extend(_get_op_results_or_values(args))
    _ods_context = _ods_get_default_loc_context(loc)
    # Pass `warp_size` through untouched when it is already an Attribute (or
    # when no I64Attr builder is registered); otherwise build an I64Attr.
    attributes["warp_size"] = (warp_size if (
    isinstance(warp_size, _ods_ir.Attribute) or
    not _ods_ir.AttrBuilder.contains('I64Attr')) else
      _ods_ir.AttrBuilder.get('I64Attr')(warp_size, context=_ods_context))
    results = []
    results.extend(results_)
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def laneid(self) -> _ods_ir.Value[_ods_ir.IndexType]:
    # Single fixed operand preceding the variadic `args` group.
    return self.operation.operands[0]

  @builtins.property
  def args(self) -> _ods_ir.OpOperandList:
    # Everything after `laneid` belongs to the one variadic group.
    _ods_variadic_group_length = len(self.operation.operands) - 2 + 1
    return self.operation.operands[1:1 + _ods_variadic_group_length]

  @builtins.property
  def warp_size(self) -> _ods_ir.IntegerAttr:
    return self.operation.attributes["warp_size"]

  @warp_size.setter
  def warp_size(self, value: _ods_ir.IntegerAttr):
    # `warp_size` is mandatory on this op, so it can never be unset.
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["warp_size"] = value

  @builtins.property
  def results_(self) -> _ods_ir.OpResultList:
    # All results form one variadic group.
    _ods_variadic_group_length = len(self.operation.results) - 1 + 1
    return self.operation.results[0:0 + _ods_variadic_group_length]

  @builtins.property
  def warpRegion(self) -> _ods_ir.Region:
    # The op's single region, executed only by lane 0.
    return self.regions[0]

def warp_execute_on_lane_0(results_, laneid, warp_size, args, *, loc=None, ip=None) -> _Union[_ods_ir.OpResult, _ods_ir.OpResultList, WarpExecuteOnLane0Op]:
  """Create a `gpu.warp_execute_on_lane_0` op, unwrapping a sole result."""
  op = WarpExecuteOnLane0Op(results_=results_, laneid=laneid,
                            warp_size=warp_size, args=args, loc=loc, ip=ip)
  results = op.results
  num_results = len(results)
  if num_results > 1:
    return results
  # One result: hand back the value itself; none: hand back the op.
  return results[0] if num_results == 1 else op

@_ods_cext.register_operation(_Dialect)
class YieldOp(_ods_ir.OpView):
  r"""
  `gpu.yield` is a special terminator operation for blocks inside regions
  in gpu ops. It returns values to the immediately enclosing gpu op.
  
  Example:
  
  ```mlir
  gpu.yield %f0, %f1 : f32, f32
  ```
  """

  OPERATION_NAME = "gpu.yield"

  # No regions of its own; this op is a pure terminator.
  _ODS_REGIONS = (0, True)

  def __init__(self, values, *, loc=None, ip=None):
    """Build a `gpu.yield` terminator from its variadic `values` operands."""
    _ods_context = _ods_get_default_loc_context(loc)
    operand_list = list(_get_op_results_or_values(values))
    # Terminator: no results, no attributes, no regions, no successors.
    super().__init__(
        self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS,
        self._ODS_RESULT_SEGMENTS, attributes={}, results=[],
        operands=operand_list, successors=None, regions=None, loc=loc, ip=ip)

  @builtins.property
  def values(self) -> _ods_ir.OpOperandList:
    # The sole (variadic) operand group spans every operand.
    group_size = len(self.operation.operands)
    return self.operation.operands[0:group_size]

def yield_(values, *, loc=None, ip=None) -> YieldOp:
  """Create a `gpu.yield` op; it has no results, so the op itself is returned."""
  return YieldOp(values=values, loc=loc, ip=ip)
