# Autogenerated by mlir-tblgen; don't manually edit.

from ._ods_common import _cext as _ods_cext
from ._ods_common import (
    equally_sized_accessor as _ods_equally_sized_accessor,
    get_default_loc_context as _ods_get_default_loc_context,
    get_op_results_or_values as _get_op_results_or_values,
    segmented_accessor as _ods_segmented_accessor,
)
_ods_ir = _ods_cext.ir
_ods_cext.globals.register_traceback_file_exclusion(__file__)

import builtins
from typing import Sequence as _Sequence, Union as _Union, Optional as _Optional


@_ods_cext.register_dialect
class _Dialect(_ods_ir.Dialect):
  DIALECT_NAMESPACE = "nvgpu"

@_ods_cext.register_operation(_Dialect)
class DeviceAsyncCopyOp(_ods_ir.OpView):
  r"""
  The `nvgpu.device_async_copy` op initiates an asynchronous copy operation of
  elements from source (global memory) to the destination (shared memory)
  without blocking the thread. The async copy is added to a group.
  
  This op is meant to be used with `nvgpu.device_async_create_group` and
  `nvgpu.device_async_wait` to synchronize copies as explained in those ops
  descriptions.
  
  `bypassL1` attribute is hint to the hardware to bypass the L1 cache during
  async copy, this hint may be ignored by the hardware.
  
  `dstElements` attribute is the total number of elements written to
  destination (shared memory).
  
  `srcElements` argument is the total number of elements read from
  source (global memory).
  
  `srcElements` is an optional argument and when present the op only reads
  `srcElements` number of elements from the source (global memory) and zero fills
  the rest of the elements in the destination (shared memory).
  
  In order to do a copy and wait for the result we need the following
  combination:
  ```
  // copy 1.
  %cp1 = nvgpu.device_async_copy %A[%c0], %B[%c0], 4 :memref<16xf32> to memref<16xf32, 3>
  // copy 2.
  %cp2 = nvgpu.device_async_copy %C[%c0], %D[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
  // group 1 contains copy 1 and copy 2.
  %token1 = nvgpu.device_async_create_group %cp1, %cp2
  // copy 3.
  %cp3 = nvgpu.device_async_copy %E[%c0], %F[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
  // group 2 contains copy 3.
  %token2 = nvgpu.device_async_create_group %cp3
  // after the wait copy 1 and copy 2 are complete.
  nvgpu.device_async_wait %token1
  // after the wait copy 3 is complete.
  nvgpu.device_async_wait %token2
  ```
  
  Example:
  
  ```mlir
  %0 = nvgpu.device_async_copy %src[%c0, %c0], %dst[%c0, %c0, %c0], 4 :
    memref<4x5xf32> to memref<2x7x5xf32, 3>
  ```
  """

  OPERATION_NAME = "nvgpu.device_async_copy"

  _ODS_OPERAND_SEGMENTS = [1,-1,1,-1,0,]

  _ODS_REGIONS = (0, True)

  def __init__(self, dst, dstIndices, src, srcIndices, dstElements, *, srcElements=None, bypassL1=None, results=None, loc=None, ip=None):
    operands = []
    attributes = {}
    regions = None
    operands.append(dst)
    operands.append(_get_op_results_or_values(dstIndices))
    operands.append(src)
    operands.append(_get_op_results_or_values(srcIndices))
    operands.append(srcElements)
    _ods_context = _ods_get_default_loc_context(loc)
    attributes["dstElements"] = (dstElements if (
    isinstance(dstElements, _ods_ir.Attribute) or
    not _ods_ir.AttrBuilder.contains('IndexAttr')) else
      _ods_ir.AttrBuilder.get('IndexAttr')(dstElements, context=_ods_context))
    if bool(bypassL1): attributes["bypassL1"] = _ods_ir.UnitAttr.get(
      _ods_get_default_loc_context(loc))
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def dst(self) -> _ods_ir.Value[_ods_ir.MemRefType]:
    operand_range = _ods_segmented_accessor(
         self.operation.operands,
         self.operation.attributes["operandSegmentSizes"], 0)
    return operand_range[0]

  @builtins.property
  def dstIndices(self) -> _ods_ir.OpOperandList:
    operand_range = _ods_segmented_accessor(
         self.operation.operands,
         self.operation.attributes["operandSegmentSizes"], 1)
    return operand_range

  @builtins.property
  def src(self) -> _ods_ir.Value[_ods_ir.MemRefType]:
    operand_range = _ods_segmented_accessor(
         self.operation.operands,
         self.operation.attributes["operandSegmentSizes"], 2)
    return operand_range[0]

  @builtins.property
  def srcIndices(self) -> _ods_ir.OpOperandList:
    operand_range = _ods_segmented_accessor(
         self.operation.operands,
         self.operation.attributes["operandSegmentSizes"], 3)
    return operand_range

  @builtins.property
  def srcElements(self) -> _Optional[_ods_ir.Value[_ods_ir.IndexType]]:
    operand_range = _ods_segmented_accessor(
         self.operation.operands,
         self.operation.attributes["operandSegmentSizes"], 4)
    return operand_range[0] if len(operand_range) > 0 else None

  @builtins.property
  def dstElements(self) -> _ods_ir.IntegerAttr:
    return self.operation.attributes["dstElements"]

  @dstElements.setter
  def dstElements(self, value: _ods_ir.IntegerAttr):
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["dstElements"] = value

  @builtins.property
  def bypassL1(self) -> bool:
    return "bypassL1" in self.operation.attributes

  @bypassL1.setter
  def bypassL1(self, value):
    if bool(value):
      self.operation.attributes["bypassL1"] = _ods_ir.UnitAttr.get()
    elif "bypassL1" in self.operation.attributes:
      del self.operation.attributes["bypassL1"]

  @bypassL1.deleter
  def bypassL1(self):
    del self.operation.attributes["bypassL1"]

  @builtins.property
  def asyncToken(self) -> _ods_ir.OpResult:
    return self.operation.results[0]

def device_async_copy(dst, dst_indices, src, src_indices, dst_elements, *, src_elements=None, bypass_l1=None, results=None, loc=None, ip=None) -> _ods_ir.OpResult:
  return DeviceAsyncCopyOp(dst=dst, dstIndices=dst_indices, src=src, srcIndices=src_indices, dstElements=dst_elements, srcElements=src_elements, bypassL1=bypass_l1, results=results, loc=loc, ip=ip).result

@_ods_cext.register_operation(_Dialect)
class DeviceAsyncCreateGroupOp(_ods_ir.OpView):
  r"""
    The `nvgpu.device_async_create_group` op creates a group of memory accesses
    containing all the pending `device_async_copy` operations associated with
    argument tokens. Each token can only be part of one group.
  
    It returns a token that can be use to wait until the group fully completes.
  
    This is meant to be used with `nvgpu.device_async_wait` to synchronize copies
    as explained in those ops descriptions.
  
    Groups are executed in the order they are created.
  
    Example:
  
    ```mlir
    %0 = nvgpu.device_async_create_group
  ```
  """

  OPERATION_NAME = "nvgpu.device_async_create_group"

  _ODS_REGIONS = (0, True)

  def __init__(self, inputTokens, *, results=None, loc=None, ip=None):
    operands = []
    attributes = {}
    regions = None
    operands.extend(_get_op_results_or_values(inputTokens))
    _ods_context = _ods_get_default_loc_context(loc)
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def inputTokens(self) -> _ods_ir.OpOperandList:
    _ods_variadic_group_length = len(self.operation.operands) - 1 + 1
    return self.operation.operands[0:0 + _ods_variadic_group_length]

  @builtins.property
  def asyncToken(self) -> _ods_ir.OpResult:
    return self.operation.results[0]

def device_async_create_group(input_tokens, *, results=None, loc=None, ip=None) -> _ods_ir.OpResult:
  return DeviceAsyncCreateGroupOp(inputTokens=input_tokens, results=results, loc=loc, ip=ip).result

@_ods_cext.register_operation(_Dialect)
class DeviceAsyncWaitOp(_ods_ir.OpView):
  r"""
  The `nvgpu.device_async_wait` op will block the execution thread until the group
  associated with the source token is fully completed.
  
  The optional `$numGroups` attribute gives an upper bound of the number of
  groups uncompleted when the wait can unblock the thread. For example,  if
  16 async groups are pushe and `$numGroups` is set to 12, then the thread
  will unblock when 12 groups or fewer are in flight (4 groups have
  completed).
  
  Example:
  
  ```mlir
  nvgpu.device_async_wait %0
  ```
  """

  OPERATION_NAME = "nvgpu.device_async_wait"

  _ODS_REGIONS = (0, True)

  def __init__(self, asyncDependencies, *, numGroups=None, loc=None, ip=None):
    operands = []
    attributes = {}
    regions = None
    operands.append(asyncDependencies)
    _ods_context = _ods_get_default_loc_context(loc)
    if numGroups is not None: attributes["numGroups"] = (numGroups if (
        isinstance(numGroups, _ods_ir.Attribute) or
        not _ods_ir.AttrBuilder.contains('I32Attr')) else
          _ods_ir.AttrBuilder.get('I32Attr')(numGroups, context=_ods_context))
    results = []
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def asyncDependencies(self) -> _ods_ir.Value:
    return self.operation.operands[0]

  @builtins.property
  def numGroups(self) -> _Optional[_ods_ir.IntegerAttr]:
    if "numGroups" not in self.operation.attributes:
      return None
    return self.operation.attributes["numGroups"]

  @numGroups.setter
  def numGroups(self, value: _Optional[_ods_ir.IntegerAttr]):
    if value is not None:
      self.operation.attributes["numGroups"] = value
    elif "numGroups" in self.operation.attributes:
      del self.operation.attributes["numGroups"]

  @numGroups.deleter
  def numGroups(self):
    del self.operation.attributes["numGroups"]

def device_async_wait(async_dependencies, *, num_groups=None, loc=None, ip=None) -> DeviceAsyncWaitOp:
  return DeviceAsyncWaitOp(asyncDependencies=async_dependencies, numGroups=num_groups, loc=loc, ip=ip)

@_ods_cext.register_operation(_Dialect)
class LdMatrixOp(_ods_ir.OpView):
  r"""
  The `nvgpu.ldmatrix` op represents loading a matrix fragment from
  memory to registers. The source and result type must be compatible
  with lowering to the `nvvm.ldmatrix` instruction. This op represents
  the distributed version of a `vector.transfer_read` as an intermediate
  step between lowering from `vector.transfer_read` to `nvvm.ldmatrix`.
  
  This operation is meant to follow the semantic of described here:
  https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-ldmatrix
  
  Example:
  ```mlir
  %0 = nvgpu.ldmatrix %sm[%c0, %c0] {numTiles = 4 : i32, transpose = false} :
    memref<?x?xf16, 3> -> vector<4x2xf16>
  ```
  """

  OPERATION_NAME = "nvgpu.ldmatrix"

  _ODS_REGIONS = (0, True)

  def __init__(self, res, srcMemref, indices, transpose, numTiles, *, loc=None, ip=None):
    operands = []
    attributes = {}
    regions = None
    operands.append(srcMemref)
    operands.extend(_get_op_results_or_values(indices))
    _ods_context = _ods_get_default_loc_context(loc)
    attributes["transpose"] = (transpose if (
    isinstance(transpose, _ods_ir.Attribute) or
    not _ods_ir.AttrBuilder.contains('BoolAttr')) else
      _ods_ir.AttrBuilder.get('BoolAttr')(transpose, context=_ods_context))
    attributes["numTiles"] = (numTiles if (
    isinstance(numTiles, _ods_ir.Attribute) or
    not _ods_ir.AttrBuilder.contains('I32Attr')) else
      _ods_ir.AttrBuilder.get('I32Attr')(numTiles, context=_ods_context))
    results = []
    results.append(res)
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def srcMemref(self) -> _ods_ir.Value[_ods_ir.MemRefType]:
    return self.operation.operands[0]

  @builtins.property
  def indices(self) -> _ods_ir.OpOperandList:
    _ods_variadic_group_length = len(self.operation.operands) - 2 + 1
    return self.operation.operands[1:1 + _ods_variadic_group_length]

  @builtins.property
  def transpose(self) -> _ods_ir.BoolAttr:
    return self.operation.attributes["transpose"]

  @transpose.setter
  def transpose(self, value: _ods_ir.BoolAttr):
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["transpose"] = value

  @builtins.property
  def numTiles(self) -> _ods_ir.IntegerAttr:
    return self.operation.attributes["numTiles"]

  @numTiles.setter
  def numTiles(self, value: _ods_ir.IntegerAttr):
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["numTiles"] = value

  @builtins.property
  def res(self) -> _ods_ir.OpResult[_ods_ir.VectorType]:
    return self.operation.results[0]

def ldmatrix(res, src_memref, indices, transpose, num_tiles, *, loc=None, ip=None) -> _ods_ir.OpResult:
  return LdMatrixOp(res=res, srcMemref=src_memref, indices=indices, transpose=transpose, numTiles=num_tiles, loc=loc, ip=ip).result

@_ods_cext.register_operation(_Dialect)
class MBarrierArriveExpectTxOp(_ods_ir.OpView):
  r"""
  A thread executing the Op performs an expect-tx operation on the mbarrier 
  object at the location specified by the address operand $barrier. The 
  expect-tx operation, with an $txcount argument, increases the tx-count of 
  an mbarrier object by the value specified by $txcount. This makes the 
  current phase of the mbarrier object to expect and track the completion of 
  additional asynchronous transactions.
  
  The `$txCount` specifies the number of element to the expect-tx operation.
  
  Example:
  ```mlir
    nvgpu.mbarrier.arrive.expect_tx %barrier, %ic0 : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
  ```
  """

  OPERATION_NAME = "nvgpu.mbarrier.arrive.expect_tx"

  _ODS_REGIONS = (0, True)

  def __init__(self, barriers, txcount, mbarId, *, predicate=None, loc=None, ip=None):
    operands = []
    attributes = {}
    regions = None
    operands.append(barriers)
    operands.append(txcount)
    operands.append(mbarId)
    if predicate is not None: operands.append(predicate)
    _ods_context = _ods_get_default_loc_context(loc)
    results = []
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def barriers(self) -> _ods_ir.Value:
    return self.operation.operands[0]

  @builtins.property
  def txcount(self) -> _ods_ir.Value[_ods_ir.IndexType]:
    return self.operation.operands[1]

  @builtins.property
  def mbarId(self) -> _ods_ir.Value[_ods_ir.IndexType]:
    return self.operation.operands[2]

  @builtins.property
  def predicate(self) -> _Optional[_ods_ir.Value[_ods_ir.IntegerType]]:
    return None if len(self.operation.operands) < 4 else self.operation.operands[3]

def mbarrier_arrive_expect_tx(barriers, txcount, mbar_id, *, predicate=None, loc=None, ip=None) -> MBarrierArriveExpectTxOp:
  return MBarrierArriveExpectTxOp(barriers=barriers, txcount=txcount, mbarId=mbar_id, predicate=predicate, loc=loc, ip=ip)

@_ods_cext.register_operation(_Dialect)
class MBarrierArriveNoCompleteOp(_ods_ir.OpView):
  r"""
  The Op performs arrive-on operation on the `mbarrier` object and returns a 
  `nvgpu.mbarrier.token`.
  
  The Op does not cause the `nvgpu.mbarrier` to complete its current phase.
  
  Example:
  ```mlir
    %token = nvgpu.mbarrier.arrive.noComplete %barrier, %count : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>> -> !nvgpu.mbarrier.token
  ```
  """

  OPERATION_NAME = "nvgpu.mbarrier.arrive.nocomplete"

  _ODS_REGIONS = (0, True)

  def __init__(self, barriers, mbarId, count, *, results=None, loc=None, ip=None):
    operands = []
    attributes = {}
    regions = None
    operands.append(barriers)
    operands.append(mbarId)
    operands.append(count)
    _ods_context = _ods_get_default_loc_context(loc)
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def barriers(self) -> _ods_ir.Value:
    return self.operation.operands[0]

  @builtins.property
  def mbarId(self) -> _ods_ir.Value[_ods_ir.IndexType]:
    return self.operation.operands[1]

  @builtins.property
  def count(self) -> _ods_ir.Value[_ods_ir.IndexType]:
    return self.operation.operands[2]

  @builtins.property
  def token(self) -> _ods_ir.OpResult:
    return self.operation.results[0]

def mbarrier_arrive_nocomplete(barriers, mbar_id, count, *, results=None, loc=None, ip=None) -> _ods_ir.OpResult:
  return MBarrierArriveNoCompleteOp(barriers=barriers, mbarId=mbar_id, count=count, results=results, loc=loc, ip=ip).result

@_ods_cext.register_operation(_Dialect)
class MBarrierArriveOp(_ods_ir.OpView):
  r"""
  The Op performs arrive-on operation on the `mbarrier` object and returns a 
  `nvgpu.mbarrier.token`.
  
  For more information, see
  https://docs.nvidia.com/cuda/parallel-thread-execution/#arrive-on-operation-on-mbarrier-object
  
  Example:
  ```mlir
    %token = nvgpu.mbarrier.arrive %barrier : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>> -> !nvgpu.mbarrier.token
  ```
  """

  OPERATION_NAME = "nvgpu.mbarrier.arrive"

  _ODS_REGIONS = (0, True)

  def __init__(self, barriers, mbarId, *, results=None, loc=None, ip=None):
    operands = []
    attributes = {}
    regions = None
    operands.append(barriers)
    operands.append(mbarId)
    _ods_context = _ods_get_default_loc_context(loc)
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def barriers(self) -> _ods_ir.Value:
    return self.operation.operands[0]

  @builtins.property
  def mbarId(self) -> _ods_ir.Value[_ods_ir.IndexType]:
    return self.operation.operands[1]

  @builtins.property
  def token(self) -> _ods_ir.OpResult:
    return self.operation.results[0]

def mbarrier_arrive(barriers, mbar_id, *, results=None, loc=None, ip=None) -> _ods_ir.OpResult:
  return MBarrierArriveOp(barriers=barriers, mbarId=mbar_id, results=results, loc=loc, ip=ip).result

@_ods_cext.register_operation(_Dialect)
class MBarrierCreateOp(_ods_ir.OpView):
  r"""
  The Op generates one or more `mbarrier` object, which is a barrier created in 
  shared memory and supports various synchronization behaviors for threads.
  
  The `mbarrier` object has the following type and alignment requirements:
    Type: .b64, Alignment: 8, Memory space: .shared
  
  Example:
  ```mlir
    %barrier = nvgpu.mbarrier.create -> !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
  ```
  """

  OPERATION_NAME = "nvgpu.mbarrier.create"

  _ODS_REGIONS = (0, True)

  def __init__(self, barriers, *, loc=None, ip=None):
    operands = []
    attributes = {}
    regions = None
    _ods_context = _ods_get_default_loc_context(loc)
    results = []
    results.append(barriers)
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def barriers(self) -> _ods_ir.OpResult:
    return self.operation.results[0]

def mbarrier_create(barriers, *, loc=None, ip=None) -> _ods_ir.OpResult:
  return MBarrierCreateOp(barriers=barriers, loc=loc, ip=ip).result

@_ods_cext.register_operation(_Dialect)
class MBarrierGetOp(_ods_ir.OpView):
  r"""
  The `nvgpu.mbarrier.get` operation retrieves a pointer to a specific 
  `mbarrier` object from a group of barriers created by the `nvgpu.mbarrier.create` operation.
  
  Example:
  ```mlir
    %mbars = nvgpu.mbarrier.create -> !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>, num_barriers = 10>
    %mbar_pointer = nvgpu.mbarrier.get %mbars[%c2] : !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
  ```
  """

  OPERATION_NAME = "nvgpu.mbarrier.get"

  _ODS_REGIONS = (0, True)

  def __init__(self, mbarrierPointer, barriers, mbarId, *, loc=None, ip=None):
    operands = []
    attributes = {}
    regions = None
    operands.append(barriers)
    operands.append(mbarId)
    _ods_context = _ods_get_default_loc_context(loc)
    results = []
    results.append(mbarrierPointer)
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def barriers(self) -> _ods_ir.Value:
    return self.operation.operands[0]

  @builtins.property
  def mbarId(self) -> _ods_ir.Value[_ods_ir.IndexType]:
    return self.operation.operands[1]

  @builtins.property
  def mbarrierPointer(self) -> _ods_ir.OpResult:
    return self.operation.results[0]

def mbarrier_get(mbarrier_pointer, barriers, mbar_id, *, loc=None, ip=None) -> _ods_ir.OpResult:
  return MBarrierGetOp(mbarrierPointer=mbarrier_pointer, barriers=barriers, mbarId=mbar_id, loc=loc, ip=ip).result

@_ods_cext.register_operation(_Dialect)
class MBarrierInitOp(_ods_ir.OpView):
  r"""
  The Op initializes the `mbarrier` object with the given number of threads.
  
  Example:
  ```mlir
    %num_threads = gpu.block_dim x
    %barrier = nvgpu.mbarrier.create -> !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
    nvgpu.mbarrier.init %barrier, %num_threads : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
  ```
  """

  OPERATION_NAME = "nvgpu.mbarrier.init"

  _ODS_REGIONS = (0, True)

  def __init__(self, barriers, count, mbarId, *, predicate=None, loc=None, ip=None):
    operands = []
    attributes = {}
    regions = None
    operands.append(barriers)
    operands.append(count)
    operands.append(mbarId)
    if predicate is not None: operands.append(predicate)
    _ods_context = _ods_get_default_loc_context(loc)
    results = []
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def barriers(self) -> _ods_ir.Value:
    return self.operation.operands[0]

  @builtins.property
  def count(self) -> _ods_ir.Value[_ods_ir.IndexType]:
    return self.operation.operands[1]

  @builtins.property
  def mbarId(self) -> _ods_ir.Value[_ods_ir.IndexType]:
    return self.operation.operands[2]

  @builtins.property
  def predicate(self) -> _Optional[_ods_ir.Value[_ods_ir.IntegerType]]:
    return None if len(self.operation.operands) < 4 else self.operation.operands[3]

def mbarrier_init(barriers, count, mbar_id, *, predicate=None, loc=None, ip=None) -> MBarrierInitOp:
  return MBarrierInitOp(barriers=barriers, count=count, mbarId=mbar_id, predicate=predicate, loc=loc, ip=ip)

@_ods_cext.register_operation(_Dialect)
class MBarrierTestWaitOp(_ods_ir.OpView):
  r"""
  Checks whether the mbarrier object has completed the phase. It is is a 
  non-blocking instruction which tests for the completion of the phase.
  
  Example:
  ```mlir
    %isComplete = nvgpu.mbarrier.test.wait %barrier, %token : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>, !nvgpu.mbarrier.token
  ```
  """

  OPERATION_NAME = "nvgpu.mbarrier.test.wait"

  _ODS_REGIONS = (0, True)

  def __init__(self, barriers, token, mbarId, *, results=None, loc=None, ip=None):
    operands = []
    attributes = {}
    regions = None
    operands.append(barriers)
    operands.append(token)
    operands.append(mbarId)
    _ods_context = _ods_get_default_loc_context(loc)
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def barriers(self) -> _ods_ir.Value:
    return self.operation.operands[0]

  @builtins.property
  def token(self) -> _ods_ir.Value:
    return self.operation.operands[1]

  @builtins.property
  def mbarId(self) -> _ods_ir.Value[_ods_ir.IndexType]:
    return self.operation.operands[2]

  @builtins.property
  def waitComplete(self) -> _ods_ir.OpResult[_ods_ir.IntegerType]:
    return self.operation.results[0]

def mbarrier_test_wait(barriers, token, mbar_id, *, results=None, loc=None, ip=None) -> _ods_ir.OpResult:
  return MBarrierTestWaitOp(barriers=barriers, token=token, mbarId=mbar_id, results=results, loc=loc, ip=ip).result

@_ods_cext.register_operation(_Dialect)
class MBarrierTryWaitParityOp(_ods_ir.OpView):
  r"""
  Checks whether the mbarrier object has completed the phase. It is is a 
  potentially blocking instruction which tests for the completion of the 
  phase. Suspended thread resumes execution when the specified phase completes 
  OR before the phase completes following a system-dependent time limit. 
  
  The `$phaseParity` specifies either even phase (0) or odd phase (1) to 
  wait.
  
  Example:
  ```mlir
    nvgpu.mbarrier.try_wait.parity %barrier, %phaseParity, %ticks : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
  ```
  """

  OPERATION_NAME = "nvgpu.mbarrier.try_wait.parity"

  _ODS_REGIONS = (0, True)

  def __init__(self, barriers, phaseParity, ticks, mbarId, *, loc=None, ip=None):
    operands = []
    attributes = {}
    regions = None
    operands.append(barriers)
    operands.append(phaseParity)
    operands.append(ticks)
    operands.append(mbarId)
    _ods_context = _ods_get_default_loc_context(loc)
    results = []
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def barriers(self) -> _ods_ir.Value:
    return self.operation.operands[0]

  @builtins.property
  def phaseParity(self) -> _ods_ir.Value[_ods_ir.IntegerType]:
    return self.operation.operands[1]

  @builtins.property
  def ticks(self) -> _ods_ir.Value[_ods_ir.IndexType]:
    return self.operation.operands[2]

  @builtins.property
  def mbarId(self) -> _ods_ir.Value[_ods_ir.IndexType]:
    return self.operation.operands[3]

def mbarrier_try_wait_parity(barriers, phase_parity, ticks, mbar_id, *, loc=None, ip=None) -> MBarrierTryWaitParityOp:
  return MBarrierTryWaitParityOp(barriers=barriers, phaseParity=phase_parity, ticks=ticks, mbarId=mbar_id, loc=loc, ip=ip)

@_ods_cext.register_operation(_Dialect)
class MmaSparseSyncOp(_ods_ir.OpView):
  r"""
  The `nvgu.mma.sp.sync` operation performs a warp-distributed MMA operation
  where operand A is "structured sparse". In this case, the `matrixA` operand
  represents the (warp-distributed) non-zero values of operand A, and the
  `sparse_metadata` operand provides the indices.
  
  The full description of the sparsity storage format and distribution scheme is
  described in the PTX docs. This operation is meant to follow the semantic
  described in the PTX documentation here:
  https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-for-sparse-mma
  
  The way the indices are distributed among the threads in a warp is controlled
  by the optional `sparsity_selector` operand, which is `0` by default. For
  more information, please consult the PTX documentation linked above.
  
  Example (targetingthe f16 16x8x32 `mma.sp` PTX instruction):
  
  ```mlir
  nvgpu.mma.sp.sync (%a, %b, %c) metadata (%meta) {mmaShape = [16, 8, 32]} :
    (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
  ```
  """

  OPERATION_NAME = "nvgpu.mma.sp.sync"

  _ODS_REGIONS = (0, True)

  def __init__(self, res, matrixA, matrixB, matrixC, sparseMetadata, mmaShape, *, sparsitySelector=None, tf32Enabled=None, loc=None, ip=None):
    operands = []
    attributes = {}
    regions = None
    operands.append(matrixA)
    operands.append(matrixB)
    operands.append(matrixC)
    operands.append(sparseMetadata)
    _ods_context = _ods_get_default_loc_context(loc)
    attributes["mmaShape"] = (mmaShape if (
    isinstance(mmaShape, _ods_ir.Attribute) or
    not _ods_ir.AttrBuilder.contains('I64ArrayAttr')) else
      _ods_ir.AttrBuilder.get('I64ArrayAttr')(mmaShape, context=_ods_context))
    if sparsitySelector is not None: attributes["sparsitySelector"] = (sparsitySelector if (
        isinstance(sparsitySelector, _ods_ir.Attribute) or
        not _ods_ir.AttrBuilder.contains('I32Attr')) else
          _ods_ir.AttrBuilder.get('I32Attr')(sparsitySelector, context=_ods_context))
    if bool(tf32Enabled): attributes["tf32Enabled"] = _ods_ir.UnitAttr.get(
      _ods_get_default_loc_context(loc))
    results = []
    results.append(res)
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def matrixA(self) -> _ods_ir.Value[_ods_ir.VectorType]:
    return self.operation.operands[0]

  @builtins.property
  def matrixB(self) -> _ods_ir.Value[_ods_ir.VectorType]:
    return self.operation.operands[1]

  @builtins.property
  def matrixC(self) -> _ods_ir.Value[_ods_ir.VectorType]:
    return self.operation.operands[2]

  @builtins.property
  def sparseMetadata(self) -> _ods_ir.Value[_ods_ir.VectorType]:
    return self.operation.operands[3]

  @builtins.property
  def mmaShape(self) -> _ods_ir.ArrayAttr:
    return self.operation.attributes["mmaShape"]

  @mmaShape.setter
  def mmaShape(self, value: _ods_ir.ArrayAttr):
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["mmaShape"] = value

  @builtins.property
  def sparsitySelector(self) -> _ods_ir.IntegerAttr:
    return self.operation.attributes["sparsitySelector"]

  @sparsitySelector.setter
  def sparsitySelector(self, value: _ods_ir.IntegerAttr):
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["sparsitySelector"] = value

  @builtins.property
  def tf32Enabled(self) -> bool:
    return "tf32Enabled" in self.operation.attributes

  @tf32Enabled.setter
  def tf32Enabled(self, value):
    if bool(value):
      self.operation.attributes["tf32Enabled"] = _ods_ir.UnitAttr.get()
    elif "tf32Enabled" in self.operation.attributes:
      del self.operation.attributes["tf32Enabled"]

  @tf32Enabled.deleter
  def tf32Enabled(self):
    del self.operation.attributes["tf32Enabled"]

  @builtins.property
  def res(self) -> _ods_ir.OpResult[_ods_ir.VectorType]:
    return self.operation.results[0]

def mma_sp_sync(res, matrix_a, matrix_b, matrix_c, sparse_metadata, mma_shape, *, sparsity_selector=None, tf32_enabled=None, loc=None, ip=None) -> _ods_ir.OpResult:
  return MmaSparseSyncOp(res=res, matrixA=matrix_a, matrixB=matrix_b, matrixC=matrix_c, sparseMetadata=sparse_metadata, mmaShape=mma_shape, sparsitySelector=sparsity_selector, tf32Enabled=tf32_enabled, loc=loc, ip=ip).result

@_ods_cext.register_operation(_Dialect)
class MmaSyncOp(_ods_ir.OpView):
  r"""
  The `nvgpu.mma.sync` op represents the warp-level matrix-multiply-and-
  accumulate (mma) operation that is compatible with `nvvm.mma.sync`.
  The operands and results vector sizes are thread-level onwership to
  the warp-level mma operation shape. `mmaShape` attribute holds the
  warp-level matrix-multiply shape.
  
  The `nvgpu.mma.sync` op serves as an intermediate point between lowering from
  `vector.contract` to `nvvm.mma.sync`.
  
  This operation is meant to follow the semantic of described here:
    https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-mma
  
  Example:
  
  ```mlir
  %res = nvgpu.mma.sync (%matrixA, %matrixB, %matrixC) {mmaShape = [16, 8, 16]} :
      (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf32>) -> vector<2x2xf32>
  ```
  """

  OPERATION_NAME = "nvgpu.mma.sync"

  _ODS_REGIONS = (0, True)

  def __init__(self, res, matrixA, matrixB, matrixC, mmaShape, *, tf32Enabled=None, loc=None, ip=None):
    operands = []
    attributes = {}
    regions = None
    operands.append(matrixA)
    operands.append(matrixB)
    operands.append(matrixC)
    _ods_context = _ods_get_default_loc_context(loc)
    attributes["mmaShape"] = (mmaShape if (
    isinstance(mmaShape, _ods_ir.Attribute) or
    not _ods_ir.AttrBuilder.contains('I64ArrayAttr')) else
      _ods_ir.AttrBuilder.get('I64ArrayAttr')(mmaShape, context=_ods_context))
    if bool(tf32Enabled): attributes["tf32Enabled"] = _ods_ir.UnitAttr.get(
      _ods_get_default_loc_context(loc))
    results = []
    results.append(res)
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def matrixA(self) -> _ods_ir.Value[_ods_ir.VectorType]:
    return self.operation.operands[0]

  @builtins.property
  def matrixB(self) -> _ods_ir.Value[_ods_ir.VectorType]:
    return self.operation.operands[1]

  @builtins.property
  def matrixC(self) -> _ods_ir.Value[_ods_ir.VectorType]:
    return self.operation.operands[2]

  @builtins.property
  def mmaShape(self) -> _ods_ir.ArrayAttr:
    return self.operation.attributes["mmaShape"]

  @mmaShape.setter
  def mmaShape(self, value: _ods_ir.ArrayAttr):
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["mmaShape"] = value

  @builtins.property
  def tf32Enabled(self) -> bool:
    return "tf32Enabled" in self.operation.attributes

  @tf32Enabled.setter
  def tf32Enabled(self, value):
    if bool(value):
      self.operation.attributes["tf32Enabled"] = _ods_ir.UnitAttr.get()
    elif "tf32Enabled" in self.operation.attributes:
      del self.operation.attributes["tf32Enabled"]

  @tf32Enabled.deleter
  def tf32Enabled(self):
    del self.operation.attributes["tf32Enabled"]

  @builtins.property
  def res(self) -> _ods_ir.OpResult[_ods_ir.VectorType]:
    return self.operation.results[0]

def mma_sync(res, matrix_a, matrix_b, matrix_c, mma_shape, *, tf32_enabled=None, loc=None, ip=None) -> _ods_ir.OpResult:
  return MmaSyncOp(res=res, matrixA=matrix_a, matrixB=matrix_b, matrixC=matrix_c, mmaShape=mma_shape, tf32Enabled=tf32_enabled, loc=loc, ip=ip).result

@_ods_cext.register_operation(_Dialect)
class RcpOp(_ods_ir.OpView):
  r"""
  Reciprocal calculation for `vector` types using `nvvm.rcp` OPs.
  
  Currently, only the `approx` rounding mode and `ftz` are supported, and only for the `f32` type.
  
  The input and output must be of the same vector type and shape.
  """

  OPERATION_NAME = "nvgpu.rcp"

  _ODS_REGIONS = (0, True)

  def __init__(self, in_, *, rounding=None, ftz=None, results=None, loc=None, ip=None):
    operands = []
    attributes = {}
    regions = None
    operands.append(in_)
    _ods_context = _ods_get_default_loc_context(loc)
    if rounding is not None: attributes["rounding"] = (rounding if (
        isinstance(rounding, _ods_ir.Attribute) or
        not _ods_ir.AttrBuilder.contains('RcpRoundingModeAttr')) else
          _ods_ir.AttrBuilder.get('RcpRoundingModeAttr')(rounding, context=_ods_context))
    if bool(ftz): attributes["ftz"] = _ods_ir.UnitAttr.get(
      _ods_get_default_loc_context(loc))
    if results is None: results = [operands[0].type] * 1
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def in_(self) -> _ods_ir.Value[_ods_ir.VectorType]:
    return self.operation.operands[0]

  @builtins.property
  def rounding(self) -> _ods_ir.Attribute:
    return self.operation.attributes["rounding"]

  @rounding.setter
  def rounding(self, value: _ods_ir.Attribute):
    if value is None:
      raise ValueError("'None' not allowed as value for mandatory attributes")
    self.operation.attributes["rounding"] = value

  @builtins.property
  def ftz(self) -> bool:
    return "ftz" in self.operation.attributes

  @ftz.setter
  def ftz(self, value):
    if bool(value):
      self.operation.attributes["ftz"] = _ods_ir.UnitAttr.get()
    elif "ftz" in self.operation.attributes:
      del self.operation.attributes["ftz"]

  @ftz.deleter
  def ftz(self):
    del self.operation.attributes["ftz"]

  @builtins.property
  def out(self) -> _ods_ir.OpResult[_ods_ir.VectorType]:
    return self.operation.results[0]

def rcp(in_, *, rounding=None, ftz=None, results=None, loc=None, ip=None) -> _ods_ir.OpResult:
  return RcpOp(in_=in_, rounding=rounding, ftz=ftz, results=results, loc=loc, ip=ip).result

@_ods_cext.register_operation(_Dialect)
class TmaAsyncLoadOp(_ods_ir.OpView):
  r"""
  The Op loads a tile memory region from global memory to shared memory by 
  Tensor Memory Access (TMA).
  
  `$tensorMapDescriptor` is tensor map descriptor which has information about
  tile shape. The descriptor is created by `nvgpu.tma.create.descriptor`
  
  The Op uses `$barrier` mbarrier based completion mechanism. 
  """

  OPERATION_NAME = "nvgpu.tma.async.load"

  _ODS_OPERAND_SEGMENTS = [1,1,1,-1,1,0,0,]

  _ODS_REGIONS = (0, True)

  def __init__(self, dst, barriers, tensorMapDescriptor, coordinates, mbarId, *, multicastMask=None, predicate=None, loc=None, ip=None):
    operands = []
    attributes = {}
    regions = None
    operands.append(dst)
    operands.append(barriers)
    operands.append(tensorMapDescriptor)
    operands.append(_get_op_results_or_values(coordinates))
    operands.append(mbarId)
    operands.append(multicastMask)
    operands.append(predicate)
    _ods_context = _ods_get_default_loc_context(loc)
    results = []
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def dst(self) -> _ods_ir.Value[_ods_ir.MemRefType]:
    operand_range = _ods_segmented_accessor(
         self.operation.operands,
         self.operation.attributes["operandSegmentSizes"], 0)
    return operand_range[0]

  @builtins.property
  def barriers(self) -> _ods_ir.Value:
    operand_range = _ods_segmented_accessor(
         self.operation.operands,
         self.operation.attributes["operandSegmentSizes"], 1)
    return operand_range[0]

  @builtins.property
  def tensorMapDescriptor(self) -> _ods_ir.Value:
    operand_range = _ods_segmented_accessor(
         self.operation.operands,
         self.operation.attributes["operandSegmentSizes"], 2)
    return operand_range[0]

  @builtins.property
  def coordinates(self) -> _ods_ir.OpOperandList:
    operand_range = _ods_segmented_accessor(
         self.operation.operands,
         self.operation.attributes["operandSegmentSizes"], 3)
    return operand_range

  @builtins.property
  def mbarId(self) -> _ods_ir.Value[_ods_ir.IndexType]:
    operand_range = _ods_segmented_accessor(
         self.operation.operands,
         self.operation.attributes["operandSegmentSizes"], 4)
    return operand_range[0]

  @builtins.property
  def multicastMask(self) -> _Optional[_ods_ir.Value[_ods_ir.IntegerType]]:
    operand_range = _ods_segmented_accessor(
         self.operation.operands,
         self.operation.attributes["operandSegmentSizes"], 5)
    return operand_range[0] if len(operand_range) > 0 else None

  @builtins.property
  def predicate(self) -> _Optional[_ods_ir.Value[_ods_ir.IntegerType]]:
    operand_range = _ods_segmented_accessor(
         self.operation.operands,
         self.operation.attributes["operandSegmentSizes"], 6)
    return operand_range[0] if len(operand_range) > 0 else None

def tma_async_load(dst, barriers, tensor_map_descriptor, coordinates, mbar_id, *, multicast_mask=None, predicate=None, loc=None, ip=None) -> TmaAsyncLoadOp:
  return TmaAsyncLoadOp(dst=dst, barriers=barriers, tensorMapDescriptor=tensor_map_descriptor, coordinates=coordinates, mbarId=mbar_id, multicastMask=multicast_mask, predicate=predicate, loc=loc, ip=ip)

@_ods_cext.register_operation(_Dialect)
class TmaAsyncStoreOp(_ods_ir.OpView):
  r"""
  The Op store a tile memory region from global memory to shared memory by 
  Tensor Memory Access (TMA).
  
  `$tensorMapDescriptor` is tensor map descriptor which has information about
  tile shape. The descriptor is created by `nvgpu.tma.create.descriptor`
  """

  OPERATION_NAME = "nvgpu.tma.async.store"

  _ODS_OPERAND_SEGMENTS = [1,1,-1,0,]

  _ODS_REGIONS = (0, True)

  def __init__(self, src, tensorMapDescriptor, coordinates, *, predicate=None, loc=None, ip=None):
    operands = []
    attributes = {}
    regions = None
    operands.append(src)
    operands.append(tensorMapDescriptor)
    operands.append(_get_op_results_or_values(coordinates))
    operands.append(predicate)
    _ods_context = _ods_get_default_loc_context(loc)
    results = []
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def src(self) -> _ods_ir.Value[_ods_ir.MemRefType]:
    operand_range = _ods_segmented_accessor(
         self.operation.operands,
         self.operation.attributes["operandSegmentSizes"], 0)
    return operand_range[0]

  @builtins.property
  def tensorMapDescriptor(self) -> _ods_ir.Value:
    operand_range = _ods_segmented_accessor(
         self.operation.operands,
         self.operation.attributes["operandSegmentSizes"], 1)
    return operand_range[0]

  @builtins.property
  def coordinates(self) -> _ods_ir.OpOperandList:
    operand_range = _ods_segmented_accessor(
         self.operation.operands,
         self.operation.attributes["operandSegmentSizes"], 2)
    return operand_range

  @builtins.property
  def predicate(self) -> _Optional[_ods_ir.Value[_ods_ir.IntegerType]]:
    operand_range = _ods_segmented_accessor(
         self.operation.operands,
         self.operation.attributes["operandSegmentSizes"], 3)
    return operand_range[0] if len(operand_range) > 0 else None

def tma_async_store(src, tensor_map_descriptor, coordinates, *, predicate=None, loc=None, ip=None) -> TmaAsyncStoreOp:
  return TmaAsyncStoreOp(src=src, tensorMapDescriptor=tensor_map_descriptor, coordinates=coordinates, predicate=predicate, loc=loc, ip=ip)

@_ods_cext.register_operation(_Dialect)
class TmaCreateDescriptorOp(_ods_ir.OpView):
  r"""
  The Op creates a tensor map descriptor object representing tiled memory 
  region. To do that it calls CUDA Driver's `cuTensorMapEncodeTiled`. The 
  descriptor is used by Tensor Memory Access (TMA).
  
  The `tensor` is the source tensor to be tiled. 
  
  The `boxDimensions` is the size of the tiled memory region in each dimension.
  
  For more information see below:
  https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html
  """

  OPERATION_NAME = "nvgpu.tma.create.descriptor"

  _ODS_REGIONS = (0, True)

  def __init__(self, tensorMap, tensor, boxDimensions, *, loc=None, ip=None):
    operands = []
    attributes = {}
    regions = None
    operands.append(tensor)
    operands.extend(_get_op_results_or_values(boxDimensions))
    _ods_context = _ods_get_default_loc_context(loc)
    results = []
    results.append(tensorMap)
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def tensor(self) -> _ods_ir.Value[_ods_ir.UnrankedMemRefType]:
    return self.operation.operands[0]

  @builtins.property
  def boxDimensions(self) -> _ods_ir.OpOperandList:
    _ods_variadic_group_length = len(self.operation.operands) - 2 + 1
    return self.operation.operands[1:1 + _ods_variadic_group_length]

  @builtins.property
  def tensorMap(self) -> _ods_ir.OpResult:
    return self.operation.results[0]

def tma_create_descriptor(tensor_map, tensor, box_dimensions, *, loc=None, ip=None) -> _ods_ir.OpResult:
  return TmaCreateDescriptorOp(tensorMap=tensor_map, tensor=tensor, boxDimensions=box_dimensions, loc=loc, ip=ip).result

@_ods_cext.register_operation(_Dialect)
class TmaFenceOp(_ods_ir.OpView):
  r"""
  The Op fences the given `$tmaDescriptor`. This is necessary if the tensor map
  descriptor was modified from the host using cudaMemcpy. In this case, the
  kernel needs a fence after which it is safe to use `tensor.map`.
  """

  OPERATION_NAME = "nvgpu.tma.fence.descriptor"

  _ODS_REGIONS = (0, True)

  def __init__(self, tensorMapDescriptor, *, loc=None, ip=None):
    operands = []
    attributes = {}
    regions = None
    operands.append(tensorMapDescriptor)
    _ods_context = _ods_get_default_loc_context(loc)
    results = []
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def tensorMapDescriptor(self) -> _ods_ir.Value:
    return self.operation.operands[0]

def tma_fence_descriptor(tensor_map_descriptor, *, loc=None, ip=None) -> TmaFenceOp:
  return TmaFenceOp(tensorMapDescriptor=tensor_map_descriptor, loc=loc, ip=ip)

@_ods_cext.register_operation(_Dialect)
class TmaPrefetchOp(_ods_ir.OpView):
  r"""
  The Op brings the cache line containing the given `$tmaDescriptor` for 
  subsequent use by the `tma.async.load` instruction.
  """

  OPERATION_NAME = "nvgpu.tma.prefetch.descriptor"

  _ODS_REGIONS = (0, True)

  def __init__(self, tensorMapDescriptor, *, predicate=None, loc=None, ip=None):
    operands = []
    attributes = {}
    regions = None
    operands.append(tensorMapDescriptor)
    if predicate is not None: operands.append(predicate)
    _ods_context = _ods_get_default_loc_context(loc)
    results = []
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def tensorMapDescriptor(self) -> _ods_ir.Value:
    return self.operation.operands[0]

  @builtins.property
  def predicate(self) -> _Optional[_ods_ir.Value[_ods_ir.IntegerType]]:
    return None if len(self.operation.operands) < 2 else self.operation.operands[1]

def tma_prefetch_descriptor(tensor_map_descriptor, *, predicate=None, loc=None, ip=None) -> TmaPrefetchOp:
  return TmaPrefetchOp(tensorMapDescriptor=tensor_map_descriptor, predicate=predicate, loc=loc, ip=ip)

@_ods_cext.register_operation(_Dialect)
class WarpgroupGenerateDescriptorOp(_ods_ir.OpView):
  r"""
  This Op builds a `nvgpu.warpgroup.descriptor` that is used by 
  `nvgpu.warpgroup.mma` to perform warpgroup-level matrix multiply and 
  accumulate.
  
  The descriptor specifies the properties of the matrix in shared memory that 
  is a multiplicand in the matrix multiply and accumulate operation. 
  """

  OPERATION_NAME = "nvgpu.warpgroup.generate.descriptor"

  _ODS_REGIONS = (0, True)

  def __init__(self, descriptor, tensor, tensorMap, *, loc=None, ip=None):
    operands = []
    attributes = {}
    regions = None
    operands.append(tensor)
    operands.append(tensorMap)
    _ods_context = _ods_get_default_loc_context(loc)
    results = []
    results.append(descriptor)
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def tensor(self) -> _ods_ir.Value[_ods_ir.MemRefType]:
    return self.operation.operands[0]

  @builtins.property
  def tensorMap(self) -> _ods_ir.Value:
    return self.operation.operands[1]

  @builtins.property
  def descriptor(self) -> _ods_ir.OpResult:
    return self.operation.results[0]

def warpgroup_generate_descriptor(descriptor, tensor, tensor_map, *, loc=None, ip=None) -> _ods_ir.OpResult:
  return WarpgroupGenerateDescriptorOp(descriptor=descriptor, tensor=tensor, tensorMap=tensor_map, loc=loc, ip=ip).result

@_ods_cext.register_operation(_Dialect)
class WarpgroupMmaInitAccumulatorOp(_ods_ir.OpView):
  r"""
  This Op generates and initializes the accumulator matrix for 
  `nvgpu.warpgroup.mma` op to perform matrix-multiply-and-accumulate.
  """

  OPERATION_NAME = "nvgpu.warpgroup.mma.init.accumulator"

  _ODS_REGIONS = (0, True)

  def __init__(self, matrixC, *, loc=None, ip=None):
    operands = []
    attributes = {}
    regions = None
    _ods_context = _ods_get_default_loc_context(loc)
    results = []
    results.append(matrixC)
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def matrixC(self) -> _ods_ir.OpResult:
    return self.operation.results[0]

def warpgroup_mma_init_accumulator(matrix_c, *, loc=None, ip=None) -> _ods_ir.OpResult:
  return WarpgroupMmaInitAccumulatorOp(matrixC=matrix_c, loc=loc, ip=ip).result

@_ods_cext.register_operation(_Dialect)
class WarpgroupMmaOp(_ods_ir.OpView):
  r"""
  The `nvgpu.warpgroup.mma` op performs the warpgroup-level (4 warps) 
  matrix-multiply-and-accumulate (mma) operation that results in 
  `nvvm.wgmma.mma_async`. 
  
  The operands are `descriptorA` and `descriptorB` that are wgmma matrix 
  descriptors that shows the properties of the matrix in shared memory. The 
  results are thread-level ownership to the warpgroup-level mma operation 
  shape. The shape is deduced from the descriptor types and output vector.
  
  The Op encapsulates multiple `nvvm.wgmma.mma_async` operations to complete 
  the given shape. As `nvvm.wgmma.async` Op, or its corresponding PTX 
  instruction, is asynchronous, this Op groups the `nvvm.wgmma.async` and 
  surrounds them between `wgmma.fence.aligned` and 
  `wgmma.commit.group.sync.aligned`, `wgmma.wait.group.sync.aligned` Ops.
  
  Example:
  ```mlir
    %r1,%r2 = nvgpu.warpgroup.mma %descA, %descB, %acc1, %acc2: 
               !nvgpu.warpgroup.descriptor<tensor = memref<128x64xf16, 3>>, 
               !nvgpu.warpgroup.descriptor<tensor = memref<64x128xf16, 3>>, 
               !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>,
               !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>
               -> 
               !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>,
               !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>
  ```
  """

  OPERATION_NAME = "nvgpu.warpgroup.mma"

  _ODS_REGIONS = (0, True)

  def __init__(self, matrixD, descriptorA, descriptorB, matrixC, *, waitGroup=None, transposeA=None, transposeB=None, loc=None, ip=None):
    operands = []
    attributes = {}
    regions = None
    operands.append(descriptorA)
    operands.append(descriptorB)
    operands.append(matrixC)
    _ods_context = _ods_get_default_loc_context(loc)
    if waitGroup is not None: attributes["waitGroup"] = (waitGroup if (
        isinstance(waitGroup, _ods_ir.Attribute) or
        not _ods_ir.AttrBuilder.contains('I64Attr')) else
          _ods_ir.AttrBuilder.get('I64Attr')(waitGroup, context=_ods_context))
    if bool(transposeA): attributes["transposeA"] = _ods_ir.UnitAttr.get(
      _ods_get_default_loc_context(loc))
    if bool(transposeB): attributes["transposeB"] = _ods_ir.UnitAttr.get(
      _ods_get_default_loc_context(loc))
    results = []
    results.append(matrixD)
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def descriptorA(self) -> _ods_ir.Value:
    return self.operation.operands[0]

  @builtins.property
  def descriptorB(self) -> _ods_ir.Value:
    return self.operation.operands[1]

  @builtins.property
  def matrixC(self) -> _ods_ir.Value:
    return self.operation.operands[2]

  @builtins.property
  def waitGroup(self) -> _Optional[_ods_ir.IntegerAttr]:
    if "waitGroup" not in self.operation.attributes:
      return None
    return self.operation.attributes["waitGroup"]

  @waitGroup.setter
  def waitGroup(self, value: _Optional[_ods_ir.IntegerAttr]):
    if value is not None:
      self.operation.attributes["waitGroup"] = value
    elif "waitGroup" in self.operation.attributes:
      del self.operation.attributes["waitGroup"]

  @waitGroup.deleter
  def waitGroup(self):
    del self.operation.attributes["waitGroup"]

  @builtins.property
  def transposeA(self) -> bool:
    return "transposeA" in self.operation.attributes

  @transposeA.setter
  def transposeA(self, value):
    if bool(value):
      self.operation.attributes["transposeA"] = _ods_ir.UnitAttr.get()
    elif "transposeA" in self.operation.attributes:
      del self.operation.attributes["transposeA"]

  @transposeA.deleter
  def transposeA(self):
    del self.operation.attributes["transposeA"]

  @builtins.property
  def transposeB(self) -> bool:
    return "transposeB" in self.operation.attributes

  @transposeB.setter
  def transposeB(self, value):
    if bool(value):
      self.operation.attributes["transposeB"] = _ods_ir.UnitAttr.get()
    elif "transposeB" in self.operation.attributes:
      del self.operation.attributes["transposeB"]

  @transposeB.deleter
  def transposeB(self):
    del self.operation.attributes["transposeB"]

  @builtins.property
  def matrixD(self) -> _ods_ir.OpResult:
    return self.operation.results[0]

def warpgroup_mma(matrix_d, descriptor_a, descriptor_b, matrix_c, *, wait_group=None, transpose_a=None, transpose_b=None, loc=None, ip=None) -> _ods_ir.OpResult:
  return WarpgroupMmaOp(matrixD=matrix_d, descriptorA=descriptor_a, descriptorB=descriptor_b, matrixC=matrix_c, waitGroup=wait_group, transposeA=transpose_a, transposeB=transpose_b, loc=loc, ip=ip).result

@_ods_cext.register_operation(_Dialect)
class WarpgroupMmaStoreOp(_ods_ir.OpView):
  r"""
  The `nvgpu.warpgroup.mma.store` op performs the store of fragmented result 
  in $matrixD to given memref. 
  
  [See the details of register fragment layout for accumulator matrix D]
  (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#wgmma-64n16-d) 
  
  Note that, the op must be run with warp group.
  """

  OPERATION_NAME = "nvgpu.warpgroup.mma.store"

  _ODS_REGIONS = (0, True)

  def __init__(self, matrixD, dstMemref, *, loc=None, ip=None):
    operands = []
    attributes = {}
    regions = None
    operands.append(matrixD)
    operands.append(dstMemref)
    _ods_context = _ods_get_default_loc_context(loc)
    results = []
    _ods_successors = None
    super().__init__(self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, attributes=attributes, results=results, operands=operands, successors=_ods_successors, regions=regions, loc=loc, ip=ip)

  @builtins.property
  def matrixD(self) -> _ods_ir.Value:
    return self.operation.operands[0]

  @builtins.property
  def dstMemref(self) -> _ods_ir.Value[_ods_ir.MemRefType]:
    return self.operation.operands[1]

def warpgroup_mma_store(matrix_d, dst_memref, *, loc=None, ip=None) -> WarpgroupMmaStoreOp:
  return WarpgroupMmaStoreOp(matrixD=matrix_d, dstMemref=dst_memref, loc=loc, ip=ip)