2 changes: 2 additions & 0 deletions src/coreclr/jit/abi.cpp
@@ -118,6 +118,8 @@ var_types ABIPassingSegment::GetRegisterType() const
{
switch (Size)
{
case 2:
return TYP_HALF;
case 4:
return TYP_FLOAT;
case 8:
2 changes: 1 addition & 1 deletion src/coreclr/jit/codegencommon.cpp
@@ -7247,7 +7247,7 @@ bool CodeGen::isStructReturn(GenTree* treeNode)
}

#if defined(TARGET_AMD64) && !defined(UNIX_AMD64_ABI)
assert(!varTypeIsStruct(treeNode));
assert(!varTypeIsStruct(treeNode) || treeNode->TypeGet() == TYP_HALF);
Member:

Suggested change:
-    assert(!varTypeIsStruct(treeNode) || treeNode->TypeGet() == TYP_HALF);
+    assert(!varTypeIsStruct(treeNode) || treeNode->TypeIs(TYP_HALF));

How do SIMD types avoid hitting this assert?

anthonycanino (Contributor, Author), Jan 5, 2026:

I believe it's because on Windows the SIMD type would be passed/returned via a buffer, which we have avoided doing for the half type; any such case should have been transformed earlier.

Member:
That's my understanding as well.

Notably, that is "incorrect": SIMD types are supposed to be returned in register on Windows, and this is a known inconsistency we want to fix long term (#9578).

return false;
#else
return varTypeIsStruct(treeNode) && (compiler->info.compRetNativeType == TYP_STRUCT);
4 changes: 2 additions & 2 deletions src/coreclr/jit/codegenxarch.cpp
@@ -6175,7 +6175,7 @@ void CodeGen::genCall(GenTreeCall* call)
}
else
#endif // TARGET_X86
if (varTypeIsFloating(returnType))
if (varTypeIsFloating(returnType) || returnType == TYP_HALF)
{
returnReg = REG_FLOATRET;
}
@@ -6305,7 +6305,7 @@ void CodeGen::genCallInstruction(GenTreeCall* call X86_ARG(target_ssize_t stackA
}
else
{
assert(!varTypeIsStruct(call));
assert(!varTypeIsStruct(call) || call->TypeIs(TYP_HALF));

if (call->TypeIs(TYP_REF))
{
28 changes: 27 additions & 1 deletion src/coreclr/jit/compiler.cpp
@@ -572,6 +572,32 @@ bool Compiler::isNativePrimitiveStructType(CORINFO_CLASS_HANDLE clsHnd)
return strcmp(typeName, "CLong") == 0 || strcmp(typeName, "CULong") == 0 || strcmp(typeName, "NFloat") == 0;
}

bool Compiler::isNativeHalfStructType(CORINFO_CLASS_HANDLE clsHnd)
{
#if defined(TARGET_XARCH)
if (!compOpportunisticallyDependsOn(InstructionSet_AVX10v1))
{
return false;
}
Member (on lines +578 to +581):

We need this check last, not first; otherwise code gets tagged as benefiting from AVX10v1 unnecessarily.


if (!isIntrinsicType(clsHnd))
{
return false;
}
const char* namespaceName = nullptr;
const char* typeName = getClassNameFromMetadata(clsHnd, &namespaceName);

if (strcmp(namespaceName, "System") != 0)
{
return false;
}

return strcmp(typeName, "Half") == 0;
#else
return false;
#endif
}
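
A minimal sketch of the reordering suggested above, assuming the surrounding helpers keep their current signatures (this is not the committed code):

    bool Compiler::isNativeHalfStructType(CORINFO_CLASS_HANDLE clsHnd)
    {
    #if defined(TARGET_XARCH)
        if (!isIntrinsicType(clsHnd))
        {
            return false;
        }

        const char* namespaceName = nullptr;
        const char* typeName      = getClassNameFromMetadata(clsHnd, &namespaceName);

        if ((strcmp(namespaceName, "System") != 0) || (strcmp(typeName, "Half") != 0))
        {
            return false;
        }

        // Querying the ISA only after the type checks means the opportunistic
        // AVX10v1 dependency is recorded solely for actual System.Half structs.
        return compOpportunisticallyDependsOn(InstructionSet_AVX10v1);
    #else
        return false;
    #endif
    }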

//-----------------------------------------------------------------------------
// getPrimitiveTypeForStruct:
// Get the "primitive" type that is used for a struct
@@ -646,7 +672,7 @@ var_types Compiler::getPrimitiveTypeForStruct(unsigned structSize, CORINFO_CLASS
break;

case 2:
useType = TYP_USHORT;
useType = isNativeHalfStructType(clsHnd) ? TYP_HALF : TYP_USHORT;
Member:

Isn't TYP_HALF classified as HFA?

anthonycanino (Contributor, Author):

I am actually not familiar with HFA here; should it be moved up into that logic?

Member:

So an HFA is a homogeneous floating-point aggregate: basically a struct that contains 1-4 floating-point fields of the same type. These get special classification and passing conventions in many cases.

Windows x64 doesn't have HFA but Linux x64 does, so where the handling needs to be probably depends on the wording of the SysV ABI spec (which defines the x64 ABI for Linux).
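
For illustration, a sketch of what does and does not qualify as an HFA under ABIs that define them (hypothetical example types, not code from this PR):

    struct Hfa2      { float  x, y;        }; // HFA: two identical floating-point fields
    struct Hfa4      { double a, b, c, d;  }; // HFA: four identical floating-point fields
    struct NotHfaMix { float  x; double y; }; // not an HFA: mixed floating-point types
    struct NotHfaInt { float  x; int    y; }; // not an HFA: contains a non-floating-point field
    struct NotHfa5   { float  f[5];        }; // not an HFA: more than four elements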

break;

#if !defined(TARGET_XARCH) || defined(UNIX_AMD64_ABI)
10 changes: 9 additions & 1 deletion src/coreclr/jit/compiler.h
@@ -4727,6 +4727,10 @@ class Compiler

NamedIntrinsic lookupPrimitiveFloatNamedIntrinsic(CORINFO_METHOD_HANDLE method, const char* methodName);
NamedIntrinsic lookupPrimitiveIntNamedIntrinsic(CORINFO_METHOD_HANDLE method, const char* methodName);

NamedIntrinsic lookupHalfIntrinsic(NamedIntrinsic ni);
NamedIntrinsic lookupHalfConversionIntrinsic(var_types fromType, var_types toType);

GenTree* impUnsupportedNamedIntrinsic(unsigned helper,
CORINFO_METHOD_HANDLE method,
CORINFO_SIG_INFO* sig,
@@ -5992,6 +5996,7 @@ class Compiler
// Returns true if the provided type should be treated as a primitive type
// for the unmanaged calling conventions.
bool isNativePrimitiveStructType(CORINFO_CLASS_HANDLE clsHnd);
bool isNativeHalfStructType(CORINFO_CLASS_HANDLE clsHnd);

enum structPassingKind
{
@@ -9595,8 +9600,11 @@ class Compiler

// Use to determine if a struct *might* be a SIMD type or another accelerated type
// (such as Half). As this function only takes a size, many structs will fit the criteria.
bool structSizeMightRepresentSIMDType(size_t structSize)
bool structSizeMightRepresentAcceleratedType(size_t structSize)
{
if (structSize == 2)
{
return true;
}

#ifdef FEATURE_SIMD
return (structSize >= getMinVectorByteLength()) && (structSize <= getMaxVectorByteLength());
#else
64 changes: 51 additions & 13 deletions src/coreclr/jit/emitxarch.cpp
@@ -127,8 +127,9 @@ bool emitter::Is3OpRmwInstruction(instruction ins)
default:
{
return ((ins >= FIRST_FMA_INSTRUCTION) && (ins <= LAST_FMA_INSTRUCTION)) ||
(IsAVXVNNIFamilyInstruction(ins)) ||
((ins >= FIRST_AVXIFMA_INSTRUCTION) && (ins <= LAST_AVXIFMA_INSTRUCTION));
IsAVXVNNIFamilyInstruction(ins) ||
((ins >= FIRST_AVXIFMA_INSTRUCTION) && (ins <= LAST_AVXIFMA_INSTRUCTION)) ||
((ins >= FIRST_AVX10V1_FMA_INSTR) && (ins <= LAST_AVX10V1_FMA_INSTR));
}
}
}
@@ -2994,7 +2995,7 @@ emitter::code_t emitter::emitExtractEvexPrefix(instruction ins, code_t& code) co
if (sizePrefix == 0)
{
// no simd prefix for EVEX2 - AVX10.2 and above
assert(emitComp->compIsaSupportedDebugOnly(InstructionSet_AVX10v2) ||
assert(emitComp->compIsaSupportedDebugOnly(InstructionSet_AVX10v1) ||
emitComp->compIsaSupportedDebugOnly(InstructionSet_AVXVNNIINT) ||
emitComp->compIsaSupportedDebugOnly(InstructionSet_AVXVNNIINT_V512));
}
@@ -3077,7 +3078,7 @@ emitter::code_t emitter::emitExtractEvexPrefix(instruction ins, code_t& code) co
// 1. An escape byte 0F (For isa before AVX10.2)
// 2. A map number from 0 to 7 (For AVX10.2 and above)
leadingBytes = check;
assert((leadingBytes == 0x0F) || ((emitComp->compIsaSupportedDebugOnly(InstructionSet_AVX10v2) ||
assert((leadingBytes == 0x0F) || ((emitComp->compIsaSupportedDebugOnly(InstructionSet_AVX10v1) ||
(emitComp->compIsaSupportedDebugOnly(InstructionSet_APX))) &&
(leadingBytes >= 0x00) && (leadingBytes <= 0x07)));

@@ -3104,7 +3105,7 @@
// 0x0000RM11.
leadingBytes = (code >> 16) & 0xFF;
assert(leadingBytes == 0x0F ||
(emitComp->compIsaSupportedDebugOnly(InstructionSet_AVX10v2) && leadingBytes >= 0x00 &&
((emitComp->compIsaSupportedDebugOnly(InstructionSet_AVX10v1)) && leadingBytes >= 0x00 &&
leadingBytes <= 0x07) ||
(IsApxExtendedEvexInstruction(ins) && leadingBytes == 0));
code &= 0xFFFF;
@@ -3159,15 +3160,21 @@ emitter::code_t emitter::emitExtractEvexPrefix(instruction ins, code_t& code) co

case 0x05:
{
assert(emitComp->compIsaSupportedDebugOnly(InstructionSet_AVX10v2));
assert(emitComp->compIsaSupportedDebugOnly(InstructionSet_AVX10v1));
evexPrefix |= (0x05 << 16);
break;
}

case 0x06:
{
assert(emitComp->compIsaSupportedDebugOnly(InstructionSet_AVX10v1));
evexPrefix |= (0x06 << 16);
break;
}

case 0x01:
case 0x02:
case 0x03:
case 0x06:
case 0x07:
default:
{
@@ -5388,10 +5395,8 @@ UNATIVE_OFFSET emitter::emitInsSizeAM(instrDesc* id, code_t code)

assert((attrSize == EA_4BYTE) || (attrSize == EA_PTRSIZE) // Only for x64
|| (attrSize == EA_16BYTE) || (attrSize == EA_32BYTE) || (attrSize == EA_64BYTE) // only for x64
|| (ins == INS_movzx) || (ins == INS_movsx) ||
|| (ins == INS_movzx) || (ins == INS_movsx) || (ins == INS_vmovsh) ||
(ins == INS_cmpxchg)
// kmov instructions reach this path with EA_8BYTE size, even on x86
|| IsKMOVInstruction(ins)
Member (on lines -5393 to -5394):

What's the reason for removing this part of the assert?

anthonycanino (Contributor, Author):

I think that was an error; will fix.

// The prefetch instructions are always 3 bytes and have part of their modr/m byte hardcoded
|| isPrefetch(ins));

@@ -7430,6 +7435,7 @@ bool emitter::IsMovInstruction(instruction ins)

#if defined(TARGET_AMD64)
case INS_movsxd:
case INS_vmovsh:
Member:

This isn't TARGET_AMD64-exclusive: vmovsh is listed with V/V for support, so it is valid in both 64-bit and 32-bit mode.

{
return true;
}
@@ -7622,6 +7628,12 @@ bool emitter::HasSideEffect(instruction ins, emitAttr size)
break;
}

case INS_vmovsh:
{
hasSideEffect = false;
Member:

Doesn't this have a side effect of clearing the upper bits?

That is, it always does DEST[MAXVL:128] := 0

anthonycanino (Contributor, Author):

You are correct; I will change it.
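
A minimal sketch of the corrected case, following the review note above (an assumed fix, not the committed code):

    case INS_vmovsh:
    {
        // vmovsh always zeroes DEST[MAXVL:128], so even a register-to-register
        // move with matching source and destination is observable and cannot
        // be treated as side-effect free.
        hasSideEffect = true;
        break;
    }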

break;
}

default:
{
unreached();
@@ -7895,6 +7907,12 @@ bool emitter::emitIns_Mov(
break;
}

case INS_vmovsh:
{
assert(isFloatReg(dstReg) && isFloatReg(srcReg));
break;
}

default:
{
unreached();
@@ -11798,7 +11816,11 @@ const char* emitter::emitRegName(regNumber reg, emitAttr attr, bool varName) con
case EA_2BYTE:
{
#if defined(TARGET_AMD64)
if (reg > REG_RDI)
if (IsXMMReg(reg))
{
return emitXMMregName(reg);
}
Member (on lines +11819 to +11822):

This shouldn't be TARGET_AMD64-exclusive either.

else if (reg > REG_RDI)
{
suffix = 'w';
goto APPEND_SUFFIX;
@@ -14520,7 +14542,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
// Is this a 'big' opcode?
else if (code & 0xFF000000)
{
if (size == EA_2BYTE)
if (size == EA_2BYTE && ins != INS_vmovsh)
{
assert(ins == INS_movbe);

@@ -15388,7 +15410,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
// Is this a 'big' opcode?
else if (code & 0xFF000000)
{
if (size == EA_2BYTE)
if (size == EA_2BYTE && (ins != INS_vmovsh && ins != INS_vaddsh))
tannergooding (Member), Jan 6, 2026:
Can we just use && !IsSimdInstruction(ins)?
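
A sketch of that simplification, assuming IsSimdInstruction covers vmovsh, vaddsh, and the other scalar-half instructions that legitimately reach this path with EA_2BYTE:

    if ((size == EA_2BYTE) && !IsSimdInstruction(ins))
    {
        assert(ins == INS_movbe);
        // ... existing movbe handling continues here ...
    }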

{
assert(ins == INS_movbe);

@@ -20892,6 +20914,8 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
case INS_movups:
case INS_movapd:
case INS_movupd:
// todo-xarch-half: come back to fix
case INS_vmovsh:
Member:
Shouldn't this be grouped with vmovss and vmovsd? While we may not have exact numbers, I'd expect it to have identical perf/latency to those rather than the more general movaps and friends.
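
A sketch of that grouping (the case labels are assumed; the timings would be the ones this switch already uses for the scalar float/double moves, since no vmovsh-specific measurements exist yet):

    case INS_movss:
    case INS_movsd_simd:
    case INS_vmovsh: // assumed to share the scalar-move latency/throughput
        // ... existing vmovss/vmovsd perf-score handling ...
        break;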

{
if (memAccessKind == PERFSCORE_MEMORY_NONE)
{
@@ -21383,6 +21407,20 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
assert((unsigned)ins < ArrLen(insLatencyInfos));
float insLatency = insLatencyInfos[ins];

// todo-xarch-half: hacking an exit on the unhandled ins to make prototyping easier
if (ins == INS_vcvtss2sh || ins == INS_vcvtsh2ss || ins == INS_vaddsh || ins == INS_vsubsh ||
Member:

I think we want to put most of these with the v*ss and v*sd equivalents prior to merging this PR.

anthonycanino (Contributor, Author):

Yes, and for the above I will get the proper numbers before taking the PR out of draft.

ins == INS_vmulsh || ins == INS_vdivsh || ins == INS_vcomish || ins == INS_vsqrtsh ||
ins == INS_vmaxsh || ins == INS_vminsh || ins == INS_vrcpsh || ins == INS_vrsqrtsh ||
ins == INS_vrndscalesh || ins == INS_vfmadd213sh || ins == INS_vcvtsd2sh || ins == INS_vcvtsh2sd ||
ins == INS_vcvtsi2sh32 || ins == INS_vcvtsi2sh64 || ins == INS_vcvtsh2si32 || ins == INS_vcvtsh2si64 ||
ins == INS_vcvtusi2sh32 || ins == INS_vcvtusi2sh64 || ins == INS_vcvtsh2usi32 ||
ins == INS_vcvtsh2usi64)
{
result.insLatency = PERFSCORE_LATENCY_1C;
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
break;
}

if ((insLatency == PERFSCORE_LATENCY_ILLEGAL) || (insThroughput == PERFSCORE_THROUGHPUT_ILLEGAL))
{
// unhandled instruction insFmt combination
31 changes: 29 additions & 2 deletions src/coreclr/jit/gentree.cpp
@@ -8115,6 +8115,7 @@ GenTree* Compiler::gtNewZeroConNode(var_types type)
return gtNewLconNode(0);
}

case TYP_HALF:
case TYP_FLOAT:
case TYP_DOUBLE:
{
@@ -23075,6 +23076,20 @@ GenTree* Compiler::gtNewSimdCreateScalarUnsafeNode(var_types type,
break;
}

case TYP_HALF:
{
// todo-half: this only creates zero-constant half nodes for use in intrinsics; anything
// else will not work
Member (on lines +23081 to +23082):

Not sure I understand this comment.

Presumably we just need a FloatingPointUtils::convertDoubleToHalf(...) method which returns a float16_t type (these were added in C++23, which is newer than our baseline, so we'd just typedef uint16_t float16_t; for the time being).

We would then do vecCon->gtSimdVal.f16[i] = cnsVal.
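
A minimal sketch of such a helper under the typedef suggested above (the name convertDoubleToHalf comes from this comment and does not exist in FloatingPointUtils yet). For brevity it narrows to binary32 first; a production version would round directly from binary64 to avoid double rounding on rare boundary inputs:

    #include <cstdint>
    #include <cstring>

    typedef uint16_t float16_t; // stand-in until the C++23 type is in the baseline

    // Round-to-nearest-even right shift of a significand.
    static uint32_t ShiftRightRne(uint32_t value, uint32_t shift)
    {
        uint32_t result    = value >> shift;
        uint32_t remainder = value & ((1u << shift) - 1);
        uint32_t halfway   = 1u << (shift - 1);

        if ((remainder > halfway) || ((remainder == halfway) && ((result & 1) != 0)))
        {
            result++;
        }
        return result;
    }

    static float16_t convertDoubleToHalf(double value)
    {
        float    single = static_cast<float>(value);
        uint32_t bits;
        memcpy(&bits, &single, sizeof(bits));

        uint32_t sign = (bits >> 16) & 0x8000u;
        uint32_t abs  = bits & 0x7FFFFFFFu;

        if (abs >= 0x7F800000u) // infinity or NaN (NaNs collapse to a quiet payload)
        {
            return static_cast<float16_t>(sign | ((abs > 0x7F800000u) ? 0x7E00u : 0x7C00u));
        }
        if (abs <= 0x33000000u) // magnitude <= 2^-25 rounds to signed zero
        {
            return static_cast<float16_t>(sign);
        }

        uint32_t mant24 = (abs & 0x007FFFFFu) | 0x00800000u; // significand with implicit bit
        int32_t  exp    = static_cast<int32_t>(abs >> 23) - 127;

        if (exp > 15) // magnitude >= 2^16 overflows to infinity
        {
            return static_cast<float16_t>(sign | 0x7C00u);
        }
        if (exp < -14) // subnormal half: express the value in units of 2^-24
        {
            return static_cast<float16_t>(sign | ShiftRightRne(mant24, static_cast<uint32_t>(-exp - 1)));
        }

        // Normal half. A mantissa carry out of the rounding propagates into the
        // exponent field, which also turns "rounds up to 2^16" into infinity.
        uint32_t rounded = ShiftRightRne(mant24, 13); // 11 bits, implicit bit at position 10
        return static_cast<float16_t>(sign | ((static_cast<uint32_t>(exp + 15) << 10) + (rounded - 0x400u)));
    }

With that helper, the TYP_HALF case below could fill vecCon->gtSimdVal.f16[i] = convertDoubleToHalf(cnsVal) for simdSize / 2 lanes, lifting the current zero-only restriction.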

float cnsVal = static_cast<float>(op1->AsDblCon()->DconValue());
assert(cnsVal == 0.0f); // Only 0.0 is supported for half constants currently

for (unsigned i = 0; i < (simdSize / 4); i++)
{
vecCon->gtSimdVal.f32[i] = cnsVal;
}
break;
}

case TYP_FLOAT:
{
float cnsVal = static_cast<float>(op1->AsDblCon()->DconValue());
@@ -27252,12 +27267,12 @@ GenTree* Compiler::gtNewSimdTernaryLogicNode(
//
GenTree* Compiler::gtNewSimdToScalarNode(var_types type, GenTree* op1, var_types simdBaseType, unsigned simdSize)
{
assert(varTypeIsArithmetic(type));
assert(varTypeIsArithmetic(type) || (type == TYP_HALF));

assert(op1 != nullptr);
assert(varTypeIsSIMD(op1));

assert(varTypeIsArithmetic(simdBaseType));
assert(varTypeIsArithmetic(simdBaseType) || (simdBaseType == TYP_HALF));

NamedIntrinsic intrinsic = NI_Illegal;

@@ -28598,6 +28613,7 @@ bool GenTreeHWIntrinsic::OperIsEmbRoundingEnabled() const
case NI_AVX512_FusedMultiplySubtractNegated:
case NI_AVX512_FusedMultiplySubtractNegatedScalar:
case NI_AVX512_FusedMultiplySubtractScalar:
case NI_AVX10v1_FusedMultiplyAddScalar:
{
return numArgs == 4;
}
@@ -28615,6 +28631,13 @@
case NI_AVX512_X64_ConvertScalarToVector128Double:
case NI_AVX512_X64_ConvertScalarToVector128Single:
#endif // TARGET_AMD64
case NI_AVX10v1_AddScalar:
case NI_AVX10v1_DivideScalar:
case NI_AVX10v1_MultiplyScalar:
case NI_AVX10v1_SubtractScalar:
case NI_AVX10v1_ConvertScalarToVector128Half:
case NI_AVX10v1_ConvertScalarToVector128Single:
case NI_AVX10v1_ConvertScalarToVector128Double:
{
return numArgs == 3;
}
@@ -28631,9 +28654,13 @@
case NI_AVX512_ConvertToVector512UInt32:
case NI_AVX512_ConvertToVector512UInt64:
case NI_AVX512_Sqrt:
case NI_AVX10v1_ConvertToInt32:
case NI_AVX10v1_ConvertToUInt32:
#if defined(TARGET_AMD64)
case NI_AVX512_X64_ConvertToInt64:
case NI_AVX512_X64_ConvertToUInt64:
case NI_AVX10v1_ConvertToInt64:
case NI_AVX10v1_ConvertToUInt64:
#endif // TARGET_AMD64
case NI_AVX10v2_ConvertToSByteWithSaturationAndZeroExtendToInt32:
case NI_AVX10v2_ConvertToByteWithSaturationAndZeroExtendToInt32:
18 changes: 17 additions & 1 deletion src/coreclr/jit/gentree.h
@@ -3484,7 +3484,7 @@ struct GenTreeDblCon : public GenTree
GenTreeDblCon(double val, var_types type = TYP_DOUBLE)
: GenTree(GT_CNS_DBL, type)
{
assert(varTypeIsFloating(type));
assert(varTypeIsFloating(type) || type == TYP_HALF);
SetDconValue(val);
}
#if DEBUGGABLE_GENTREE
Expand Down Expand Up @@ -6869,6 +6869,22 @@ struct GenTreeVecCon : public GenTree
break;
}

case TYP_HALF:
{
if (arg->IsCnsFltOrDbl())
{
simdVal.f16[argIdx] = static_cast<uint16_t>(arg->AsDblCon()->DconValue());
Member:

This looks incorrect, as it does a double->uint16_t value cast when we instead need a double->float16_t conversion (i.e. the binary16 bit encoding).
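
An assumed fix, reusing the convertDoubleToHalf helper sketched earlier (not the committed code):

    // Encode the constant as binary16 bits rather than value-casting to an integer.
    simdVal.f16[argIdx] = convertDoubleToHalf(arg->AsDblCon()->DconValue());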

return true;
}
else
{
// We expect the constant to have been already zeroed
// We check against the i16, rather than f16, to account for -0.0
assert(simdVal.i16[argIdx] == 0);
}
break;
}

case TYP_FLOAT:
{
if (arg->IsCnsFltOrDbl())