-
Notifications
You must be signed in to change notification settings - Fork 5.3k
[Draft] Accelerate Half with FP16 ISA
#122649
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
d9e2f9d
f769822
2788bd0
00fcca9
9a62ea1
c2004c8
f456a27
355a13c
db92fec
62cf092
7a93360
e38aeb0
8f0aebe
257986c
db60c6a
fd2a07e
28e04ac
3d48e9f
483e85d
48e38b4
9e9ac36
f633726
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -572,6 +572,32 @@ bool Compiler::isNativePrimitiveStructType(CORINFO_CLASS_HANDLE clsHnd) | |
| return strcmp(typeName, "CLong") == 0 || strcmp(typeName, "CULong") == 0 || strcmp(typeName, "NFloat") == 0; | ||
| } | ||
|
|
||
| bool Compiler::isNativeHalfStructType(CORINFO_CLASS_HANDLE clsHnd) | ||
| { | ||
| #if defined(TARGET_XARCH) | ||
| if (!compOpportunisticallyDependsOn(InstructionSet_AVX10v1)) | ||
| { | ||
| return false; | ||
| } | ||
|
Comment on lines
+578
to
+581
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We need this last, not first, otherwise code gets tagged as benefiting from using AVX10v1 unnecessarily |
||
|
|
||
| if (!isIntrinsicType(clsHnd)) | ||
| { | ||
| return false; | ||
| } | ||
| const char* namespaceName = nullptr; | ||
| const char* typeName = getClassNameFromMetadata(clsHnd, &namespaceName); | ||
|
|
||
| if (strcmp(namespaceName, "System") != 0) | ||
| { | ||
| return false; | ||
| } | ||
|
|
||
| return strcmp(typeName, "Half") == 0; | ||
| #else | ||
| return false; | ||
| #endif | ||
| } | ||
|
|
||
| //----------------------------------------------------------------------------- | ||
| // getPrimitiveTypeForStruct: | ||
| // Get the "primitive" type that is used for a struct | ||
|
|
@@ -646,7 +672,7 @@ var_types Compiler::getPrimitiveTypeForStruct(unsigned structSize, CORINFO_CLASS | |
| break; | ||
|
|
||
| case 2: | ||
| useType = TYP_USHORT; | ||
| useType = isNativeHalfStructType(clsHnd) ? TYP_HALF : TYP_USHORT; | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. isn't
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am actually not familiar with HFA here --- should it be moved up into that logic?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So HFA is Windows x64 doesn't have HFA but Linux x64 does, so where the handling needs to be probably depends on the wording of the SysV ABI spec (which defines the x64 ABI for Linux). |
||
| break; | ||
|
|
||
| #if !defined(TARGET_XARCH) || defined(UNIX_AMD64_ABI) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -127,8 +127,9 @@ bool emitter::Is3OpRmwInstruction(instruction ins) | |
| default: | ||
| { | ||
| return ((ins >= FIRST_FMA_INSTRUCTION) && (ins <= LAST_FMA_INSTRUCTION)) || | ||
| (IsAVXVNNIFamilyInstruction(ins)) || | ||
| ((ins >= FIRST_AVXIFMA_INSTRUCTION) && (ins <= LAST_AVXIFMA_INSTRUCTION)); | ||
| IsAVXVNNIFamilyInstruction(ins) || | ||
| ((ins >= FIRST_AVXIFMA_INSTRUCTION) && (ins <= LAST_AVXIFMA_INSTRUCTION)) || | ||
| ((ins >= FIRST_AVX10V1_FMA_INSTR) && (ins <= LAST_AVX10V1_FMA_INSTR)); | ||
| } | ||
| } | ||
| } | ||
|
|
@@ -2994,7 +2995,7 @@ emitter::code_t emitter::emitExtractEvexPrefix(instruction ins, code_t& code) co | |
| if (sizePrefix == 0) | ||
| { | ||
| // no simd prefix for EVEX2 - AVX10.2 and above | ||
| assert(emitComp->compIsaSupportedDebugOnly(InstructionSet_AVX10v2) || | ||
| assert(emitComp->compIsaSupportedDebugOnly(InstructionSet_AVX10v1) || | ||
| emitComp->compIsaSupportedDebugOnly(InstructionSet_AVXVNNIINT) || | ||
| emitComp->compIsaSupportedDebugOnly(InstructionSet_AVXVNNIINT_V512)); | ||
| } | ||
|
|
@@ -3077,7 +3078,7 @@ emitter::code_t emitter::emitExtractEvexPrefix(instruction ins, code_t& code) co | |
| // 1. An escape byte 0F (For isa before AVX10.2) | ||
| // 2. A map number from 0 to 7 (For AVX10.2 and above) | ||
| leadingBytes = check; | ||
| assert((leadingBytes == 0x0F) || ((emitComp->compIsaSupportedDebugOnly(InstructionSet_AVX10v2) || | ||
| assert((leadingBytes == 0x0F) || ((emitComp->compIsaSupportedDebugOnly(InstructionSet_AVX10v1) || | ||
| (emitComp->compIsaSupportedDebugOnly(InstructionSet_APX))) && | ||
| (leadingBytes >= 0x00) && (leadingBytes <= 0x07))); | ||
|
|
||
|
|
@@ -3104,7 +3105,7 @@ emitter::code_t emitter::emitExtractEvexPrefix(instruction ins, code_t& code) co | |
| // 0x0000RM11. | ||
| leadingBytes = (code >> 16) & 0xFF; | ||
| assert(leadingBytes == 0x0F || | ||
| (emitComp->compIsaSupportedDebugOnly(InstructionSet_AVX10v2) && leadingBytes >= 0x00 && | ||
| ((emitComp->compIsaSupportedDebugOnly(InstructionSet_AVX10v1)) && leadingBytes >= 0x00 && | ||
| leadingBytes <= 0x07) || | ||
| (IsApxExtendedEvexInstruction(ins) && leadingBytes == 0)); | ||
| code &= 0xFFFF; | ||
|
|
@@ -3159,15 +3160,21 @@ emitter::code_t emitter::emitExtractEvexPrefix(instruction ins, code_t& code) co | |
|
|
||
| case 0x05: | ||
| { | ||
| assert(emitComp->compIsaSupportedDebugOnly(InstructionSet_AVX10v2)); | ||
| assert(emitComp->compIsaSupportedDebugOnly(InstructionSet_AVX10v1)); | ||
| evexPrefix |= (0x05 << 16); | ||
| break; | ||
| } | ||
|
|
||
| case 0x06: | ||
| { | ||
| assert(emitComp->compIsaSupportedDebugOnly(InstructionSet_AVX10v1)); | ||
| evexPrefix |= (0x06 << 16); | ||
| break; | ||
| } | ||
|
|
||
| case 0x01: | ||
| case 0x02: | ||
| case 0x03: | ||
| case 0x06: | ||
| case 0x07: | ||
| default: | ||
| { | ||
|
|
@@ -5388,10 +5395,8 @@ UNATIVE_OFFSET emitter::emitInsSizeAM(instrDesc* id, code_t code) | |
|
|
||
| assert((attrSize == EA_4BYTE) || (attrSize == EA_PTRSIZE) // Only for x64 | ||
| || (attrSize == EA_16BYTE) || (attrSize == EA_32BYTE) || (attrSize == EA_64BYTE) // only for x64 | ||
| || (ins == INS_movzx) || (ins == INS_movsx) || | ||
| || (ins == INS_movzx) || (ins == INS_movsx) || (ins == INS_vmovsh) || | ||
| (ins == INS_cmpxchg) | ||
| // kmov instructions reach this path with EA_8BYTE size, even on x86 | ||
| || IsKMOVInstruction(ins) | ||
|
Comment on lines
-5393
to
-5394
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What's the reason for removing this part of the assert?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Think that was an error, will fix. |
||
| // The prefetch instructions are always 3 bytes and have part of their modr/m byte hardcoded | ||
| || isPrefetch(ins)); | ||
|
|
||
|
|
@@ -7430,6 +7435,7 @@ bool emitter::IsMovInstruction(instruction ins) | |
|
|
||
| #if defined(TARGET_AMD64) | ||
| case INS_movsxd: | ||
| case INS_vmovsh: | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This isn't TARGET_AMD64 exclusive as |
||
| { | ||
| return true; | ||
| } | ||
|
|
@@ -7622,6 +7628,12 @@ bool emitter::HasSideEffect(instruction ins, emitAttr size) | |
| break; | ||
| } | ||
|
|
||
| case INS_vmovsh: | ||
| { | ||
| hasSideEffect = false; | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Doesn't this have a side effect of clearing the upper-bits? That is, it always does
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You are correct, I will change. |
||
| break; | ||
| } | ||
|
|
||
| default: | ||
| { | ||
| unreached(); | ||
|
|
@@ -7895,6 +7907,12 @@ bool emitter::emitIns_Mov( | |
| break; | ||
| } | ||
|
|
||
| case INS_vmovsh: | ||
| { | ||
| assert(isFloatReg(dstReg) && isFloatReg(srcReg)); | ||
| break; | ||
| } | ||
|
|
||
| default: | ||
| { | ||
| unreached(); | ||
|
|
@@ -11798,7 +11816,11 @@ const char* emitter::emitRegName(regNumber reg, emitAttr attr, bool varName) con | |
| case EA_2BYTE: | ||
| { | ||
| #if defined(TARGET_AMD64) | ||
| if (reg > REG_RDI) | ||
| if (IsXMMReg(reg)) | ||
| { | ||
| return emitXMMregName(reg); | ||
| } | ||
|
Comment on lines
+11819
to
+11822
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This shouldn't be |
||
| else if (reg > REG_RDI) | ||
| { | ||
| suffix = 'w'; | ||
| goto APPEND_SUFFIX; | ||
|
|
@@ -14520,7 +14542,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) | |
| // Is this a 'big' opcode? | ||
| else if (code & 0xFF000000) | ||
| { | ||
| if (size == EA_2BYTE) | ||
| if (size == EA_2BYTE && ins != INS_vmovsh) | ||
| { | ||
| assert(ins == INS_movbe); | ||
|
|
||
|
|
@@ -15388,7 +15410,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) | |
| // Is this a 'big' opcode? | ||
| else if (code & 0xFF000000) | ||
| { | ||
| if (size == EA_2BYTE) | ||
| if (size == EA_2BYTE && (ins != INS_vmovsh && ins != INS_vaddsh)) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we just use |
||
| { | ||
| assert(ins == INS_movbe); | ||
|
|
||
|
|
@@ -20892,6 +20914,8 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins | |
| case INS_movups: | ||
| case INS_movapd: | ||
| case INS_movupd: | ||
| // todo-xarch-half: come back to fix | ||
| case INS_vmovsh: | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Shouldn't this be grouped with |
||
| { | ||
| if (memAccessKind == PERFSCORE_MEMORY_NONE) | ||
| { | ||
|
|
@@ -21383,6 +21407,20 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins | |
| assert((unsigned)ins < ArrLen(insLatencyInfos)); | ||
| float insLatency = insLatencyInfos[ins]; | ||
|
|
||
| // todo-xarch-half: hacking an exit on the unhandled ins to make prototyping easier | ||
| if (ins == INS_vcvtss2sh || ins == INS_vcvtsh2ss || ins == INS_vaddsh || ins == INS_vsubsh || | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we want to put most of these with the
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, and for the above, I will get the proper numbers before putting the PR in non-draft. |
||
| ins == INS_vmulsh || ins == INS_vdivsh || ins == INS_vcomish || ins == INS_vsqrtsh || | ||
| ins == INS_vmaxsh || ins == INS_vminsh || ins == INS_vrcpsh || ins == INS_vrsqrtsh || | ||
| ins == INS_vrndscalesh || ins == INS_vfmadd213sh || ins == INS_vcvtsd2sh || ins == INS_vcvtsh2sd || | ||
| ins == INS_vcvtsi2sh32 || ins == INS_vcvtsi2sh64 || ins == INS_vcvtsh2si32 || ins == INS_vcvtsh2si64 || | ||
| ins == INS_vcvtusi2sh32 || ins == INS_vcvtusi2sh64 || ins == INS_vcvtsh2usi32 || | ||
| ins == INS_vcvtsh2usi64) | ||
| { | ||
| result.insLatency = PERFSCORE_LATENCY_1C; | ||
| result.insThroughput = PERFSCORE_THROUGHPUT_1C; | ||
| break; | ||
| } | ||
|
|
||
| if ((insLatency == PERFSCORE_LATENCY_ILLEGAL) || (insThroughput == PERFSCORE_THROUGHPUT_ILLEGAL)) | ||
| { | ||
| // unhandled instruction insFmt combination | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -8115,6 +8115,7 @@ GenTree* Compiler::gtNewZeroConNode(var_types type) | |
| return gtNewLconNode(0); | ||
| } | ||
|
|
||
| case TYP_HALF: | ||
| case TYP_FLOAT: | ||
| case TYP_DOUBLE: | ||
| { | ||
|
|
@@ -23075,6 +23076,20 @@ GenTree* Compiler::gtNewSimdCreateScalarUnsafeNode(var_types type, | |
| break; | ||
| } | ||
|
|
||
| case TYP_HALF: | ||
| { | ||
| // todo-half: this is only to create zero constant half nodes for use in instrincis, anything | ||
| // else will not work | ||
|
Comment on lines
+23081
to
+23082
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not sure I understand this comment. Presumably we just need a We then |
||
| float cnsVal = static_cast<float>(op1->AsDblCon()->DconValue()); | ||
| assert(cnsVal == 0.0f); // Only 0.0 is supported for half constants currently | ||
|
|
||
| for (unsigned i = 0; i < (simdSize / 4); i++) | ||
| { | ||
| vecCon->gtSimdVal.f32[i] = cnsVal; | ||
| } | ||
| break; | ||
| } | ||
|
|
||
| case TYP_FLOAT: | ||
| { | ||
| float cnsVal = static_cast<float>(op1->AsDblCon()->DconValue()); | ||
|
|
@@ -27252,12 +27267,12 @@ GenTree* Compiler::gtNewSimdTernaryLogicNode( | |
| // | ||
| GenTree* Compiler::gtNewSimdToScalarNode(var_types type, GenTree* op1, var_types simdBaseType, unsigned simdSize) | ||
| { | ||
| assert(varTypeIsArithmetic(type)); | ||
| assert(varTypeIsArithmetic(type) || TypeGet(type) == TYP_HALF); | ||
|
|
||
| assert(op1 != nullptr); | ||
| assert(varTypeIsSIMD(op1)); | ||
|
|
||
| assert(varTypeIsArithmetic(simdBaseType)); | ||
| assert(varTypeIsArithmetic(simdBaseType) || TypeGet(simdBaseType) == TYP_HALF); | ||
|
|
||
| NamedIntrinsic intrinsic = NI_Illegal; | ||
|
|
||
|
|
@@ -28598,6 +28613,7 @@ bool GenTreeHWIntrinsic::OperIsEmbRoundingEnabled() const | |
| case NI_AVX512_FusedMultiplySubtractNegated: | ||
| case NI_AVX512_FusedMultiplySubtractNegatedScalar: | ||
| case NI_AVX512_FusedMultiplySubtractScalar: | ||
| case NI_AVX10v1_FusedMultiplyAddScalar: | ||
| { | ||
| return numArgs == 4; | ||
| } | ||
|
|
@@ -28615,6 +28631,13 @@ bool GenTreeHWIntrinsic::OperIsEmbRoundingEnabled() const | |
| case NI_AVX512_X64_ConvertScalarToVector128Double: | ||
| case NI_AVX512_X64_ConvertScalarToVector128Single: | ||
| #endif // TARGET_AMD64 | ||
| case NI_AVX10v1_AddScalar: | ||
| case NI_AVX10v1_DivideScalar: | ||
| case NI_AVX10v1_MultiplyScalar: | ||
| case NI_AVX10v1_SubtractScalar: | ||
| case NI_AVX10v1_ConvertScalarToVector128Half: | ||
| case NI_AVX10v1_ConvertScalarToVector128Single: | ||
| case NI_AVX10v1_ConvertScalarToVector128Double: | ||
| { | ||
| return numArgs == 3; | ||
| } | ||
|
|
@@ -28631,9 +28654,13 @@ bool GenTreeHWIntrinsic::OperIsEmbRoundingEnabled() const | |
| case NI_AVX512_ConvertToVector512UInt32: | ||
| case NI_AVX512_ConvertToVector512UInt64: | ||
| case NI_AVX512_Sqrt: | ||
| case NI_AVX10v1_ConvertToInt32: | ||
| case NI_AVX10v1_ConvertToUInt32: | ||
| #if defined(TARGET_AMD64) | ||
| case NI_AVX512_X64_ConvertToInt64: | ||
| case NI_AVX512_X64_ConvertToUInt64: | ||
| case NI_AVX10v1_ConvertToInt64: | ||
| case NI_AVX10v1_ConvertToUInt64: | ||
| #endif // TARGET_AMD64 | ||
| case NI_AVX10v2_ConvertToSByteWithSaturationAndZeroExtendToInt32: | ||
| case NI_AVX10v2_ConvertToByteWithSaturationAndZeroExtendToInt32: | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3484,7 +3484,7 @@ struct GenTreeDblCon : public GenTree | |
| GenTreeDblCon(double val, var_types type = TYP_DOUBLE) | ||
| : GenTree(GT_CNS_DBL, type) | ||
| { | ||
| assert(varTypeIsFloating(type)); | ||
| assert(varTypeIsFloating(type) || type == TYP_HALF); | ||
| SetDconValue(val); | ||
| } | ||
| #if DEBUGGABLE_GENTREE | ||
|
|
@@ -6869,6 +6869,22 @@ struct GenTreeVecCon : public GenTree | |
| break; | ||
| } | ||
|
|
||
| case TYP_HALF: | ||
| { | ||
| if (arg->IsCnsFltOrDbl()) | ||
| { | ||
| simdVal.f16[argIdx] = static_cast<uint16_t>(arg->AsDblCon()->DconValue()); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This looks incorrect as it does a |
||
| return true; | ||
| } | ||
| else | ||
| { | ||
| // We expect the constant to have been already zeroed | ||
| // We check against the i16, rather than f16, to account for -0.0 | ||
| assert(simdVal.i16[argIdx] == 0); | ||
| } | ||
| break; | ||
| } | ||
|
|
||
| case TYP_FLOAT: | ||
| { | ||
| if (arg->IsCnsFltOrDbl()) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How do SIMD types avoid hitting this assert?
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I believe its due to on windows the SIMD type would be passed/returned via a buffer, which we have avoided doing with the half type and should have been transformed prior.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That's my understanding as well.
Notably that is "incorrect" as SIMD types are supposed to be returned in register on Windows and this is a known inconsistency we want to fix long term (#9578)